Source code for elektronn.net.optimizer

# -*- coding: utf-8 -*-
# ELEKTRONN - Neural Network Toolkit
#
# Copyright (c) 2014 - now
# Max-Planck-Institute for Medical Research, Heidelberg, Germany
# Authors: Marius Killinger, Gregor Urban

import time
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from scipy.optimize import fmin_l_bfgs_b
from collections import OrderedDict


[docs]class Optimizer(object): """ Optimizer Base Object, initialises generic optimizer variables Parameters ---------- model_obj: cnn-object Encapsulation of theano model (instead of giving X,Y etc. manually), all other arguments are retrieved from this object if they are ``None``. If an argument is not ``None`` it will override the value from the model X: symbolic input variable Data Y: symbolic output variable Target Y_aux: symbolic output variable Auxiliary masks/weights/etc. for the loss, type: list! top_loss: symbolic loss function: Requires (X, Y (,*Y_aux)) for compilation params: list of shared variables List of parameter arrays against which the loss is optimised Returns ------- Callable optimizer object: loss = Optimizer(X, Y (,*Y_aux)) performs one iteration """ def __init__(self, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None): self.t_init = time.time() self.optimizer_params = None self.step = None if model_obj is None: assert X is not None and Y is not None and top_loss is not None and params is not None, "Missing arguments!" self.gradients = T.grad(top_loss, params, disconnected_inputs="warn") else: if X is None: self.X = model_obj._x else: self.X = X if Y is None: self.Y = model_obj._y else: self.Y = Y if Y_aux == []: self.Y_aux = model_obj._y_aux else: self.Y_aux = Y_aux if params is None: self.params = model_obj.params else: self.params = params if top_loss is None: # no loss means, loss&grad must already be defined in the model self.gradients = model_obj._gradients self.top_loss = model_obj._loss else: self.gradients = T.grad(top_loss, params, disconnected_inputs="warn") assert len(self.params) > 0, "no params, call compileOutputFunctions() before calling compileOptimizer()!" if hasattr(model_obj, '_last_grads') and model_obj._last_grads != [] and model_obj._last_grads is not None: self.last_grads = model_obj._last_grads else: self.last_grads = [] for i, p in enumerate(self.params): if p in self.params[:i]: print "Detected shared param: param[%i]" % i else: self.last_grads.append(theano.shared(np.zeros(p.get_value().shape, dtype='float32'), name=p.name + str('_LG'), borrow=False)) if hasattr(model_obj, 'global_weightdecay'): self.weightdecay = model_obj.global_weightdecay else: self.weightdecay = 0 if hasattr(model_obj, '_loss_instance'): self.loss_instance = model_obj._loss_instance else: self.loss_instance = None if hasattr(model_obj, '_get_loss'): self.get_loss = model_obj._get_loss else: self._get_loss = theano.function([self.X, self.Y] + self.Y_aux, [self.top_loss, self.loss_instance]) if model_obj is not None: model_obj._last_grads = self.last_grads # share last grads in model model_obj.get_loss = self.get_loss self.model_obj = model_obj
[docs] def updateOptimizerParams(self, optimizer_params): """ Update the hyper-parameter dictionary """ self.optimizer_params.update(optimizer_params)
[docs] def get_loss(self, *args): """ [data, labels(, *aux)] --> [loss, loss_instance] loss_instance is the loss per instance (e.g. batch-item or pixel) """ loss, nloss_instance = self._get_loss(*args) return np.float32(loss), nloss_instance
def __call__(self, *args): """ Perform an update step [data, labels(, *aux)] --> [loss, loss_instance] loss_instance is the loss per instance (e.g. batch-item or pixel) """ ret = list(self.step(*args)) ret[0] = np.float32(ret[0]) # the scalar nll return ret
[docs] def compileGradients(self): """ Compile and return a function that returns list of gradients """ print "Compiling getGradients" getGradients = theano.function( [self.X, self.Y] + self.Y_aux, self.gradients, on_unused_input='warn') return getGradients
############################################################################################################## ####################################### Stochastic Gradient Descent ################################ ##############################################################################################################
[docs]class compileSGD(Optimizer): """ Stochastic Gradient Descent """ def __init__(self, optimizer_params, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None): print "Compiling SGD" super(compileSGD, self).__init__(model_obj, X, Y, Y_aux, top_loss, params) self.LR = optimizer_params['LR'] self.momentum = optimizer_params['momentum'] grad_updates = [] param_updates = [] for param_i, grad_i, last_grad_i in zip(self.params, self.gradients, self.last_grads): new_grad_i = grad_i + last_grad_i * self.momentum grad_updates.append((last_grad_i, new_grad_i)) # use this if you want to use the gradient magnitude # For no weight decay weightdecay is just 0 param_updates.append((param_i, param_i - (self.LR) * new_grad_i - self.LR * self.weightdecay * param_i)) assert len(grad_updates) == len(param_updates), str(len(grad_updates)) + " != " + str(len(param_updates)) # This updates last_grads with the current grad and returns the loss before any parameter change self.step = theano.function( [self.X, self.Y] + self.Y_aux, [self.top_loss, self.loss_instance], updates=param_updates + grad_updates, on_unused_input='warn') print " Compiling done - in %.3f s!" % (time.time() - self.t_init)
############################################################################################################## ####################################### Ada Grad ################################ ##############################################################################################################
[docs]class compileAdam(Optimizer): """ Stochastic Gradient Descent """ def __init__(self, optimizer_params, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None): print "Compiling Adam" super(compileAdam, self).__init__(model_obj, X, Y, Y_aux, top_loss, params) self.LR = optimizer_params['LR'] beta1 = optimizer_params['beta1'] beta2 = optimizer_params['beta2'] epsilon = optimizer_params['epsilon'] updates = OrderedDict() t_prev = theano.shared(np.float32(0.0)) t = t_prev + 1.0 a_t = self.LR * T.sqrt(1 - beta2**t) / (1 - beta1**t) for param, g_t in zip(self.params, self.gradients): value = param.get_value(borrow=True) m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) m_t = beta1 * m_prev + (1 - beta1) * g_t v_t = beta2 * v_prev + (1 - beta2) * g_t**2 step = a_t * m_t / (T.sqrt(v_t) + epsilon) updates[m_prev] = m_t updates[v_prev] = v_t updates[param] = param - step updates[t_prev] = t self.step = theano.function( [self.X, self.Y] + self.Y_aux, [self.top_loss, self.loss_instance], updates=updates, on_unused_input='warn') print " Compiling done - in %.3f s!" % (time.time() - self.t_init)
############################################################################################################## ####################################### Resilient backPROPagation ################################# ##############################################################################################################
[docs]class compileRPROP(Optimizer): """ Resilient backPROPagation """ def __init__(self, optimizer_params, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None): print "Compiling RPROP..." super(compileRPROP, self).__init__(model_obj, X, Y, Y_aux, top_loss, params) self.LRs = [] RPROP_updates = [] # Initialise shared variables for the Training algos for i, para in enumerate(self.params): if para in self.params[:i]: print "Detected RNN or shared param @index =", i else: self.LRs.append(theano.shared( np.float32(optimizer_params['initial_update_size']) * np.ones(para.get_value().shape, dtype='float32'), name=para.name + str('_RPROP'), borrow=0)) print "RPROP: missing backtracking handling " ###TODO ??? for param_i, grad_i, last_grad_i, pLR_i in zip( self.params, self.gradients, self.last_grads, self.LRs): # Commented code on next 4 lines is theano-incapable and just illustration!!! # if ((last_grad_i*grad_i) < -1e-9): # sign-change & significant magnitude of last two gradients # pLR_i_new = pLR_i * (1 - np.float32(RPROP_penalty)) # decrease this LR # elif ((last_grad_i*grad_i) > 1e-11): # no sign-change & and last two gradients were sufficiently big # pLR_i_new = pLR_i * (1 + np.float32(RPORP_gain)) # increase this LR # capping RPROP-LR inside [1e-7,2e-3] RPROP_updates.append((pLR_i, T.minimum(T.maximum( pLR_i * (1 - np.float32(optimizer_params['penalty']) * ((last_grad_i * grad_i) < -1e-9) + np.float32(optimizer_params['gain']) * ((last_grad_i * grad_i) > 1e-11)), 1e-7 * T.ones_like(pLR_i)), 2e-3 * T.ones_like(pLR_i)))) RPROP_updates.append((param_i, param_i - pLR_i * grad_i / (T.abs_( grad_i) + 1e-6) - (self.weightdecay * param_i))) RPROP_updates.append((last_grad_i, grad_i)) self.step = theano.function([self.X, self.Y] + self.Y_aux, [self.top_loss, self.loss_instance], updates=RPROP_updates, on_unused_input='warn') print " Compiling done - in %.3f s!" % (time.time() - self.t_init)
############################################################################################################## ########################################### Conjugate Gradient #################################### ##############################################################################################################
[docs]class compileCG(Optimizer): """ Conjugate Gradient """ def __init__(self, optimizer_params, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None): print "Compiling CG" super(compileCG, self).__init__(model_obj, X, Y, Y_aux, top_loss, params) self.optimizer_params = optimizer_params self.direc = [] CG_updates_direc = [] CG_updates_grads = [] CG_updates = [] # params self.model_obj = model_obj # Initialise shared variables for the Training algos for para in self.params: para_shape = para.get_value().shape self.direc.append(theano.shared( np.zeros(para_shape, dtype='float32'), name=para.name + '_CG_direc', borrow=False)) ### Kickstart of CG, initialise first direction to current gradient ### for grad_i, last_grad_i, direc_i in zip(self.gradients, self.last_grads, self.direc): CG_updates_grads.append((last_grad_i, grad_i)) CG_updates_direc.append((direc_i, -grad_i)) # update direc & last-grad updates = CG_updates_grads + CG_updates_direc self.CG_kickstart = theano.function( [self.X, self.Y] + self.Y_aux, None, updates=updates, on_unused_input='ignore') ### Regular CG-step ### CG_updates_direc = [] # clear update-list CG_updates_grads = [] # clear update-list # Compute Polak-Ribiere coefficient b for updating search direction denom = theano.shared(np.float32(0)) num = theano.shared(np.float32(0)) for grad_i, last_grad_i in zip(self.gradients, self.last_grads): num = num + T.sum(-grad_i * (-grad_i + last_grad_i)) denom = denom + T.sum(last_grad_i * last_grad_i) coeff = num / denom coeff = T.max(T.stack([coeff, theano.shared(np.float32(0))])) # select # Search-direction and last-grad update for grad_i, last_grad_i, direc_i in zip(self.gradients, self.last_grads, self.direc): CG_updates_grads.append((last_grad_i, grad_i)) CG_updates_direc.append((direc_i, -grad_i + direc_i * coeff)) updates = CG_updates_grads + CG_updates_direc # if self.Y_aux is None: # self.CG_step = theano.function([self.X, self.Y], # coeff, updates=updates, on_unused_input='ignore') # else: self.CG_step = theano.function([self.X, self.Y] + self.Y_aux, coeff, updates=updates, on_unused_input='ignore') # Weights update (Line search), no input needed, as only params are changed delta = T.fscalar('delta') # used to parametrise the ray along we search (=0 at current params) self.t = theano.shared(np.float32(0)) # Internal update step indicator for param_i, search_direc_i in zip(self.params, self.direc): CG_updates.append((param_i, param_i + search_direc_i * delta)) CG_updates.append((self.t, self.t + delta)) self.CG_update_params = theano.function([delta], self.t + delta, updates=CG_updates) # Linear-Approximation (from shared last_(grad|direc)) linear_approx = theano.shared(np.float32(0)) for grad_i, last_direc_i in zip(self.last_grads, self.direc): linear_approx = linear_approx + T.sum(grad_i * last_direc_i) self.CG_linear_approx = theano.function([], linear_approx, updates=None) print " Compiling done - in %.3f s!" % (time.time() - self.t_init) def __call__(self, *args): # i.e. trainingStepCG """ Perform an update step [data, labels(, *aux)] --> [loss, loss_instance] loss_instance is the loss per instance (e.g. batch-item or pixel) """ self.CG_kickstart(*args) timeline = [] loss, loss_instance, t, count = self.lineSearch(*args) timeline.append([loss, t, 0, count]) self.t.set_value(np.float32(0)) # DBG: reset internal update-magnitude n_steps = self.optimizer_params['n_steps'] for i in xrange(n_steps - 1): if self.optimizer_params['only_descent']: self.CG_kickstart(*args) coeff = 0 else: # use actual CG coeff = self.CG_step(*args) loss, loss_instance, t, count = self.lineSearch(*args) self.t.set_value(np.float32(0)) timeline.append([loss, t, coeff, count]) if hasattr(self.model_obj, 'CG_timeline'): self.model_obj.CG_timeline.extend(timeline) return loss, loss_instance
[docs] def lineSearch(self, *args): """ Needed for CG """ loss_0, _ = self.get_loss(*args) loss_0 = np.float32(loss_0) linear_approx = self.CG_linear_approx() counter = 0 if linear_approx > 0: # if algorithm gets stuck, reset loss, loss_instance = self.get_loss(*args) return loss_0, loss_instance, 0, counter max_step = self.optimizer_params['max_step'] min_step = self.optimizer_params['min_step'] beta = self.optimizer_params['beta'] max_count = int(np.log(min_step / (max_step)) / np.log(beta)) # limit iterations to lower bound points = [] ### For Plotting t = max_step # The next search points DEcrement the search ray by the decaying negative factor delta # Thus parameters needn't to be reset before updating, we directly change # the parameters by the desired amount # The first search point: max_step along the current search direction self.CG_update_params(np.float32(max_step)) loss, loss_instance = self.get_loss(*args) loss = np.float32(loss) counter += 1 delta = max_step * (beta - 1.0) last_loss = 1000000 for i in xrange(max_count): chord = loss_0 + t * linear_approx * self.optimizer_params['alpha'] points.append([t, loss, chord]) # The second condition is a deviation from regular BT-LineSearch if (loss <= chord) or (loss > last_loss): break self.CG_update_params(np.float32(delta)) last_loss = loss loss, loss_instance = self.get_loss(*args) loss = np.float32(loss) delta = delta * beta t = t * beta counter += 1 if self.optimizer_params['show']: points.append([0, loss_0, loss_0]) points = np.array(points) plt.figure() plt.plot(points[:, 0], points[:, 1:]) plt.scatter(points[:, 0], points[:, 1]) plt.legend(('fu', 'chord')) plt.draw() plt.pause(0.0001) return loss, loss_instance, np.float(self.t.eval()), counter
############################################################################################################## ########################################### L-BFGS #################################### ##############################################################################################################
[docs]class compileLBFGS(Optimizer): """ L-BFGS (fast, full-batch method) References (cite one): R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound Constrained Optimization, (1995), SIAM Journal on Scientific and Statistical Computing, 16, 5, pp. 1190-1208. C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (1997), ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560. J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (2011), ACM Transactions on Mathematical Software, 38, 1. """ def __init__(self, optimizer_params, model_obj=None, X=None, Y=None, Y_aux=[], top_loss=None, params=None, debug=False): print "Compiling lBFGS" super(compileLBFGS, self).__init__(model_obj, X, Y, Y_aux, top_loss, params) self.optimizer_params = optimizer_params ret = [self.top_loss] + self.gradients self._loss_and_grad = theano.function([self.X, self.Y] + self.Y_aux, ret, on_unused_input='warn') self.params_values = [p.get_value() for p in self.params] self.shapes = [p.shape for p in self.params_values] # list of param-shapes self.sizes = map(np.prod, self.shapes) # list of param-sizes self.total_size = np.sum(self.sizes) # length of vectorized parameters self.params_vec = np.zeros(self.total_size, "float32") self.gradients_vec = np.zeros(self.total_size, "float32") self.debug = debug print " Compiling done - in %.3f s!" % (time.time() - self.t_init)
[docs] def vec2list(self, vec, target_list): pos = 0 for a, shape, size in zip(target_list, self.shapes, self.sizes): a[...] = vec[pos:pos + size].reshape(shape) pos += size
[docs] def list2vect(self, src_list, target_vec): pos = 0 for a, size in zip(src_list, self.sizes): target_vec[pos:pos + size] = np.asarray(a).flatten() pos += size
[docs] def loss_and_grad(self, params_vect_new, *args): """ internal use, updates self.params""" self.vec2list(params_vect_new, self.params_values) # Update param values in list and then on GPU for p, val in zip(self.params, self.params_values): p.set_value(val, borrow=False) ret = self._loss_and_grad(*args) loss, grads = np.float32(ret[0]), ret[1:] if len(grads) == 1: grads = grads[0] self.list2vect(grads, self.gradients_vec) grad_vec = np.asarray(self.gradients_vec, dtype="float64") if self.debug: print "loss_and_grad:: x_min,max =",np.min(args[0]),np.max(args[0]),\ "y_min,max =",np.min(args[1]),np.max(args[1]) print "loss_and_grad (av(g) =", np.mean( grad_vec), ", av(abs(g)) =", np.mean(np.abs(grad_vec)), ")" return loss, grad_vec
def __call__(self, *args): """ Perform an update step [data, labels(, *aux)] --> [loss, loss_instance] loss_instance is the loss per instance (e.g. batch-item or pixel) """ if self.debug: print "\nlbfgs->optimize::" print "__optimize:: x_min,max =", np.min(args[0]), np.max(args[0]), "y_min,max =",\ np.min(args[1]), np.max(args[1]) self.params_values = [p.get_value() for p in self.params] self.list2vect(self.params_values, self.params_vec) params, loss, info_dict = fmin_l_bfgs_b(func=self.loss_and_grad, x0=self.params_vec, args=args, **self.optimizer_params) aborted = info_dict["warnflag"] != 1 if aborted: print info_dict print "L-BFGS aborted!" return loss