# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
import logging
import numpy as np
import theano.tensor as T
from . import graphutils
from . import variables
logger = logging.getLogger('elektronn2log')
class Optimiser(object):

    global_lr = variables.VariableParam(value=1,
                                        name='lr',
                                        dtype=graphutils.floatX)
    global_weight_decay = variables.VariableParam(value=0,
                                                  name='weight_decay',
                                                  dtype=graphutils.floatX)
    global_mom = variables.VariableParam(value=0.9,
                                         name='mom',
                                         dtype=graphutils.floatX)
    @classmethod
    def setlr(cls, val):
        """
        Set the learning rate (global to all optimisers).
        """
        val = graphutils.as_floatX(val)
        cls.global_lr.set_value(val)

    @classmethod
    def setwd(cls, val):
        """
        Set the weight decay parameter (global to all optimisers).
        """
        val = graphutils.as_floatX(val)
        cls.global_weight_decay.set_value(val)

    @classmethod
    def setmom(cls, val):
        """
        Set the momentum parameter (global to all optimisers).
        """
        val = graphutils.as_floatX(val)
        cls.global_mom.set_value(val)
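
    # Usage sketch (illustrative only): because the setters above write to
    # class-level shared variables, a single call changes the hyperparameter
    # for every optimiser instance at once:
    #   Optimiser.setlr(0.005)
    #   Optimiser.setmom(0.95)
    #   Optimiser.setwd(1e-4)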

    def __init__(self, inputs, loss, grads, params, additional_outputs):
        if additional_outputs is None:
            additional_outputs = []
        self.meta_params = dict(lr=self.global_lr,
                                mom=self.global_mom,
                                wd=self.global_weight_decay)
        self.input = inputs
        self.output = [loss] + additional_outputs
        self.loss = loss
        self.params = params
        self.grads = grads
        self.step = None
        self.last_exec_time = None
        self.last_dir = []
        # The higher the index, the older the stored params.
        self.params_cycler = [self.alloc_shared_grads(name_suffix='_lp_%i' % i)
                              for i in range(3)]

    def alloc_shared_grads(self, name_suffix='_lg', init_val=0.0):
        """Returns new shared variables matching the shape of params/gradients."""
        grads = []
        for p in self.params:
            name = p.name + name_suffix
            value = np.ones_like(p.get_value()) * graphutils.as_floatX(init_val)
            g = variables.VariableParam(value=value, name=name)
            grads.append(g)
        return grads

    def clear_last_dir(self, last_dir=None):
        """Reset the given accumulator variables (``last_dir`` by default) to zero."""
        if last_dir is None:
            last_dir = self.last_dir
        for d in last_dir:
            d.set_value(np.zeros(d.get_value().shape, dtype=d.dtype))

    def get_rotational_updates(self):
        """Shift the stored parameter snapshots back by one step and enqueue
        the current parameters at the front of the queue."""
        updates = []
        for x in zip(self.params, *self.params_cycler):
            new_param, param_queue = x[0], x[1:]
            for i in range(len(param_queue) - 1, 0, -1):
                updates.append((param_queue[i], param_queue[i - 1]))
            updates.append((param_queue[0], new_param))
        return updates
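
    # Illustrative sketch of the rotation above (comments only, assuming the
    # queue of length 3 allocated in __init__): one training step turns
    #   params_cycler snapshots [p(t-1), p(t-2), p(t-3)]
    # into
    #   params_cycler snapshots [p(t),   p(t-1), p(t-2)]
    # so repair() below can roll the model back to the oldest stored state.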

    def repair(self):
        """Restore the oldest stored parameter snapshot and clear accumulators."""
        self.clear_last_dir()
        for p, p_old in zip(self.params, self.params_cycler[-1]):
            p.set_value(p_old.get_value())

    def __call__(self, *args):
        """
        Perform an update step:
        [data (, labels, etc.)] --> [loss (, additional outputs...)]
        """
        ret = list(self.step(*args))
        ret[0] = graphutils.as_floatX(ret[0])  # the scalar loss
        self.last_exec_time = self.step.last_exec_time
        return ret
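
# Minimal usage sketch (hypothetical data/shapes; assumes one of the compiled
# subclasses below, e.g. SGD): calling an optimiser performs one update step
# and returns the scalar loss followed by any additional outputs.
#   opt = SGD(inputs, loss, grads, params, extra_updates=[])
#   ret = opt(data, labels)
#   train_loss = ret[0]
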
###############################################################################
class SGD(Optimiser):
    """
    SGD optimiser (see https://en.wikipedia.org/wiki/Stochastic_gradient_descent).
    """
    def __init__(self, inputs, loss, grads, params, extra_updates,
                 additional_outputs=None):
        super(SGD, self).__init__(inputs, loss, grads, params,
                                  additional_outputs)
        self.last_dir = self.alloc_shared_grads()  # last direction of the update
        updates = []
        for g, d, p in zip(self.grads, self.last_dir, self.params):
            new_d = g + self.global_mom * d
            if p.apply_reg:
                if p.apply_reg > 1:  # apply_reg may carry a decay multiplier
                    multiplier = graphutils.as_floatX(p.apply_reg)
                    new_p = p - self.global_lr * \
                            (new_d + self.global_weight_decay * p * multiplier)
                else:
                    new_p = p - self.global_lr * \
                            (new_d + self.global_weight_decay * p)
            else:
                new_p = p - self.global_lr * new_d
            updates.append((d, new_d))
            updates.append((p, new_p))
        updates.extend(extra_updates)
        updates.extend(self.get_rotational_updates())
        self.step = graphutils.make_func(self.input, self.output,
                                         updates=updates, name='SGD step')
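
# A minimal NumPy sketch (illustrative only, not part of the original module)
# of the per-parameter update the Theano graph above performs, without the
# apply_reg multiplier case:
def _sgd_update_sketch(p, g, d, lr=0.01, mom=0.9, wd=0.0):
    new_d = g + mom * d                # momentum-accumulated direction
    new_p = p - lr * (new_d + wd * p)  # descend, with weight decay on p
    return new_p, new_d
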
class AdaGrad(Optimiser):
    """
    AdaGrad optimiser (see http://jmlr.org/papers/v12/duchi11a.html).

    Tries to make faster progress on parameters that usually receive small
    gradients. Note that it somewhat ignores the actual direction of those
    gradients: a parameter with many small gradients in the same direction
    and one with many small gradients in opposite directions both get a high
    effective LR!
    """
    def __init__(self, inputs, loss, grads, params, extra_updates,
                 additional_outputs=None):
        super(AdaGrad, self).__init__(inputs, loss, grads, params,
                                      additional_outputs)
        self._init_done = False
        self.hs = self.alloc_shared_grads('_h', init_val=0.0)
        updates = []
        for g, h, p in zip(self.grads, self.hs, self.params):
            new_h = h + T.square(g)  # lifetime accumulator of squared gradients
            if p.apply_reg:  # apply weight decay to W but not to b
                new_p = p - self.global_lr / T.sqrt(new_h) * \
                        (g + self.global_weight_decay * p)
            else:
                new_p = p - self.global_lr / T.sqrt(new_h) * g
            updates.append((h, new_h))
            updates.append((p, new_p))
        updates.extend(extra_updates)
        self.step = graphutils.make_func(self.input, self.output,
                                         updates=updates, name='AdaGrad step')
        # Create init_func to initialise h from one gradient evaluation
        updates = []
        for g, h in zip(self.grads, self.hs):
            new_h = h + T.square(g)
            updates.append((h, new_h))
        self.init_func = graphutils.make_func(self.input, [], updates=updates,
                                              name='AdaGrad initialiser')

    def __call__(self, *args):
        if not self._init_done:  # lazily initialise h on the first call
            self.init_func(*args)
            self._init_done = True
        return super(AdaGrad, self).__call__(*args)

    def repair(self):
        super(AdaGrad, self).repair()
        self.clear_last_dir(self.hs)
        self._init_done = False
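
# A minimal NumPy sketch (illustrative only, not part of the original module)
# of one AdaGrad step as built above: the accumulator h only grows, so the
# effective step size lr / sqrt(h) shrinks monotonically per parameter.
def _adagrad_update_sketch(p, g, h, lr=0.01, wd=0.0):
    new_h = h + np.square(g)  # lifetime accumulation of squared gradients
    new_p = p - lr / np.sqrt(new_h) * (g + wd * p)
    return new_p, new_h
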
class AdaDelta(Optimiser):
    """
    AdaDelta optimiser (see https://arxiv.org/abs/1212.5701).

    Like AdaGrad, but the squared gradients are accumulated only over a
    decaying window. The delta part acts as a diagonal Hessian approximation.
    It is claimed to be robust against sudden large gradients because the
    denominator then explodes, but this explosion persists for a while
    (and that argument holds for any method accumulating squared gradients).
    """
    def __init__(self, inputs, loss, grads, params, extra_updates,
                 additional_outputs=None):
        super(AdaDelta, self).__init__(inputs, loss, grads, params,
                                       additional_outputs)
        self.squared_accum = self.alloc_shared_grads("_sq")  # EMA of squared gradients
        self.delta_accum = self.alloc_shared_grads("_d")  # EMA of squared updates
        epsilon = 1e-5
        updates = []
        for g, s, d, p in zip(self.grads, self.squared_accum,
                              self.delta_accum, self.params):
            new_s = self.global_mom * s + (1.0 - self.global_mom) * T.square(g)
            direction = (g * T.sqrt(d + epsilon) / T.sqrt(s + epsilon))
            new_d = self.global_mom * d + (1 - self.global_mom) * T.square(direction)
            if p.apply_reg:
                if p.apply_reg > 1:  # apply_reg may carry a decay multiplier
                    multiplier = graphutils.as_floatX(p.apply_reg)
                    new_p = p - self.global_lr * \
                            (direction + self.global_weight_decay * p * multiplier)
                else:
                    new_p = p - self.global_lr * \
                            (direction + self.global_weight_decay * p)
            else:
                new_p = p - self.global_lr * direction
            updates.append((s, new_s))
            updates.append((d, new_d))
            updates.append((p, new_p))
        updates.extend(extra_updates)
        updates.extend(self.get_rotational_updates())
        self.step = graphutils.make_func(self.input, self.output,
                                         updates=updates, name='AdaDelta step')

    def repair(self):
        super(AdaDelta, self).repair()
        self.clear_last_dir(self.squared_accum)
        self.clear_last_dir(self.delta_accum)
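
# A minimal NumPy sketch (illustrative only, not part of the original module)
# of one AdaDelta step as built above, with rho taking the role of global_mom.
# As in the Theano graph, the direction uses the *old* accumulator values,
# since all updates are applied simultaneously.
def _adadelta_update_sketch(p, g, s, d, lr=1.0, rho=0.9, eps=1e-5):
    new_s = rho * s + (1.0 - rho) * np.square(g)             # EMA of g**2
    direction = g * np.sqrt(d + eps) / np.sqrt(s + eps)      # rescaled gradient
    new_d = rho * d + (1.0 - rho) * np.square(direction)     # EMA of step**2
    return p - lr * direction, new_s, new_d
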
class Adam(Optimiser):
    """
    Adam optimiser (see https://arxiv.org/abs/1412.6980v9).

    Like AdaGrad with a windowed squared_accum, plus momentum and a bias
    correction for the initial phase (t).
    The normalisation of Adam and AdaGrad (and RMSProp) does not damp but
    rather exaggerates sudden steep gradients (their squared_accum is small
    while the current gradient is large).
    """
    def __init__(self, inputs, loss, grads, params, extra_updates,
                 additional_outputs=None):
        super(Adam, self).__init__(inputs, loss, grads, params,
                                   additional_outputs)
        self.squared_accum = self.alloc_shared_grads("_sq")  # EMA of squared gradients
        self.momentum = self.alloc_shared_grads("_m")  # EMA of gradients (momentum)
        epsilon = 1e-5
        # self.beta1 = variables.VariableParam(value=0.9, name='beta1',
        #                                      dtype=graphutils.floatX)
        self.beta2 = variables.VariableParam(value=0.999, name='beta2',
                                             dtype=graphutils.floatX)
        # self.meta_params['beta1'] = self.beta1
        self.meta_params['beta2'] = self.beta2
        t_old = variables.VariableParam(value=0.0, name='t',
                                        dtype=graphutils.floatX)
        updates = []
        t = 1 + t_old
        updates.append((t_old, t))
        # Bias correction for the zero-initialised moment estimates
        factor = T.sqrt(1 - self.beta2**t) / (1 - self.global_mom**t)
        for g, s, m, p in zip(self.grads, self.squared_accum,
                              self.momentum, self.params):
            new_m = self.global_mom * m + (1.0 - self.global_mom) * g
            new_s = self.beta2 * s + (1.0 - self.beta2) * T.square(g)
            direction = factor * new_m / T.sqrt(new_s + epsilon)
            if p.apply_reg:
                if p.apply_reg > 1:  # apply_reg may carry a decay multiplier
                    multiplier = graphutils.as_floatX(p.apply_reg)
                    new_p = p - self.global_lr * \
                            (direction + self.global_weight_decay * p * multiplier)
                else:
                    new_p = p - self.global_lr * \
                            (direction + self.global_weight_decay * p)
            else:
                new_p = p - self.global_lr * direction
            updates.append((s, new_s))
            updates.append((m, new_m))
            updates.append((p, new_p))
        updates.extend(extra_updates)
        updates.extend(self.get_rotational_updates())
        self.step = graphutils.make_func(self.input, self.output,
                                         updates=updates, name='Adam step')

    def repair(self):
        super(Adam, self).repair()
        self.clear_last_dir(self.squared_accum)
        self.clear_last_dir(self.momentum)
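
# A minimal NumPy sketch (illustrative only, not part of the original module)
# of one Adam step as built above, with beta1 taking the role of global_mom.
# The factor sqrt(1 - beta2**t) / (1 - beta1**t) corrects the bias of the
# zero-initialised moment estimates m and s during the first steps.
def _adam_update_sketch(p, g, m, s, t, lr=0.001, beta1=0.9, beta2=0.999,
                        eps=1e-5):
    t = t + 1
    new_m = beta1 * m + (1.0 - beta1) * g                # 1st-moment EMA
    new_s = beta2 * s + (1.0 - beta2) * np.square(g)     # 2nd-moment EMA
    factor = np.sqrt(1.0 - beta2**t) / (1.0 - beta1**t)  # bias correction
    direction = factor * new_m / np.sqrt(new_s + eps)
    return p - lr * direction, new_m, new_s, t
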
class CG(Optimiser):
    pass