# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger and Philipp J. Schubert
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
import logging
import time
from functools import reduce
import numpy as np
import theano
import theano.tensor as T
from ..config import config
from . import computations
from .variables import VariableWeight, ConstantParam, VariableParam
from .graphutils import floatX, TaggedShape, as_floatX
from .node_basic import Node, Concat, Add
logger = logging.getLogger('elektronn2log')
__all__ = ['Perceptron', 'Conv', 'UpConv', 'Crop', 'LSTM',
'FragmentsToDense', 'Pool', 'Dot', 'FaithlessMerge',
'GRU', 'LRN', 'AutoMerge', 'UpConvMerge', 'Pad']
################################################################################
### TODO: Adapt the gradnet handling from the Conv layer in the other layers? Maybe not now...
class NeuralLayer(Node):
"""
Dummy class to add parameter initialisation methods for neural layers.
"""
def _register_param(self, param, shape, name, init_kwargs=None,
apply_train=False, apply_reg=False):
"""
Create parameter, set parameter as attribute and add to self.params if
not shared from another Layer.
Parameters
----------
param: None or np.ndarray or T.Variable or list
Possible forms of ``param``:
* Passing ``None`` creates new parameter with default
initialisation.
* Passing a np.ndarray creates new parameter with the values
of the array as initialisation.
* A shared parameter is created by passing a T.Variable as
``param``.
* A constant parameter is created by passing [np.ndarray, 'const']
as ``param``.
This parameter cannot be changed (no set_value) but makes the
compiled function faster.
shape: tuple
Shape of the new parameter (VariableWeight).
name: str
Parameter name.
init_kwargs
kwargs for utils.initialisation.
apply_train: bool
Train flag of the new parameter (VariableWeight).
apply_reg: bool
Regularisation flag of the new parameter (VariableWeight).
"""
add_to_params = True
if self.name=='':
p_name = '<%s%s>'%(name, tuple(shape))
else:
p_name = '<%s_%s%s>'%(self.name, name, tuple(shape))
# create new trainable by initialisation
if param is None:
p = VariableWeight(shape=shape,
init_kwargs=init_kwargs,
name=p_name,
apply_train=apply_train,
apply_reg=apply_reg,)
# create new trainable from values
elif isinstance(param, np.ndarray):
if param.shape!=tuple(shape):
if not (param.ndim==0 and shape==(1,)):
raise ValueError("Shape mismatch. Required %s, given %s"\
%(shape, param.shape))
p = VariableWeight(value=param,
name=p_name,
apply_train=apply_train,
apply_reg=apply_reg,
dtype=floatX)
# share a variable from elsewhere, not trainable
elif isinstance(param, T.Variable): # (elektronn2.tensor.variables are T.Variable)
try:
sh = param.get_value().shape
if sh!=tuple(shape):
raise ValueError("Shape mismatch. Required %s, given %s" \
% (shape, sh))
except AttributeError:
logger.warning("Could not check correct shape of given weight %s, "
"make sure it has shape %s" %(param, shape))
p = param
add_to_params = False
# create constant variable (or explicitly trainable variable)
elif isinstance(param, (list, tuple)):
fail = False
if not isinstance(param[0] , np.ndarray):
fail = True
if param[0].shape!=tuple(shape):
raise ValueError("Shape mismatch. Required %s, given %s"\
%(shape, param[0].shape))
if param[1] == 'const':
value = as_floatX(param[0])
p = ConstantParam(value, p_name)
elif param[1] == 'trainable':
value = as_floatX(param[0])
p = VariableWeight(value=value,
name=p_name,
apply_train=True,
apply_reg=apply_reg)
else:
fail = True
if fail:
raise ValueError("If a parameter is passed as a list, the "
"first entry must contain the parameter "
"value (np.ndarray) and the second entry "
"must be either 'const' or 'trainable' "
"to indicate whether this param is "
"trainable. Got [%s, %s]" \
%(type(param[0]), param[1]))
else:
raise ValueError("Parameter %s must be either <np.ndarray>, "
"<theano.TensorVariable>, a tuple or None "
"(to create new param)" %(name,))
setattr(self, name, p) #
if add_to_params:
self.params[name] = p
else:
logger.debug("Sharing theano variable %s. This parameter is not added to self.params" %(p,))
def _setup_params(self, w_sh, w, b, gamma, mean, std, dropout_rate,
pool_shape=None, gradnet_rate=None):
"""
Register each parameter, choose appropriate initialisation.
"""
# Dot/Conv/Bias Parameters #############################################
self.w = None
# TODO: Pass w_init mode from layer to setup_params
if config.use_ortho_init or isinstance(self, GRU) or isinstance(self, LSTM):
w_init = dict(scale='glorot', mode='ortho', pool=pool_shape,
spatial_axes=self.spatial_axes)
else:
w_init = dict(scale='glorot', mode='normal', pool=pool_shape,
spatial_axes=self.spatial_axes)
self._register_param(w, w_sh, 'w', init_kwargs=w_init,
apply_train=True, apply_reg=True)
activation_func = self.activation_func
n_f = self.n_f
self.b = None
if isinstance(self, GRU):
b_sh=(3 * n_f, )
elif isinstance(self, LSTM):
b_sh = (4 * n_f, )
else:
b_sh=(n_f,)
if activation_func=='relu' or activation_func.startswith("maxout"):
norm = 1.0
if len(w_sh) > 2:
fov = 1
for i in self.spatial_axes:
fov = fov * w_sh[i]
norm = fov
b_init=dict(scale=1.0/norm, mode='const')
elif activation_func=='sigmoid':
b_init=dict(scale=0.5, mode='const')
elif activation_func=='prelu':
norm = 1.0
if len(w_sh) > 2:
fov = 1
for i in self.spatial_axes:
fov = fov * w_sh[i]
norm = fov
b_init=dict(scale=1.0/norm, mode='prelu')
if isinstance(self, GRU):
b_sh=(3 * n_f, 2)
elif isinstance(self, LSTM):
b_sh = (4 * n_f, 2)
else:
b_sh=(n_f, 2)
else: # all other activations
b_init=dict(scale=1e-6, mode='fix-uni')
self._register_param(b, b_sh, 'b', init_kwargs=b_init,
apply_train=True, apply_reg=False)
# Batch Normalisation ##################################################
batch_normalisation = self.batch_normalisation
if batch_normalisation in ['train', 'fadeout']:
# mean and std are created as TensorVariables in _calc_output
self.gamma = None
sh = (n_f,)
g_init =dict(scale=1.0, mode='const')
self._register_param(gamma, sh, 'gamma', init_kwargs=g_init,
apply_train=True, apply_reg=3.0) ###TODO maybe even stronger reg for this?
if mean is not None or std is not None:
raise ValueError("Cannot pass mean and std for training, they "
"are computed in the theano graph.")
# create mean and std for training to accumulate running avgs
self.mean = None
m_init =dict(scale=0.0, mode='const')
self._register_param(None, sh, 'mean', init_kwargs=m_init)
self.std = None
s_init =dict(scale=1.0, mode='const')
self._register_param(None, sh, 'std', init_kwargs=s_init)
elif batch_normalisation=='predict':
sh = (n_f,)
self.gamma = None
g_init =dict(scale=1.0, mode='const')
self._register_param(gamma, sh, 'gamma', init_kwargs=g_init)
self.mean = None
m_init =dict(scale=0.0, mode='const')
self._register_param(mean, sh, 'mean', init_kwargs=m_init)
self.std = None
s_init =dict(scale=1.0, mode='const')
self._register_param(std, sh, 'std', init_kwargs=s_init)
else:
if batch_normalisation is not False:
raise ValueError("Unknown value %s for batchnormalisation" %batch_normalisation)
# Dropout ##############################################################
self.dropout_rate = None
if dropout_rate:
value = as_floatX(dropout_rate)
self._register_param(value, (1,), 'dropout_rate')
# GradNet ##############################################################
self.gradnet_rate = None
if gradnet_rate:
value = as_floatX(gradnet_rate)
self._register_param(value, (1,), 'gradnet_rate')
###############################################################################
class Perceptron(NeuralLayer):
"""
Perceptron Layer.
Parameters
----------
parent: Node or list of Node
The input node(s).
n_f: int
Number of filters (nodes) in layer.
activation_func: str
Activation function name.
flatten: bool
If True, the input is flattened to shape (b, f) (all non-batch axes are
collapsed into the feature axis) before the dot product.
batch_normalisation: str or False
Batch normalisation mode.
Can be False (inactive), "train", "fadeout" or "predict".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Perceptron name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initialises gamma parameter.
mean
(For batch normalisation) Initialises mean parameter.
std
(For batch normalisation) Initialises std parameter.
gradnet_mode
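Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory and the shapes below
are illustrative assumptions, not verified output):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 20), 'b,f', name='inp')
>>> hid = neuromancer.Perceptron(inp, 200, activation_func='relu')
>>> out = neuromancer.Perceptron(hid, 2, activation_func='sigmoid')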
""" # TODO: Write docs on batch normalisation modes.
# TODO: gradnet_mode seems to be unused. Should it be removed?
def __init__(self, parent, n_f, activation_func='relu',
flatten=False, batch_normalisation=False, dropout_rate=0,
name="dot", print_repr=True, w=None, b=None, gamma=None,
mean=None, std=None, gradnet_mode=None):
super(Perceptron, self).__init__(parent, name, print_repr)
self.n_f = n_f
self.activation_func = activation_func
self.batch_normalisation = batch_normalisation
self.gradnet_mode = gradnet_mode
self.axis = parent.shape.tag2index('f') #retrieve feature shape's index
self.flatten = flatten
self.spatial_axes = parent.shape.spatial_axes
if flatten:
n_in = parent.shape.stripbatch_prod
else:
n_in = parent.shape['f'] #retrieve feature shape
w_sh = (n_in, n_f)
self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)
def _make_output(self):
"""
Computation of Theano output.
"""
if self.flatten:
if self.axis != 1:
raise NotImplementedError("Cannot flatten tensor for "
"Perceptron layer when batchsize is "
"not on first axis")
input_tensor = self.parent.output.flatten(2)
pattern = ['x', 0]
else:
input_tensor = self.parent.output
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r = int(activation_func.split(" ")[1])
assert r >= 2
self.n_f //= r  # maxout reduces the number of output features by factor r
if activation_func=='prelu':
b = self.b[:,0].dimshuffle(pattern)
b1 = self.b[:,1].dimshuffle(pattern)
else:
b = self.b.dimshuffle(pattern)
b1 = None
lin_output = computations.dot(input_tensor, self.w, self.axis)
if self.batch_normalisation in ['train', 'fadeout']:
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch normalisation mode 'fadeout' does not "
"work for less than 50%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
lin_output = computations.apply_activation(lin_output, activation_func, b1)
if self.dropout_rate:
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
lin_output = lin_output * dropout_gate.dimshuffle(pattern)
self.output = lin_output
def _calc_shape(self):
"""
Calculate shape from parent shape and n_f and set it as self.shape.
"""
sh = self.parent.shape
if self.flatten:
self.shape = TaggedShape((sh['b'], self.n_f), 'b,f')
else:
self.shape = sh.updateshape('f', self.n_f)
def _calc_comp_cost(self):
"""
Calculate abstract computational cost from parent shape and n_f and
set it as self.computational_cost.
"""
n = self.parent.shape.stripnone_prod
self.computational_cost = n * self.n_f
def make_dual(self, parent, share_w=False, **kwargs):
"""
Create the inverse of this ``Perceptron``.
Most options are the same as for the layer itself.
If ``kwargs`` are not specified, the values of the primal
layers are re-used and new parameters are created.
Parameters
----------
parent: Node
The input node.
share_w: bool
If the weights (``w``) should be shared from the primal layer.
kwargs: dict
kwargs that are passed through to the constructor of the inverted
Perceptron (see signature of ``Perceptron``).
``n_f`` is copied from the existing node on
which ``make_dual`` is called.
Every other parameter can be changed from the original
``Perceptron``'s defaults by specifying it in ``kwargs``.
Returns
-------
Perceptron
The inverted perceptron layer.
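Examples
--------
A hedged sketch of an autoencoder-style pairing (``neuromancer.Input``
and the layer sizes are illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 20), 'b,f', name='inp')
>>> enc = neuromancer.Perceptron(inp, 50)
>>> dec = enc.make_dual(enc, share_w=True)  # maps 50 features back to 20, reusing enc.w transposed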
"""
if self.flatten:
raise NotImplementedError("Cannot make dual Layer for flattened "
"Perceptron layer.")
dropout_rate = 0.0 if not self.dropout_rate else self.dropout_rate.get_value()
defaults = dict(activation_func=self.activation_func,
batch_normalisation=self.batch_normalisation,
dropout_rate=dropout_rate,
name=self.name+'.T',
print_repr=self._print_repr,
w=None, b=None,gamma=None, mean=None, std=None)
defaults.update(kwargs)
kwargs = defaults
if share_w:
if kwargs['w'] is not None:
logger.debug("Ignoring passed w because w is shared from primal Layer.")
kwargs['w'] = self.w.T
n_f = self.parent.shape['f'] # This is the output of the dual Layer
if self.n_f != parent.shape['f']: # input of dual Layer
raise ValueError("Cannot make dual layer of:\n"
"%s\n"
"with input: %s!\n"
"The output shape of the input for the dual layer "
"must match the the input shape of the primal layer."\
%(self, parent))
return Perceptron(parent, n_f, **kwargs)
def __repr__(self):
s = super(NeuralLayer, self).__repr__()
s += "\n"
s += " n_f=%i, " %(self.n_f,)
s += "act='%s', " %(self.activation_func,)
if self.flatten:
s += "input was flattened, "
if self.dropout_rate:
s += "dropout rate = %.1f, "%(self.dropout_rate.get_value())
if self.batch_normalisation:
s += "BN in '%s' mode "%(self.batch_normalisation,)
return s
Dot = Perceptron
###############################################################################
class Conv(Perceptron):
"""
Convolutional layer with subsequent pooling.
Examples
--------
Examples for constructing convolutional neural networks can be found
in examples/neuro3d.py and examples/mnist.py.
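A minimal sketch (the ``neuromancer.Input`` factory, shapes, filter and
pooling factors below are illustrative assumptions, not taken from those
files):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 22, 141, 141), 'b,f,z,x,y', name='raw')
>>> out = neuromancer.Conv(inp, 12, (1, 6, 6), (1, 2, 2))  # valid conv, then 2x2 max-pooling in x, y
>>> out = neuromancer.Conv(out, 24, (1, 5, 5), (1, 2, 2))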
Parameters
----------
parent: Node
The input node.
n_f: int
Number of features.
filter_shape: tuple
Shape of the convolution filter kernels.
pool_shape: tuple
Shape of max-pooling to be applied after the convolution.
``None`` (default) disables pooling along all axes.
conv_mode: str
Possible values:
* "valid": Only apply filter to complete patches of the image.
Generates output of shape: image_shape - filter_shape + 1.
* "full": Zero-pads the image by filter_shape - 1 on each side to generate
output of shape: image_shape + filter_shape - 1.
* "same": Zero-pads input image so that the output shape
is equal to the input shape
(Only supported for odd filter sizes).
activation_func: str
Activation function name.
mfp: bool
Whether to apply Max-Fragment-Pooling in this Layer.
batch_normalisation: str or False
Batch normalisation mode.
Can be False (inactive), "train", "fadeout" or "predict".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Layer name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initialises gamma parameter.
mean
(For batch normalisation) Initialises mean parameter.
std
(For batch normalisation) Initialises std parameter.
gradnet_mode
invalidate_fov: bool
Overrides the computed ``fov`` with an invalid value to
force later recalculation (experimental).
"""
def __init__(self, parent, n_f, filter_shape, pool_shape=None,
conv_mode='valid', activation_func='relu',
mfp=False, batch_normalisation=False, dropout_rate=0,
name="conv", print_repr=True, w=None, b=None, gamma=None,
mean=None, std=None, gradnet_mode=None, invalidate_fov=False):
super(Perceptron, self).__init__(parent, name, print_repr)
self.n_f = n_f
self.filter_shape = filter_shape
self.pool_shape = pool_shape
self.conv_mode = conv_mode
self.activation_func = activation_func
self.batch_normalisation = batch_normalisation
self.gradnet_mode = gradnet_mode
self.mfp = mfp
self.strides = parent.shape.strides
self.mfp_offsets = parent.shape.mfp_offsets
self.axis = parent.shape.tag2index('f') #retrieve feature shape's index
self.axis_order = None
self.invalidate_fov = invalidate_fov
if pool_shape is None: # Default to no pooling
pool_shape = tuple([1 for _ in filter_shape]) # e.g. (1, 1, 1) for the 3D case
self.pool_shape = pool_shape
self.spatial_axes = self.parent.shape.spatial_axes
conv_dim = len(self.spatial_axes)
x_dim = len(self.parent.shape)
if len(self.spatial_axes)!=len(filter_shape) or \
len(filter_shape)!=len(pool_shape):
raise ValueError("The filter_shape dimensionality (%i), the number "
"of spatial dimensions in the input (%i) and "
"the dimensionality of pool_shape (%i) differ! "
"Use filter size 1 on axes which should not be "
"convolved."\
%(len(filter_shape), conv_dim, len(pool_shape)))
n_in = parent.shape['f'] #retrieve feature shape
fail = False
if conv_dim==1:
if x_dim!=3 or self.spatial_axes!=[2]:
fail = True
w_sh = [n_f, n_in] + list(filter_shape)
elif conv_dim==2:
if x_dim!=4 or self.spatial_axes!=[2,3]:
fail = True
w_sh = [n_f, n_in] + list(filter_shape)
elif conv_dim==3:
if x_dim!=5:
fail = True
if self.spatial_axes==[2,3,4]:
self.axis_order = 'dnn'
w_sh = [n_f, n_in] + list(filter_shape)
elif self.spatial_axes==[1,3,4]:
self.axis_order = 'theano'
w_sh = [n_f, filter_shape[0], n_in] + list(filter_shape[1:])
else:
fail = True
if fail:
raise NotImplementedError("Cannot convolve non-standard shapes / axis orders. "
"Implement reshaping before conv "
"and re-reshaping after!")
self.conv_dim = conv_dim
self.w_sh = w_sh
gradnet_rate = 1.0 if gradnet_mode else None
self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate,
pool_shape, gradnet_rate)
def _make_output(self):
"""
Computation of Theano output.
"""
input_tensor = self.parent.output
input_shape = list(self.parent.shape)
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r = int(activation_func.split(" ")[1])
assert r >= 2
self.n_f //= r  # maxout reduces the number of output features by factor r
if activation_func=='prelu':
b = self.b[:,0].dimshuffle(pattern)
b1 = self.b[:,1].dimshuffle(pattern)
else:
b = self.b.dimshuffle(pattern)
b1 = None
lin_output = computations.conv(input_tensor, self.w, self.axis_order,
border_mode=self.conv_mode,
x_shape=input_shape, w_shape=self.w_sh)
if self.mfp:
if self.input_nodes[0].shape['b']!=1:
raise ValueError("For MFP the batchsize of the raw image input must be 1.")
lin_output, offsets_new, strides_new = computations.fragmentpool(lin_output,
self.pool_shape,
self.mfp_offsets,
self.strides,
self.spatial_axes)
self.mfp_offsets = offsets_new
self.strides = strides_new
else:
lin_output = computations.pooling(lin_output, self.pool_shape, self.spatial_axes)
self.strides = np.multiply(self.pool_shape, self.strides)
if self.batch_normalisation in ['train', 'fadeout']:
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch Normalisation mode 'fadeout' does not "
"work for less than 50%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
lin_output = computations.apply_activation(lin_output, activation_func, b1)
if self.dropout_rate:
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
lin_output *= dropout_gate #.dimshuffle(('x', 0))
self.output = lin_output
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent.shape
for j,(i,f,p) in enumerate(zip(self.spatial_axes, self.filter_shape, self.pool_shape)):
if self.conv_mode=='valid':
k = 1 - f
elif self.conv_mode=='full':
k = f - 1
elif self.conv_mode=='same':
assert f % 2 == 1, '"same" mode is currently only supported for uneven filter sizes.'
k = 0
s = (sh[i] + k)//p
if self.mfp:
if (sh[i] + k - p + 1)%p!=0:
raise ValueError("Cannot pool spatial axis '%s' of length %i "
"by factor %i after convolving with "
"kernel of size %i and using MFP."\
%(sh.tags[i], sh[i], p, f))
else:
if (sh[i] + k)%p!=0:
raise ValueError("Cannot pool spatial axis '%s' of length %i "
"by factor %i after convolving with "
"kernel of size %i."\
%(sh.tags[i], sh[i], p, f))
sh = sh.updateshape(i, s)
if sh.fov[j]>0 and not self.invalidate_fov:
fov = sh.fov[j] + (f+p-2) * sh.strides[j]
else:
fov = -1
sh = sh.updatefov(j, fov)
if self.mfp:
sh = sh.updatemfp_offsets(self.mfp_offsets)
sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult')
sh = sh.updatestrides(self.strides)
sh = sh.updateshape('f', self.n_f)
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
"""
sh = self.parent.shape
n_position = 1
for i,f,p in zip(self.spatial_axes, self.filter_shape, self.pool_shape):
s = 1 - f if self.conv_mode=='valid' else f -1
n_position *= sh[i] + s
b = 1 if sh['b'] is None else sh['b']
self.computational_cost = np.prod(self.w_sh) * n_position * b
def make_dual(self, parent, share_w=False, mfp=False, **kwargs):
"""
Create the inverse (``UpConv``) of this ``Conv`` node.
Most options are the same as for the layer itself.
If ``kwargs`` are not specified, the values of the primal
layers are re-used and new parameters are created.
Parameters
----------
parent: Node
The input node.
share_w: bool
If the weights (``w``) should be shared from the primal layer.
mfp: bool
If max-fragment-pooling is used.
kwargs: dict
kwargs that are passed through to the new ``UpConv`` node (see
signature of ``UpConv``).
``n_f`` and ``pool_shape`` are copied from the existing node on
which ``make_dual`` is called.
Every other parameter can be changed from the original
``Conv``'s defaults by specifying it in ``kwargs``.
Returns
-------
UpConv
The inverted conv layer (as an ``UpConv`` node).
"""
if mfp:
parent = FragmentsToDense(parent, print_repr=False)
dropout_rate = 0.0 if not self.dropout_rate else self.dropout_rate.get_value()
defaults = dict(conv_mode='valid', activation_func=self.activation_func,
batch_normalisation=self.batch_normalisation,
dropout_rate=dropout_rate,
name=self.name+'.T',
print_repr=self._print_repr,
w=None, b=None,gamma=None, mean=None, std=None)
defaults.update(kwargs)
kwargs = defaults
if share_w:
if kwargs['w'] is not None:
logger.debug("Ignoring passed w because w is shared from primal Layer.")
w = self.w
# Exchange n_in and n_f
swap = (0,2) if (self.conv_dim==3 and self.axis_order=='theano') else (0,1)
w = T.swapaxes(w, *swap)
kwargs['w'] = w
n_f = self.parent.shape['f'] # This is the output of the dual Layer
if self.w_sh[0] != parent.shape['f']: # input of dual Layer
raise ValueError("Cannot make dual layer of:\n"
"%s\n"
"with input: %s!\n"
"The output shape of the input for the dual layer "
"must match the the input shape of the primal layer."\
%(self, parent))
return UpConv(parent, n_f, self.pool_shape, **kwargs)
def __repr__(self):
s = super(NeuralLayer, self).__repr__()
s += "\n"
s += " n_f=%i, " %(self.n_f,)
s += "%id conv, kernel=%s, pool=%s, "\
%(self.conv_dim, self.filter_shape, self.pool_shape)
s += "act='%s', " %(self.activation_func,)
if self.dropout_rate:
s += "Dropout rate=%.1f, "%(self.dropout_rate.get_value())
if self.batch_normalisation:
s += "BN in '%s' mode "%(self.batch_normalisation,)
if self.mfp:
s += "MFP active, "
return s
###############################################################################
class FragmentsToDense(Node):
def __init__(self, parent, name="to_dense", print_repr=True):
super(FragmentsToDense, self).__init__(parent, name, print_repr)
def _make_output(self):
"""
Computation of Theano output.
"""
fragments = self.parent.output
sh = self.parent.shape
if sh['b']!=len(sh.mfp_offsets) or sh['b']!=np.prod(sh.strides):
raise ValueError("Need %i fragments on the batch axis. "
"Is MFP active at all?" %np.prod(sh.strides))
self.output = computations.fragments2dense(fragments, sh.mfp_offsets,
sh.strides, sh.spatial_axes)
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent.shape
for ax, st in zip(sh.spatial_axes, sh.strides):
sh = sh.updateshape(ax, st, mode='mult')
sh = sh.updateshape('b', 1)
new_strides = np.ones(len(sh.spatial_axes), dtype=int)
new_offsets = np.zeros((1, len(sh.spatial_axes)), dtype=int)
self.shape = TaggedShape(sh.shape, sh.tags, new_strides,
new_offsets, sh.fov)
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
For this Node type this is hard-coded to 0.
"""
self.computational_cost = 0
###############################################################################
###############################################################################
class UpConv(Conv):
"""
Upconvolution layer. Also known as transposed convolution.
See http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic
E.g. pooling + upconv with pool_shape = 3::
x x x x x x x x x before pooling (not in this layer)
\|/ \|/ \|/ pooling (not in this layer)
x x x input to this layer
0 0 x 0 0 x 0 0 x 0 0 unpooling + padding (done in this layer)
/|\ /|\ /|\ conv on unpooled (done in this layer)
y y y y y y y y y result of this layer
Parameters
----------
parent: Node
The input node.
n_f: int
Number of filters (nodes) in layer.
pool_shape: tuple
Upsampling factors per spatial axis (also used as the filter shape of the
upconvolution).
activation_func: str
Activation function name.
identity_init: bool
Initialise weights such that the upconvolution initially performs
pixel-repetition upsampling.
batch_normalisation: str or False
Batch normalisation mode.
Can be False (inactive), "train", "fadeout" or "predict".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Layer name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initialises gamma parameter.
mean
(For batch normalisation) Initialises mean parameter.
std
(For batch normalisation) Initialises std parameter.
gradnet_mode
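Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory and the shapes are
illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 20, 64, 64), 'b,f,z,x,y', name='raw')
>>> down = neuromancer.Conv(inp, 16, (1, 3, 3), (1, 2, 2))  # downsample x, y by 2
>>> up = neuromancer.UpConv(down, 16, (1, 2, 2))            # upsample x, y by 2 again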
"""
def __init__(self, parent, n_f, pool_shape, activation_func='relu',
identity_init=True, batch_normalisation=False, dropout_rate=0,
name="upconv", print_repr=True, w=None, b=None, gamma=None,
mean=None, std=None, gradnet_mode=None):
filter_shape = pool_shape
super(UpConv, self).__init__(parent, n_f, filter_shape, pool_shape,
'valid', activation_func, mfp=False,
batch_normalisation=batch_normalisation,
dropout_rate=dropout_rate, name=name,
print_repr=print_repr, w=w, b=b,
gamma=gamma, mean=mean, std=std,
gradnet_mode=gradnet_mode)
if identity_init:
try:
w_val = self.w.get_value() * 0.1
s = np.minimum(w_val.shape[0], w_val.shape[1])
s = np.arange(s)
w_val[s,s] = 1.0
self.w.set_value(w_val)
self.b.set_value(self.b.get_value()*0.0)
except Exception:
logger.warning("identity_init failed")
def _make_output(self):
"""
Computation of Theano output.
"""
input_tensor = self.parent.output
input_shape = list(self.parent.shape)
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r = int(activation_func.split(" ")[1])
assert r >= 2
self.n_f //= r  # maxout reduces the number of output features by factor r
if activation_func=='prelu':
b = self.b[:,0].dimshuffle(pattern)
b1 = self.b[:,1].dimshuffle(pattern)
else:
b = self.b.dimshuffle(pattern)
b1 = None
spax = self.spatial_axes
pool = np.array(self.pool_shape)
input_shape_up = np.array(input_shape)
if len(spax)==3 and not computations.dnn_avail:
unpooled = computations.unpooling(input_tensor, self.pool_shape, self.spatial_axes)
self._debug_outputs.append(unpooled)
input_shape_up[spax] = input_shape_up[spax] * pool + pool - 1
input_shape_up = list(input_shape_up)
lin_output = computations.conv(unpooled, self.w, self.axis_order,
border_mode=self.conv_mode,
x_shape=input_shape_up, w_shape=self.w_sh)
else:
input_shape_up[spax] = input_shape_up[spax] * pool
input_shape_up = list(input_shape_up)
w = T.swapaxes(self.w, 0, 1)
w_sh = list(self.w_sh)
w_sh[0], w_sh[1] = w_sh[1], w_sh[0]
lin_output = computations.upconv(input_tensor, w, self.pool_shape,
x_shape=input_shape_up,
w_shape=w_sh,
axis_order='dnn')
if self.batch_normalisation in ['train', 'fadeout']:
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch normalisation mode 'fadeout' does not "
"work for less than 50%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
lin_output = computations.apply_activation(lin_output, activation_func, b1)
if self.dropout_rate:
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=lin_output.shape, n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
lin_output *= dropout_gate #.dimshuffle(('x', 0))
self.output = lin_output
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
self.strides = np.divide(self.strides,self.pool_shape)
sh = self.parent.shape
for j,(i,f,p) in enumerate(zip(self.spatial_axes, self.filter_shape, self.pool_shape)):
if self.conv_mode == 'valid':
k = 1 - f
elif self.conv_mode == 'full':
k = f - 1
elif self.conv_mode == 'same':
k = 0
else:
raise ValueError('{}: Invalid conv_mode {}'.format(self.name, self.conv_mode))
s = (sh[i] * p) + p - 1 + k # unpool with margin then apply conv
sh = sh.updateshape(i, s)
# Unpooling creates asymmetric FOV (left/right is different for
# some neurons), therefore we flag the FOV as exceptional with '-1'
sh = sh.updatefov(j, -1)
sh = sh.updateshape('f', self.n_f)
sh = sh.updatestrides(self.strides)
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
"""
sh = self.parent.shape
n_position = 1
for i,f,p in zip(self.spatial_axes, self.filter_shape, self.pool_shape):
s = 1 - f if self.conv_mode=='valid' else f -1
n_position *= (sh[i] * p) + s
b = 1 if sh['b'] is None else sh['b']
self.computational_cost = np.prod(self.w_sh) * n_position * b
def __repr__(self):
s = super(NeuralLayer, self).__repr__()
s += "\n"
s += " n_f=%i, " %(self.n_f,)
s += "%id upconv, kernel=%s, pool=%s, "\
%(self.conv_dim, self.filter_shape, self.pool_shape)
s += "act='%s', " %(self.activation_func,)
if self.dropout_rate:
s += "Dropout rate=%.1f, "%(self.dropout_rate.get_value())
if self.batch_normalisation:
s += "BN in '%s' mode "%(self.batch_normalisation,)
return s
def make_dual(self, *args, **kwargs):
raise NotImplementedError("Use Conv instead?")
class Crop(Node):
"""
This node type crops the output of its parent.
Parameters
----------
parent: Node
The input node whose output should be cropped.
crop: tuple or list of ints
Crop each spatial axis from either side by this number.
name: str
Node name.
print_repr: bool
Whether to print the node representation upon initialisation.
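Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory and the shapes are
illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 20, 64, 64), 'b,f,z,x,y', name='raw')
>>> cropped = Crop(inp, (1, 4, 4))  # spatial shape (20, 64, 64) -> (18, 56, 56)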
""" # TODO: Write an example
def __init__(self, parent, crop, name="crop", print_repr=True):
super(Crop, self).__init__(parent, name, print_repr)
self.crop=crop
def _make_output(self):
"""
Computation of Theano output.
"""
# It is assumed that all other dimensions are matching
cropper = []
k = 0
for i,s in enumerate(self.parent.shape):
if i in self.parent.shape.spatial_axes:
off = self.crop[k]
cropper.append(slice(off, s-off))
k += 1
else:
cropper.append(slice(None))
cropper = tuple(cropper)
self.output = self.parent.output[cropper]
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent.shape.copy()
k = 0
for i,s in enumerate(self.parent.shape):
if i in self.parent.shape.spatial_axes:
off = self.crop[k]
sh = sh.updateshape(i,s-2*off)
k += 1
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
For this Node type this is hard-coded to 0.
"""
self.computational_cost = 0
# TODO: Implement for axis orders != ['b', 'f', 'z', 'x', 'y']
# TODO: Support batch_size != 1
class Pad(Node):
"""
Pads the spatial axes of its parent's output.
Parameters
----------
parent: Node
The input node whose output should be padded.
pad: tuple or list of ints
The padding length from either side for each spatial axis
value: float
Value of the padding elements (default: 0.0)
name: str
Node name.
print_repr: bool
Whether to print the node representation upon initialisation.
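Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory and the shapes are
illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 20, 64, 64), 'b,f,z,x,y', name='raw')
>>> padded = Pad(inp, (1, 4, 4))  # spatial shape (20, 64, 64) -> (22, 72, 72)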
""" # TODO: Write an example
def __init__(self, parent, pad, value=0.0, name='pad', print_repr=True):
super(Pad, self).__init__(parent, name, print_repr)
self.pad = pad
self.value = value
if parent.shape.tags != ['b', 'f', 'z', 'x', 'y']:
raise NotImplementedError(
'Padding is currently only implemented for "b,f,z,x,y" axis order.'
'\nParent has axes {}'.format(parent.shape.tags)
)
def _make_output(self):
"""
Computation of Theano output.
"""
paddedshape = []
k = 0
for i,s in enumerate(self.parent.shape):
if i in self.parent.shape.spatial_axes:
pad = self.pad[k]
paddedshape.append(s + 2 * pad)
k += 1
else:
if s is None:
s = 1
if config.pad_b_warning_display:
logger.warning('Pad: Assumed b=1. This breaks if batch_size != 1.')
config.pad_b_warning_display = False
paddedshape.append(s)
paddedshape = tuple(paddedshape)
padded_empty = T.zeros(paddedshape, self.parent.output.dtype) + self.value
pz, px, py = self.pad
padded = T.set_subtensor(
padded_empty[
:, :,
pz:-pz,
px:-px,
py:-py
],
self.parent.output
)
self.output = padded
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent.shape.copy()
k = 0
for i,s in enumerate(self.parent.shape):
if i in self.parent.shape.spatial_axes:
off = self.pad[k]
sh = sh.updateshape(i,s+2*off)
k += 1
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
For this Node type this is hard-coded to 0.
"""
self.computational_cost = 0
def AutoMerge(parent1, parent2, upconv_n_f=None, merge_mode='concat',
disable_upconv=False, upconv_kwargs=None,
name='merge', print_repr=True):
"""
Merges two network branches by automatic cropping and upconvolutions.
(Wrapper for
:py:class:`UpConv <elektronn2.neuromancer.neural.UpConv>`,
:py:class:`Crop <elektronn2.neuromancer.neural.Crop>`,
:py:class:`Concat <elektronn2.neuromancer.node_basic.Concat>` and
:py:class:`Add <elektronn2.neuromancer.node_basic.Add>`.)
Tries to automatically align and merge a high-res and a low-res
(convolution) output of two branches of a CNN by applying UpConv and Crop to
make their shapes and strides compatible.
UpConv is used if the low-res parent's strides are at least twice as large
as the strides of the high-res parent in any dimension.
The parents are automatically identified as high-res and low-res by their strides.
If both parents have the same strides, the concept of high-res and low-res is
ignored and this function just crops the larger parent's output until the
parents' spatial shapes match and then merges them.
This function can be used to simplify creation of e.g. architectures similar to
U-Net (see https://arxiv.org/abs/1505.04597) or skip-connections.
If a ValueError that the shapes cannot be aligned is thrown,
you can try changing the filter shapes and pooling factors of the
(grand-)parent nodes or add/remove Convolutions and Crops in the preceding
branches until the error disappears (of course you should try to keep
those changes as minimal as possible).
(``UpConvMerge`` is an alias for this function.)
Parameters
----------
parent1: Node
First parent to be merged.
parent2: Node
Second parent to be merged.
upconv_n_f: int
Number of filters for the aligning ``UpConv`` for the low-res parent.
merge_mode: str
How the merging should be performed. Available options:
'concat' (default): Merge with a ``Concat`` node.
'add': Merge with an ``Add`` node.
disable_upconv: bool
If ``True``, no automatic upconvolutions are performed to match strides.
upconv_kwargs: dict
Additional keyword arguments that are passed to the
``UpConv`` constructor if upconvolution is applied.
name: str
Name of the final merge node.
print_repr: bool
Whether to print the node representation upon initialisation.
Returns
-------
Concat or Add
``Concat`` or ``Add`` node (depending on ``merge_mode``)
that merges the aligned high-res and low-res outputs.
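Examples
--------
A hedged sketch of a U-Net-style skip connection (the ``neuromancer.Input``
factory, shapes and filter counts are illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 20, 66, 66), 'b,f,z,x,y', name='raw')
>>> hi = neuromancer.Conv(inp, 16, (1, 3, 3), (1, 1, 1))   # high-res branch (no pooling)
>>> lo = neuromancer.Conv(hi, 32, (1, 3, 3), (1, 2, 2))    # low-res branch (pooled)
>>> merged = AutoMerge(hi, lo, upconv_n_f=32, merge_mode='concat')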
"""
###TODO exchange UpConv and Crop to save computation in some cases
assert len(parent1.shape)==len(parent2.shape)
assert parent1.shape.spatial_axes == parent2.shape.spatial_axes
if any(parent2.shape.strides // parent1.shape.strides < 1):
lo_res, hi_res = parent1, parent2
else:
hi_res, lo_res = parent1, parent2
sh_hi = hi_res.shape
sh_lo = lo_res.shape
unpool = sh_lo.strides // sh_hi.strides
if np.any(unpool > 1) and not disable_upconv:
if upconv_n_f is None:
raise ValueError('AutoMerge is trying to insert an UpConv node, but '
'upconv_n_f is not defined. Please set it to the '
'desired number of features to be used for UpConv.')
logger.info(
'AutoMerge assigned parent roles:\n'
'- {}: is lo_res (strides {})\n- {}: is hi_res (strides {})'.format(
lo_res.name, lo_res.shape.strides, hi_res.name, hi_res.shape.strides
))
if upconv_kwargs is None:
upconv_kwargs = {}
lo_res = UpConv(lo_res, upconv_n_f, unpool, **upconv_kwargs)
logger.info(
'AutoMerge of {} and {}: Inserted UpConv with pool_shape {}'.format(
hi_res.name, lo_res.name, unpool
))
# Now both have same stride
# Shapes may have changed
sh_hi = hi_res.shape.spatial_shape
sh_lo = lo_res.shape.spatial_shape
crop_lo = []
crop_hi = []
for i in range(len(sh_hi)):
diff = sh_hi[i] - sh_lo[i] # difference in original space
if diff % 2!=0:
raise ValueError("hi_res and lo_res maps cannot "
"be aligned with shapes:\n%s\n%s" % (sh_hi,sh_lo))
if diff > 0:
crop_hi.append(diff // 2 )
crop_lo.append(0)
else:
crop_lo.append(-diff // 2)
crop_hi.append(0)
if np.any(crop_lo):
lo_res = Crop(lo_res, crop_lo, print_repr=print_repr)
if np.any(crop_hi):
hi_res = Crop(hi_res, crop_hi, print_repr=print_repr)
if merge_mode == 'concat':
out = Concat((lo_res, hi_res), axis='f', name=name, print_repr=print_repr)
elif merge_mode == 'add':
out = Add(lo_res, hi_res, name=name, print_repr=print_repr)
else:
raise ValueError('Invalid "merge_mode". Should be "add" or "concat".')
return out
UpConvMerge = AutoMerge
class Pool(Node):
"""
Pooling layer.
Reduces the spatial size of its input by the factors given in
``pool_shape``, thereby reducing the amount of computation (and, for
subsequent fully connected layers, the number of trainable parameters).
Pooling modes other than max-pooling can only be selected if cuDNN is
available.
Parameters
----------
parent: Node
The input node.
pool_shape: tuple
Tuple of pooling factors (per dimension) by which the input
is downsampled.
stride: tuple
Stride sizes (per dimension).
mfp: bool
If max-fragment-pooling should be used.
mode: str
(only if cuDNN is available)
Mode can be any of the modes supported by Theano's dnn_pool():
('max', 'average_inc_pad', 'average_exc_pad', 'sum').
'max' (default): max-pooling
'average' or 'average_inc_pad': average-pooling
'sum': sum-pooling
name: str
Name of the pooling layer.
print_repr: bool
Whether to print the node representation upon initialisation.
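Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory and the shapes are
illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 1, 20, 64, 64), 'b,f,z,x,y', name='raw')
>>> pooled = Pool(inp, (1, 2, 2))  # 2x2 max-pooling in x, y; z untouched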
"""
def __init__(self, parent, pool_shape, stride=None, mfp=False, mode='max',
name="pool", print_repr=True):
super(Pool, self).__init__(parent, name, print_repr)
if mfp and stride is not None:
raise ValueError("Cannot use custom stride and MFP together")
if stride is None:
stride = pool_shape
if mode == 'average':
mode = 'average_inc_pad' # Theano's internal name. 'average' is deprecated.
self.pool_shape = pool_shape
self.pool_stride = stride
self.mfp = mfp
self.mode = mode
self.strides = parent.shape.strides
self.mfp_offsets = parent.shape.mfp_offsets
self.axis = parent.shape.tag2index('f') #retrieve feature shape's index
self.axis_order = None
spatial_axes = self.parent.shape.spatial_axes
conv_dim = len(pool_shape)
x_dim = len(self.parent.shape)
n_in = parent.shape['f'] #retrieve feature shape
fail = False
if conv_dim==1:
if x_dim!=3 or spatial_axes!=[2]:
fail = True
elif conv_dim==2:
if x_dim!=4 or spatial_axes!=[2,3]:
fail = True
elif conv_dim==3:
if x_dim!=5:
fail = True
if spatial_axes==[2,3,4]:
self.axis_order = 'dnn'
elif spatial_axes==[1,3,4]:
self.axis_order = 'theano'
else:
fail = True
if fail:
raise NotImplementedError("Cannot convolve non-standard shapes / axis orders. "
"Implement reshaping before conv "
"and re-reshaping afer!")
self.spatial_axes = spatial_axes
self.conv_dim = conv_dim
def _make_output(self):
"""
Computation of Theano output.
"""
input_tensor = self.parent.output
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
if self.mfp:
assert self.pool_stride == self.pool_shape
if self.input_nodes[0].shape['b']!=1:
raise ValueError("For MFP the batchsize of the raw image input must be 1.")
lin_output, offsets_new, strides_new = computations.fragmentpool(input_tensor,
self.pool_shape,
self.mfp_offsets,
self.strides,
self.spatial_axes,
mode=self.mode)
self.mfp_offsets = offsets_new
self.strides = strides_new
else:
lin_output = computations.pooling(input_tensor,self.pool_shape,
self.spatial_axes, stride=self.pool_stride,
mode=self.mode)
self.strides = np.multiply(self.pool_stride, self.strides)
self.output = lin_output
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent.shape
for j,(i,p,st) in enumerate(zip(self.spatial_axes , self.pool_shape, self.pool_stride)):
tmp = sh[i] - p + st - 1
s = tmp//st + 1
if self.mfp:
raise NotImplementedError("Check this first before use")
if (tmp - p + 1)%st!=0:
raise ValueError("Cannot downsample spatial axis '%s' of length %i "
"by factor %i with pool %i, and using MFP."\
%(sh.tags[i], sh[i], st, p))
else:
if (tmp+1)%st!=0:
raise ValueError("Cannot downsample spatial axis '%s' of length %i "
"by factor %i with pool %i."\
%(sh.tags[i], sh[i], st, p ))
sh = sh.updateshape(i, s)
if sh.fov[j]>0:
fov = sh.fov[j] + (p-1) * sh.strides[j]
else:
fov = -1
sh = sh.updatefov(j, fov)
if self.mfp:
sh = sh.updatemfp_offsets(self.mfp_offsets)
sh = sh.updateshape('b', np.prod(self.pool_shape), mode='mult')
sh = sh.updatestrides(self.strides)
self.shape = sh
class FaithlessMerge(Node):
"""
FaithlessMerge node.
Parameters
----------
hard_features: Node
easy_features: Node
axis
failing_prob: float
The higher this value, the more often the merge is unreliable.
hardeasy_ratio: float
The higher this value, the more often the hard features fail instead of
the easy ones.
name: str
Node name.
print_repr: bool
Whether to print the node representation upon initialisation.
"""
def __init__(self, hard_features, easy_features, axis='f', failing_prob=0.5,
hardeasy_ratio=0.8, name="faithless_merge", print_repr=True):
parent_nodes = (hard_features, easy_features)
super(FaithlessMerge, self).__init__(parent_nodes, name, print_repr)
if isinstance(axis, str):
self.axis = parent_nodes[0].shape.tag2index(axis)
else:
self.axis = axis
failing_prob = VariableParam(value=failing_prob,
name="failing_prob",
dtype=floatX,
apply_train=False)
hardeasy_ratio = VariableParam(value=hardeasy_ratio,
name="hardeasy_ratio",
dtype=floatX,
apply_train=False)
self.params['failing_prob'] = failing_prob
self.params['hardeasy_ratio'] = hardeasy_ratio
self.failing_prob = failing_prob
self.hardeasy_ratio = hardeasy_ratio
def _make_output(self):
"""
Computation of Theano output.
"""
# It is assumed that all other dimensions are matching
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
size = [1,] * self.parent[0].output.ndim
axes = list(range(self.parent[0].output.ndim))
not_failing = rng.binomial(size=size, n=1, p=self.failing_prob,
dtype=theano.config.floatX)
not_failing = T.addbroadcast(not_failing, *axes)
hard_fails = rng.binomial(size=size, n=1, p=1-self.hardeasy_ratio,
dtype=theano.config.floatX)
hard_fails = T.addbroadcast(hard_fails, *axes)
hard = self.parent[0].output * (1 - hard_fails * not_failing)
easy = self.parent[1].output * (1 - (1 - hard_fails) * not_failing)
self.output = T.concatenate([hard, easy], axis=self.axis)
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
joint_axis_size = reduce(lambda x, y: x + y.shape[self.axis],
self.parent, 0)
# assuming all other dimensions are equal
sh = self.parent[0].shape.updateshape(self.axis, joint_axis_size)
self.shape = sh
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
For this node type this is hard-coded to 0.
"""
self.computational_cost = 0
class GRU(NeuralLayer):
"""
Gated Recurrent Unit Layer.
Parameters
----------
parent: Node
The input node.
memory_state: Node
Memory node.
n_f: int
Number of features.
activation_func: str
Activation function name.
flatten: bool
(Unsupported).
batch_normalisation: str or False
Batch normalisation mode.
Can be False (inactive), "train" or "fadeout".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Layer name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
(Unsupported).
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
(Unsupported).
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initialises gamma parameter.
mean
(For batch normalisation) Initialises mean parameter.
std
(For batch normalisation) Initialises std parameter.
gradnet_mode
"""
def __init__(self, parent, memory_state, n_f, activation_func='tanh',
flatten=False, batch_normalisation=False, dropout_rate=0,
name="gru", print_repr=True, w=None, b=None,
gamma=None, mean=None, std=None, gradnet_mode=None):
parent_nodes = (parent, memory_state)
super(GRU, self).__init__(parent_nodes, name, print_repr)
self.n_f = n_f
self.n_f_memory = memory_state.shape['f']
self.activation_func = activation_func
self.batch_normalisation = batch_normalisation
self.gradnet_mode = gradnet_mode
self.axis = parent.shape.tag2index('f') #retrieve feature shape's index
self.spatial_axes = parent.shape.spatial_axes
self.flatten = flatten
if flatten:
raise NotImplementedError("Flatten is not yet supported for GRU.")
n_in = parent.shape.stripbatch_prod
else:
n_in = parent.shape['f']
if self.n_f_memory != n_f:
raise ValueError("n_f_memory != n_f not possible.")
if parent.shape.hastag('r'):
raise ValueError("Input must not have 'r' axis.")
n_comb = self.n_f_memory + n_in
if w is not None or b is not None:
raise NotImplementedError("Initial weights are not yet supported for GRU.")
w_sh = (n_comb, 3*n_f) # [h_t-1, x] x [W_z/x, W_r/x, W_h/x]
self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)
def _make_output(self):
"""
Computation of Theano output.
"""
parent = self.parent[0].output
memory = self.parent[1].output
pattern = ['x' for i in parent.shape]
pattern[self.axis] = 0
broad_caster_shape = list(parent.shape)
broad_caster_shape[self.axis] = self.n_f_memory
broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype)
memory = memory * broad_caster
input_tensor = T.concatenate([memory, parent] , axis=self.axis)
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r = int(activation_func.split(" ")[1])
assert r >= 2
self.n_f //= r  # maxout reduces the number of output features by factor r
if activation_func=='prelu':
b = self.b[:-self.n_f,0].dimshuffle(pattern)
b_h = self.b[-self.n_f:,0].dimshuffle(pattern)
b1 = self.b[:-self.n_f,1].dimshuffle(pattern)
b1_h = self.b[-self.n_f:,1].dimshuffle(pattern)
else:
b = self.b[:-self.n_f].dimshuffle(pattern)
b_h = self.b[-self.n_f:].dimshuffle(pattern)
b1 = None
b1_h = None
lin_output = computations.dot(input_tensor, self.w[:, :-self.n_f], self.axis)
if self.batch_normalisation in ['train', 'fadeout']:
raise NotImplementedError("Batch normalisation not yet supported for GRU.")
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch Normalisation mode 'fadeout' does not "
"work for less than 50%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
raise NotImplementedError("Batch normalisation not yet supported for GRU.")
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
act = computations.apply_activation(lin_output, 'sig', b1)
slice_obj = [slice(None) for i in range(act.ndim)]
slice_obj[self.axis] = slice(0, self.n_f)
z = act[slice_obj]
slice_obj[self.axis] = slice(self.n_f, None)
r = act[slice_obj]
gated_input = T.concatenate([r*memory, parent], axis=self.axis)
h_tilde = computations.dot(gated_input, self.w[:, -self.n_f:], self.axis)
h_tilde = (gamma / std) * h_tilde + b_h - (gamma * mean / std)
h_tilde = computations.apply_activation(h_tilde, activation_func, b1_h)
act = (1 - z) * memory + z * h_tilde
self._debug_outputs = [memory, act, z, r,]
if self.dropout_rate:
raise NotImplementedError("Dropout not yet supported for GRU.")
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
act = act * dropout_gate.dimshuffle(('x', 0))
self.output = act
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent[0].shape
if self.flatten:
self.shape = TaggedShape((sh['b'], self.n_f), 'b,f')
else:
self.shape = sh.updateshape('f', self.n_f)
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
"""
n = self.parent[0].shape.stripnone_prod
self.computational_cost = 3 * n * self.n_f
class LSTM(NeuralLayer):
"""
Long short-term memory layer.
Uses an implementation without peephole connections in f, i, o, i.e. the
cell state is not taken into account by the gate weights. See
http://colah.github.io/posts/2015-08-Understanding-LSTMs/.
Parameters
----------
parent: Node
The input node.
memory_states: Node
Concatenated (initial) feed-back and cell state (one Node!).
n_f: int
Number of features.
activation_func: str
Activation function name.
flatten: bool
(Unsupported).
batch_normalisation: str or False
Batch normalisation mode.
Can be False (inactive), "train" or "fadeout".
dropout_rate: float
Dropout rate (probability that a node drops out in a training step).
name: str
Layer name.
print_repr: bool
Whether to print the node representation upon initialisation.
w: np.ndarray or T.TensorVariable
Weight matrix.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
b: np.ndarray or T.TensorVariable
Bias vector.
If this is a np.ndarray, its values are used to initialise a
shared variable for this layer.
If it is a T.TensorVariable, it is directly used (weight sharing
with the layer which this variable comes from).
gamma
(For batch normalisation) Initialises gamma parameter.
mean
(For batch normalisation) Initialises mean parameter.
std
(For batch normalisation) Initialises std parameter.
gradnet_mode
"""
def __init__(self, parent, memory_states, n_f, activation_func='tanh',
flatten=False, batch_normalisation=False, dropout_rate=0,
name="lstm", print_repr=True, w=None, b=None,
gamma=None, mean=None, std=None, gradnet_mode=None):
parent_nodes = (parent, memory_states)
super(LSTM, self).__init__(parent_nodes, name, print_repr)
self.n_f = n_f
self.n_f_memory = memory_states.shape['f']
self.activation_func = activation_func
self.batch_normalisation = batch_normalisation
self.gradnet_mode = gradnet_mode
self.axis = parent.shape.tag2index('f') #retrieve feature shape's index
self.spatial_axes = parent.shape.spatial_axes
self.flatten = flatten
if flatten:
raise NotImplementedError("Flatten is not yet supported for LSTM.")
else:
n_in = parent.shape['f']
n_comb = n_f + n_in
if w is not None or b is not None:
raise NotImplementedError("Initial weights are not yet supported for LSTM.")
if self.n_f_memory != 2*n_f:
raise ValueError("n_f of memory_states must be 2*n_f.")
if parent.shape.hastag('r'):
raise ValueError("Input must not have 'r' axis.")
w_sh = (n_comb, 4*n_f) # f, i, o, C
self._setup_params(w_sh, w, b, gamma, mean, std, dropout_rate)
def _make_output(self):
"""
Computation of Theano output.
"""
parent = self.parent[0].output
memory = self.parent[1].output
broad_caster_shape = list(parent.shape)
broad_caster_shape[self.axis] = self.n_f_memory
broad_caster = T.ones(broad_caster_shape, dtype=memory.dtype)
memory = memory * broad_caster
slice_obj = [slice(None) for i in range(len(self.parent[1].shape))]
slice_obj[self.parent[1].shape.tag2index('f')] = slice(self.n_f)
feed_back = memory[slice_obj]
slice_obj[self.parent[1].shape.tag2index('f')] = slice(self.n_f, None)
cell_state = memory[slice_obj]
input_tensor = T.concatenate([feed_back, parent] ,
axis=self.axis) #h, x
pattern = ['x' for i in input_tensor.shape]
pattern[self.axis] = 0
activation_func = self.activation_func
if activation_func.startswith("maxout"):
r = int(activation_func.split(" ")[1])
assert r >= 2
self.n_f //= r  # maxout reduces the number of output features by factor r
if activation_func=='prelu':
b = self.b[:, 0].dimshuffle(pattern)
b1 = self.b[:, 1]
b1_f = b1[:self.n_f].dimshuffle(pattern)
b1_i = b1[self.n_f:2*self.n_f].dimshuffle(pattern)
b1_o = b1[2*self.n_f:3*self.n_f].dimshuffle(pattern)
b1_c = b1[3*self.n_f:].dimshuffle(pattern)
else:
b = self.b.dimshuffle(pattern)
b1_f = None
b1_i = None
b1_o = None
b1_c = None
lin_output = computations.dot(input_tensor, self.w, self.axis)
if self.batch_normalisation in ['train', 'fadeout']:
raise NotImplementedError("Batch normalisation not yet supported for LSTM.")
mean = computations.apply_except_axis(
lin_output,self.axis, T.mean).dimshuffle(pattern)
std = computations.apply_except_axis(
lin_output,self.axis, T.std).dimshuffle(pattern) + 1e-6
gamma = self.gamma.dimshuffle(pattern)
if self.batch_normalisation=='fadeout':
logger.warning("Batch Normalisation mode 'fadeout' does not "
"work for less than 50%...")
mean = self.gradnet_rate * mean
std = self.gradnet_rate * std + (1-self.gradnet_rate) * 1.0
gamma = self.gradnet_rate * gamma
self.mean.updates = (self.mean,
0.9995 * self.mean + 0.0005 * T.extra_ops.squeeze(mean))
self.std.updates = (self.std,
0.9995 * self.std + 0.0005 * T.extra_ops.squeeze(std))
elif self.batch_normalisation=='predict':
raise NotImplementedError("Batch normalisation not yet supported for LSTM.")
mean = self.mean.dimshuffle(pattern)
std = self.std.dimshuffle(pattern)
gamma = self.gamma.dimshuffle(pattern)
else:
mean = 0
std = 1
gamma = 1
lin_output = (gamma / std) * lin_output + b - (gamma * mean / std)
slice_obj = [slice(None) for i in range(lin_output.ndim)]
slice_obj[self.axis] = slice(self.n_f)
f = computations.apply_activation(lin_output[slice_obj], 'sig', b1_f)
slice_obj[self.axis] = slice(self.n_f, 2*self.n_f)
i = computations.apply_activation(lin_output[slice_obj], 'sig', b1_i)
slice_obj[self.axis] = slice(2*self.n_f, 3*self.n_f)
o = computations.apply_activation(lin_output[slice_obj], 'sig', b1_o)
slice_obj[self.axis] = slice(3*self.n_f, 4*self.n_f)
c_tilde = computations.apply_activation(lin_output[slice_obj], activation_func, b1_c)
cell_out = f * cell_state + i * c_tilde
lin_output = o * computations.apply_activation(cell_out, activation_func, None)
if self.dropout_rate:
raise NotImplementedError("Dropout not yet supported for LSTM.")
rng = T.shared_randomstreams.RandomStreams(int(time.time()))
p = 1 - self.dropout_rate
dropout_gate = rng.binomial(size=(self.n_f,), n=1, p=p,
dtype=theano.config.floatX)
dropout_gate *= 1.0 / p
lin_output = lin_output * dropout_gate.dimshuffle(('x', 0))
self.output = T.concatenate([lin_output, cell_out], axis=self.axis)
def _calc_shape(self):
"""
Calculate and set self.shape.
"""
sh = self.parent[0].shape
if self.flatten:
self.shape = TaggedShape((sh['b'], 2*self.n_f), 'b,f')
else:
self.shape = sh.updateshape('f',2* self.n_f)
def _calc_comp_cost(self):
"""
Calculate and set self.computational_cost.
"""
n = self.parent[0].shape.stripnone_prod
self.computational_cost = 4 * n * self.n_f
class LRN(Node):
"""
LRN (Local Response Normalization) layer.
Parameters
----------
parent: Node
The input node.
filter_shape: tuple
mode: str
Can be "spatial" or "channel".
alpha: float
k: float
beta: float
name: str
Node name.
print_repr: bool
Whether to print the node representation upon initialisation.
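Examples
--------
A minimal sketch (the ``neuromancer.Input`` factory, the feature count and
the window size 5 are illustrative assumptions):

>>> from elektronn2 import neuromancer
>>> inp = neuromancer.Input((1, 8, 20, 64, 64), 'b,f,z,x,y', name='raw')
>>> lrn = LRN(inp, 5, mode='channel')  # normalise over 5 neighbouring feature channels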
"""
def __init__(self, parent, filter_shape, mode='spatial', alpha=0.0001,
k=1, beta=0.75, name="LRN", print_repr=True):
super(LRN, self).__init__(parent, name, print_repr)
self.mode = mode
self.filter_shape = filter_shape
self.axis = parent.shape.tag2index('f') # retrieve feature shape's index
if mode=='spatial':
self.axis_order = None
self.spatial_axes = self.parent.shape.spatial_axes
conv_dim = len(self.spatial_axes)
x_dim = len(self.parent.shape)
if len(self.spatial_axes)!=len(filter_shape):
raise ValueError("The filter_shape dimensionality (%i) and the number "
"of spatial dimensions in the input (%i)differ! "
"Use filter size 1 on axes which should not be "
"averaged."\
%(len(filter_shape), conv_dim, ))
n_in = parent.shape['f'] #retrieve feature shape
fail = False
if conv_dim==1:
if x_dim!=3 or self.spatial_axes!=[2]:
fail = True
w_sh = [n_in, n_in] + list(filter_shape)
elif conv_dim==2:
if x_dim!=4 or self.spatial_axes!=[2,3]:
fail = True
w_sh = [n_in, n_in] + list(filter_shape)
elif conv_dim==3:
if x_dim!=5:
fail = True
if self.spatial_axes==[2,3,4]:
self.axis_order = 'dnn'
w_sh = [n_in, n_in] + list(filter_shape)
elif self.spatial_axes==[1,3,4]:
self.axis_order = 'theano'
w_sh = [n_in, filter_shape[0], n_in] + list(filter_shape[1:])
else:
fail = True
if fail:
raise NotImplementedError("Cannot convolve non-standard shapes / axis orders. "
"Implement reshaping before conv"
"and re-reshaping afer!")
self.conv_dim = conv_dim
self.w_sh = w_sh
value = np.zeros(w_sh, dtype=floatX)
val = 1.0 / np.prod(filter_shape)
for i in range(n_in):
value[i,i] = val
self.average_filter = ConstantParam(value, '<%s_filter%s>'%(self.name, tuple(w_sh)))
self.params['average_filter'] = self.average_filter
elif mode=='channel':
assert isinstance(filter_shape, int)
assert filter_shape%2==1
else:
raise ValueError("Unknow mode %s"%mode)
self.alpha = VariableParam(value=alpha, name="alpha",
dtype=floatX, apply_train=False)
self.beta = VariableParam(value=beta,name="beta",
dtype=floatX, apply_train=False)
self.k = VariableParam(value=k,name="k",
dtype=floatX, apply_train=False)
self.params['alpha'] = self.alpha
self.params['beta'] = self.beta
self.params['k'] = self.k
def _make_output(self):
"""
Computation of Theano output.
"""
input_tensor = self.parent.output
input_shape = list(self.parent.shape)
if self.mode=='spatial':
mean_square = computations.conv(T.square(input_tensor), self.average_filter,
self.axis_order, border_mode='same',
x_shape=input_shape, w_shape=self.w_sh)
else:
n_f = input_shape[self.axis]
in_square = T.square(input_tensor)
half_n = self.filter_shape // 2
new_sh = list(input_tensor.shape)
new_sh[self.axis] += 2 * half_n
in_square_ext = T.zeros(new_sh, floatX)
slicer = [slice(None)] * input_tensor.ndim
slicer[self.axis] = slice(half_n,half_n+n_f)
in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square)
# pad left
slicer[self.axis] = slice(0, half_n)
pad_slicer = [slice(None)] * input_tensor.ndim
pad_slicer[self.axis] = slice(0, 1)
in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square[pad_slicer])
# pad right
slicer[self.axis] = slice(half_n+n_f, 2*half_n+n_f)
pad_slicer[self.axis] = slice(n_f-1,n_f)
in_square_ext = T.set_subtensor(in_square_ext[slicer], in_square[pad_slicer])
mean_square = 0
for i in range(self.filter_shape):
slicer[self.axis] = slice(i,i+n_f)
mean_square += in_square_ext[slicer]
mean_square /= self.filter_shape
divisor = T.power(self.k + self.alpha * mean_square, self.beta)
self.output = input_tensor / divisor
self._debug_outputs = [mean_square, divisor]