# -*- coding: utf-8 -*-
# ELEKTRONN2 Toolkit
# Copyright (c) 2015 Marius Killinger
# All rights reserved
from __future__ import absolute_import, division, print_function
from builtins import filter, hex, input, int, map, next, oct, pow, range, super, zip
__all__ = ['AgentData', 'BatchCreatorImage', 'GridData']
import gc
import logging
import os
import sys
import time
import getpass
from . import non_geometric_augmentation as nga
try:
from importlib import reload
except ImportError:
pass # reload() is built-in in Python 2
import h5py
import numpy as np
import tqdm
from . import transformations
from .image import make_affinities, make_nhood_targets
from .. import utils
from ..config import config
logger = logging.getLogger('elektronn2log')
inspection_logger = logging.getLogger('elektronn2log-inspection')
user_name = getpass.getuser()
###############################################################################
def greyAugment(d, channels, rng):
    """
    Performs grey value (histogram) augmentations on ``d``. This is only
    applied to ``channels`` (list of channels indices), ``rng`` is a random
    number generator
    """
    if channels == []:
        return d
    n_ch = len(channels)
    # Work on a copy: the incoming array is typically a view into the
    # original data cube and must not be modified.
    out = d.copy()
    # Draw the three per-channel factors in the same order as before:
    # contrast scale, additive shift (shadows/lights), then gamma.
    contrast = 1 + (rng.rand(n_ch) - 0.5) * 0.3
    shift = (rng.rand(n_ch) - 0.5) * 0.3
    gamma = 2.0 ** (rng.rand(n_ch) * 2 - 1)  # log-uniform in [0.5, 2]
    sel = out[channels] * contrast[:, None, None] + shift[:, None, None]
    sel = np.clip(sel, 0, 1)
    out[channels] = sel ** gamma[:, None, None]
    return out
def border_treatment(data_list, ps, border_mode, ndim):
    """
    Bring every array in ``data_list`` to the patch size ``ps`` along its
    trailing spatial axes. ``border_mode`` selects the strategy:
    'keep' (unchanged), 'crop' (central crop), 'mirror' (symmetric pad)
    or '0-pad' (constant zero pad). The leading channel axis (and, for
    ``ndim == 2``, the axis after it) is never touched.
    """
    def _treat(arr):
        if border_mode == 'keep':
            return arr
        spatial = arr.shape[1:]  # (z,y,x)/(x,y)
        if border_mode == 'crop':
            # Centred crop: drop half the excess on each side.
            off = [int((have - want) // 2) for have, want in zip(spatial, ps)]
            if ndim == 3:
                return arr[:,
                           off[0]:off[0] + ps[0],
                           off[1]:off[1] + ps[1],
                           off[2]:off[2] + ps[2]]
            elif ndim == 2:
                return arr[:, :,
                           off[0]:off[0] + ps[0],
                           off[1]:off[1] + ps[1]]
            return arr
        # Padding modes: distribute the deficit, extra voxel goes left.
        grow_l = [int(np.ceil(float(want - have) / 2))
                  for want, have in zip(ps, spatial)]
        grow_r = [int(np.floor(float(want - have) / 2))
                  for want, have in zip(ps, spatial)]
        if ndim == 3:
            pad = [(0, 0),
                   (grow_l[0], grow_r[0]),
                   (grow_l[1], grow_r[1]),
                   (grow_l[2], grow_r[2])]
        else:
            pad = [(0, 0),
                   (0, 0),
                   (grow_l[0], grow_r[0]),
                   (grow_l[1], grow_r[1])]
        if border_mode == 'mirror':
            arr = np.pad(arr, pad, mode='symmetric')
        if border_mode == '0-pad':
            arr = np.pad(arr, pad, mode='constant', constant_values=0)
        return arr

    return [_treat(arr) for arr in data_list]
class BatchCreatorImage(object):
    """
    Creates (optionally warped and augmented) training/validation batches
    from image data cubes and corresponding label cubes that are loaded
    from HDF5 files (see :py:meth:`load_data` / :py:meth:`read_files`).
    Geometric information (patch size, strides, offsets) is inferred from
    the given input/target graph nodes.
    """
    def __init__(self, input_node, target_node=None, d_path=None, l_path=None,
                 d_files=None, l_files=None, cube_prios=None, valid_cubes=None,
                 border_mode='crop', aniso_factor=2, target_vec_ix=None,
                 target_discrete_ix=None, h5stream=False, zxy=True):
        assert (d_path and l_path and d_files and l_files)
        if len(d_files) != len(l_files):
            raise ValueError(
                "d_files and l_files must be lists of same length!")
        d_path = os.path.expanduser(d_path)
        l_path = os.path.expanduser(l_path)
        self.zxy = zxy
        self.h5stream = h5stream
        self.d_path = d_path
        self.l_path = l_path
        self.d_files = d_files
        self.l_files = l_files
        self.cube_prios = cube_prios
        self.valid_cubes = valid_cubes if valid_cubes is not None else []
        self.aniso_factor = aniso_factor
        self.border_mode = border_mode
        self.target_vec_ix = target_vec_ix
        self.target_discrete_ix = target_discrete_ix
        # Infer geometric info from input/target shapes.
        # NOTE: ``np.int`` was removed in NumPy >= 1.24; it was an alias for
        # the builtin ``int``, so using ``int`` here is behaviour-identical.
        self.ndim = input_node.shape.ndim
        self.patch_size = np.array(input_node.shape.spatial_shape, dtype=int)
        self.strides = np.array(target_node.shape.strides, dtype=int)
        self.offsets = np.array(target_node.shape.offsets, dtype=int)
        self.target_ps = self.patch_size - self.offsets * 2
        self.t_dtype = target_node.output.dtype
        if input_node.shape.ndim == target_node.shape.ndim:
            self.mode = 'img-img'
        else:
            if target_node.shape.ndim > 1:
                # TODO: would it work to map 'img-vect'?
                raise ValueError("Target ndim must equal input ndim "
                                 "('img-img') or be 1 ('img-scalar').")
            self.mode = 'img-scalar'
        # The following will be inferred when reading data
        self.n_labelled_pixel = 0
        self.n_f = None    # number of channels/feature in input
        self.t_n_f = None  # the shape of the returned label batch at index 1
        # Actual data fields
        self.valid_d = []
        self.valid_l = []
        self.valid_extra = []
        self.train_d = []
        self.train_l = []
        self.train_extra = []
        # Setup internal stuff: seed from the fractional part of scaled time,
        # folded into the uint32 range.
        self.rng = np.random.RandomState(
            np.uint32((time.time() * 0.0001 -
                       int(time.time() * 0.0001)) * 4294967295))
        self.pid = os.getpid()
        self.gc_count = 1
        self._sampling_weight = None
        self._training_count = None
        self._valid_count = None
        self.n_successful_warp = 0
        self.n_failed_warp = 0
        self.load_data()
        logger.info('{}\n'.format(self.__repr__()))
def __repr__(self):
s = "{0:,d}-target Data Set with {1:,d} input channel(s):\n" + \
"#train cubes: {2:,d} and #valid cubes: {3:,d}, {4:,d} labelled " + \
"pixels."
s = s.format(self.t_n_f, self.n_f, self._training_count,
self._valid_count, self.n_labelled_pixel)
return s
@property
def warp_stats(self):
return "Warp stats: successful: %i, failed %i, quota: %.1f" % (
self.n_successful_warp, self.n_failed_warp,
float(self.n_successful_warp) / (
self.n_failed_warp + self.n_successful_warp))
def _reseed(self):
"""Reseeds the rng if the process ID has changed!"""
current_pid = os.getpid()
if current_pid != self.pid:
self.pid = current_pid
self.rng.seed(np.uint32((time.time() * 0.0001 -
int(
time.time() * 0.0001)) * 4294967295 + self.pid))
reload(
transformations) # needed for numba GIL released stuff to work
logger.debug(
"Reseeding RNG in Process with PID: {}".format(self.pid))
def _allocbatch(self, batch_size):
images = np.zeros((batch_size, self.n_f,) + tuple(self.patch_size),
dtype='float32')
sh = self.patch_size - self.offsets * 2
target = np.zeros((batch_size, self.t_n_f) + tuple(sh),
dtype=self.t_dtype)
return images, target
    def getbatch(self, batch_size=1, source='train',
                 grey_augment_channels=None, warp=False, warp_args=None,
                 ignore_thresh=False, force_dense=False,
                 affinities=False, nhood_targets=False, ret_ll_mask=False,
                 nga_blur_noise_probability=False,
                 blurry_blobs_probability=False,
                 blurry_blobs_args=None,
                 noisy_random_erasing_probability=0,
                 noisy_random_erasing_args=None,
                 uniform_random_erasing_probability=0,
                 uniform_random_erasing_args=None):
        """
        Prepares a batch by randomly sampling, shifting and augmenting
        patches from the data

        Parameters
        ----------
        batch_size: int
            Number of examples in batch (for CNNs often just 1)
        source: str
            Data set to draw data from: 'train'/'valid'
        grey_augment_channels: list
            List of channel indices to apply grey-value augmentation to
        warp: bool or float
            Whether warping/distortion augmentations are applied to examples
            (slow --> use multiprocessing). If this is a float number,
            warping is applied to this fraction of examples e.g. 0.5 --> every
            other example.
        warp_args: dict
            Additional keyword arguments that get passed through to
            elektronn2.data.transformations.get_warped_slice()
        ignore_thresh: float
            If the fraction of negative targets in an example patch exceeds this
            threshold, this example is discarded (Negative targets are ignored
            for training [but could be used for unsupervised target propagation]).
        force_dense: bool
            If True the targets are *not* sub-sampled according to the CNN output\
            strides. Dense targets requires MFP in the CNN!
        affinities
        nhood_targets
        ret_ll_mask: bool
            If True additional information for each batch example is returned.
            Currently implemented are two ll_mask arrays to indicate the targeting mode.
            The first dimension of those arrays is the batch_size!
        nga_blur_noise_probability: bool or float
            Probability of applying a Gaussian filter and noise to the input data.
            The value must be a bool or be within the range [0.0, 1.0]
            Default: False (disabled)
        blurry_blobs_probability: bool or float
            Probability of augmenting the input data with blurry "blobs".
            "Blobs" mean random cubes across the input data. The region
            within a cube is blurred with Gaussian smoothing. The level of smoothing
            and the position of each cube are random.
            The value must be a bool or be within the range [0.0, 1.0]
            Default: False (disabled)
        blurry_blobs_args: dict
            Additional keyword arguments that get passed through to
            elektronn2.data.non_geometric_augmentation.blurry_blobs().
            Effective if setting blurry_blobs_probability > 0.
        noisy_random_erasing_probability: float
            Probability of augmenting the input data with noise blobs. The blob
            region is filled with random noise values between 0 and 255.
        noisy_random_erasing_args: dict
            Additional keyword arguments that get passed through to
            elektronn2.data.non_geometric_augmentation.noisy_random_erasing().
            Effective if setting noisy_random_erasing_probability > 0.
        uniform_random_erasing_probability: float
            Probability of augmenting the input data with uniform blobs.
            The blob region is filled with one value.
        uniform_random_erasing_args: dict
            Additional keyword arguments that get passed through to
            elektronn2.data.non_geometric_augmentation.uniform_random_erasing().
            Effective if uniform_random_erasing_probability > 0.

        Returns
        -------
        data: np.ndarray
            [bs, ch, x, y] or [bs, ch, z, y, x] for 2D and 3D CNNS
        target: np.ndarray
            [bs, ch, x, y] or [bs, ch, z, y, x]
        ll_mask1: np.ndarray
            (optional) [bs, n_target]
        ll_mask2: np.ndarray
            (optional) [bs, n_target]
        """
        # This is especially required for multiprocessing
        if grey_augment_channels is None:
            grey_augment_channels = []
        self._reseed()
        images, target = self._allocbatch(batch_size)
        ll_masks = []
        patch_count = 0
        while patch_count < batch_size:  # Loop to fill up batch with examples
            d, t, ll_mask = self._getcube(source)  # get cube randomly
            try:
                # In 'img-scalar' mode there is no target image to warp.
                if self.mode == 'img-img':
                    d, t = self.warp_cut(d, t, warp, warp_args)
                else:
                    d, _ = self.warp_cut(d, None, warp, warp_args)
                self.n_successful_warp += 1
            except transformations.WarpingOOBError:
                # Sampled patch fell (partially) outside the cube:
                # count the failure and retry with a new random position.
                self.n_failed_warp += 1
                continue
            # Check only if a ignore_thresh is set and the cube is labelled
            if (ignore_thresh is not False) and (not np.any(ll_mask[1])):
                if (t < 0).mean() > ignore_thresh:
                    continue  # do not use cubes which have no information
            if source == "train":  # no grey/non-geometric augmentation for testing
                d = greyAugment(d, grey_augment_channels, self.rng)
                # NOTE(review): the augmentations below draw from the global
                # np.random instead of self.rng -- presumably intentional,
                # but it bypasses the per-process reseeding; verify.
                if nga_blur_noise_probability:
                    random_value = np.random.rand()
                    if random_value < nga_blur_noise_probability:
                        # NOTE(review): with data_overwrite=False the return
                        # value of blur_augment is discarded, so this call
                        # appears to have no effect -- confirm intent.
                        nga.blur_augment(d, level=1, data_overwrite=False)
                        nga.noise_augment(d, level=0.15, data_overwrite=True)
                if blurry_blobs_probability > 0\
                        and np.random.rand() < blurry_blobs_probability:
                    d = nga.blurry_blobs(data=d, **blurry_blobs_args)
                if uniform_random_erasing_probability > 0\
                        and np.random.rand() < uniform_random_erasing_probability:
                    d = nga.uniform_random_erasing(data=d,
                                                   **uniform_random_erasing_args)
                if noisy_random_erasing_probability > 0\
                        and np.random.rand() < noisy_random_erasing_probability:
                    d = nga.noisy_random_erasing(data=d,
                                                 **noisy_random_erasing_args)
            target[patch_count] = t
            images[patch_count] = d
            ll_masks.append(ll_mask)
            patch_count += 1
        # Patch / Nhood target stuff
        if nhood_targets is not None and nhood_targets is not False:
            if isinstance(nhood_targets, np.ndarray):
                nhood = nhood_targets
            else:
                # Default neighbourhood: centre voxel plus its 6 face
                # neighbours.
                # NOTE(review): np.int was removed in NumPy >= 1.24; this
                # needs to become the builtin int.
                nhood = np.array([[0, 0, 0],
                                  [1, 0, 0],
                                  [-1, 0, 0],
                                  [0, 1, 0],
                                  [0, -1, 0],
                                  [0, 0, 1],
                                  [0, 0, -1]], dtype=np.int)
            if target.shape[1] == 1:
                target = make_nhood_targets(target, nhood)
            else:  # Nhood targets for first feature only, keep other targets
                tmp = make_nhood_targets(target[:, 0:1], nhood)
                target = np.concatenate([tmp, target[:, 1:]], axis=1)
        # Final modification of targets: striding and replacing nan
        if not (force_dense or np.all(self.strides == 1)):
            target = self._stridedtargets(target)
        ret = [images, target]  # The "normal" batch
        # MALIS stuff
        if affinities == 'malis':
            # create aff from seg IDs (i.e. shape[1] must be 1)
            aff, seg = make_affinities(target[:, 0])  # [bs, 3, z, y, x]
            seg = seg[:, None]  # add 'class' dimension
            ret = [images, aff, seg]
        elif affinities == 'affinity':
            aff, seg = make_affinities(target)  # [bs, z, y, x, 3]
            ret = [images, aff]
        # Lazy Labels stuff
        if ret_ll_mask:  # ll_mask is now a list(bs) of tuples(2)
            ll_masks = np.atleast_3d(
                np.array(ll_masks, dtype=np.int16))  # (bs, 2, 5)
            ll_mask1 = ll_masks[:, 0]
            ll_mask2 = ll_masks[:, 1]
            ret += [ll_mask1, ll_mask2]
        target[
            np.isnan(target)] = -666  # hack because theano does not support nan
        # Trigger a full garbage collection every 1000 batches to bound
        # memory growth in long-running loader processes.
        self.gc_count += 1
        if self.gc_count % 1000 == 0:
            gc.collect()
        return tuple(ret)
[docs] def warp_cut(self, img, target, warp, warp_params):
"""
(Wraps :py:meth:`elektronn2.data.transformations.get_warped_slice()`)
Cuts a warped slice out of the input and target arrays.
The same random warping transformation is each applied to both input
and target.
Warping is randomly applied with the probability defined by the ``warp``
parameter (see below).
Parameters
----------
img: np.ndarray
Input image
target: np.ndarray
Target image
warp: float or bool
False/True disable/enable warping completely.
If ``warp`` is a float, it is used as the ratio of inputs that
should be warped.
E.g. 0.5 means approx. every second call to this function actually
applies warping to the image-target pair.
warp_params: dict
kwargs that are passed through to
:py:meth:`elektronn2.data.transformations.get_warped_slice()`.
Can be empty.
Returns
-------
d: np.ndarray
(Warped) input image slice
t: np.ndarray
(Warped) target slice
"""
if (warp is True) or (warp == 1): # always warp
do_warp = True
elif (0 < warp < 1): # warp only a fraction of examples
do_warp = True if (self.rng.rand() < warp) else False
else: # never warp
do_warp = False
if not do_warp:
warp_params = dict(warp_params)
warp_params['warp_amount'] = 0
d, t = transformations.get_warped_slice(img, self.patch_size,
aniso_factor=self.aniso_factor,
target=target,
target_ps=self.target_ps,
target_vec_ix=self.target_vec_ix,
target_discrete_ix=self.target_discrete_ix,
rng=self.rng, **warp_params)
return d, t
def _getcube(self, source):
"""
Draw an example cube according to sampling weight on training data,
or randomly on valid data
"""
if source == 'train':
p = self.rng.rand()
i = np.flatnonzero(self._sampling_weight <= p)[-1]
d, t, ll_mask = self.train_d[i], self.train_l[i], self.train_extra[
i]
elif source == "valid":
if len(self.valid_d) == 0:
logger.info(
"Validation Set empty. Disable testing on validation set.")
raise ValueError("No validation set")
i = self.rng.randint(0, len(self.valid_d))
d = self.valid_d[i]
t = self.valid_l[i]
ll_mask = self.valid_extra[i]
else:
raise ValueError("Unknown data source")
return d, t, ll_mask
def _stridedtargets(self, lab):
if self.ndim == 3:
return lab[:, :, ::self.strides[0], ::self.strides[1],
::self.strides[2]]
elif self.ndim == 2:
return lab[:, :, ::self.strides[0], ::self.strides[1]]
[docs] def load_data(self):
"""
Parameters
----------
d_path/l_path: string
Directories to load data from
d_files/l_files: list
List of data/target files in <path> directory (must be in the same order!).
Each list element is a tuple in the form
**(<Name of h5-file>, <Key of h5-dataset>)**
cube_prios: list
(not normalised) list of sampling weights to draw examples from
the respective cubes. If None the cube sizes are taken as priorities.
valid_cubes: list
List of indices for cubes (from the file-lists) to use as validation
data and exclude from training, may be empty list to skip performance
estimation on validation data.
"""
# returns lists of cubes, ll_mask is a tuple per cube
data, target, extras = self.read_files()
if self.mode == 'img-scalar':
data = border_treatment(data, self.patch_size, self.border_mode,
self.ndim)
default_extra = (np.ones(self.t_n_f), np.zeros(self.t_n_f))
extras = [default_extra if x is None else x for x in extras]
prios = []
# Distribute Cubes into training and valid list
for k, (d, t, e) in enumerate(zip(data, target, extras)):
if k in self.valid_cubes:
self.valid_d.append(d)
self.valid_l.append(t)
self.valid_extra.append(e)
else:
self.train_d.append(d)
self.train_l.append(t)
self.train_extra.append(e)
# If no priorities are given: sample proportional to cube size
prios.append(t.size)
if self.cube_prios is None:
prios = np.array(prios, dtype=np.float)
else: # If priorities are given: sample irrespective of cube size
prios = np.array(self.cube_prios, dtype=np.float)
# sample example i if: batch_prob[i] < p
self._sampling_weight = np.hstack((0, np.cumsum(prios / prios.sum())))
self._training_count = len(self.train_d)
self._valid_count = len(self.valid_d)
[docs] def check_files(self):
"""
Check if file paths in the network config are available.
"""
notfound = False
give_neuro_data_hint = False
fullpaths = [os.path.join(self.d_path, f) for f, _ in self.d_files] + \
[os.path.join(self.l_path, f) for f, _ in self.l_files]
for p in fullpaths:
if not os.path.exists(p):
print('{} not found.'.format(p))
notfound = True
if 'neuro_data_zxy' in p:
give_neuro_data_hint = True
if give_neuro_data_hint:
print(
'\nIt looks like you are referencing the neuro_data_zxy dataset.\n'
'To install the neuro_data_xzy dataset to the default location, run:\n'
' $ wget http://elektronn.org/downloads/neuro_data_zxy.zip\n'
' $ unzip neuro_data_zxy.zip -d ~/neuro_data_zxy')
if notfound:
print('\nPlease fetch the necessary dataset and/or '
'change the relevant file paths in the network config.')
sys.stdout.flush()
sys.exit(1)
    def read_files(self):
        """
        Image files on disk are expected to be in order (ch,x,y,z) or (x,y,z)
        But image stacks are returned as (z,ch,x,y) and target as (z,x,y,)
        irrespective of the order in the file. If the image files have no
        channel this dimension is extended to a singleton dimension.
        """
        self.check_files()
        data, target, extras = [], [], []
        pbar = tqdm.tqdm(total=len(self.d_files), ncols=120, leave=False)
        for (d_f, d_key), (l_f, l_key) in zip(self.d_files, self.l_files):
            pbar.write('Loading %s and %s' % (d_f, l_f))
            if self.h5stream:
                # Keep the h5 datasets on disk instead of loading them into
                # RAM; this only works for uncompressed 4d float32 data.
                d = h5py.File(os.path.join(self.d_path, d_f), 'r')[d_key]
                t = h5py.File(os.path.join(self.l_path, l_f), 'r')[l_key]
                assert d.compression == t.compression == None
                assert len(d.shape) == len(t.shape) == 4
                assert d.dtype == np.float32
                assert t.dtype == self.t_dtype
            else:
                d = utils.h5load(os.path.join(self.d_path, d_f), d_key)
                t = utils.h5load(os.path.join(self.l_path, l_f), l_key)
            # Optional lazy-label mask stored alongside the labels; missing
            # key means "no mask" (None placeholder, filled in load_data).
            try:
                ll_mask_1 = utils.h5load(os.path.join(self.l_path, l_f),
                                         'll_mask')
                if not self.zxy:
                    ll_mask_1 = transformations.xyz2zxy(ll_mask_1)
                extras.append(ll_mask_1)
            except KeyError:
                extras.append(None)
            if not self.zxy:
                d = transformations.xyz2zxy(d)
                t = transformations.xyz2zxy(t)
            if self.mode == 'img-scalar':
                assert t.ndim == 1, "Scalar targets must be 1d"
            if len(d.shape) == 4:  # h5 dataset has no ndim
                self.n_f = d.shape[0]
            elif len(d.shape) == 3:  # We have no channels in data
                self.n_f = 1
                d = d[None]  # add (empty) 0-axis
            if len(t.shape) == 3:  # If labels not empty add first axis
                t = t[None]
            # All label cubes must agree on the number of target features.
            if self.t_n_f is None:
                self.t_n_f = t.shape[0]
            else:
                assert self.t_n_f == t.shape[0]
            self.n_labelled_pixel += t[0].size
            # determine normalisation depending on int or float type
            if d.dtype.kind in ('u', 'i'):
                m = 255.
                d = np.ascontiguousarray(d, dtype=np.float32) / m
            # Verify the target dtype can represent the largest label value
            # before casting.
            if (np.dtype(self.t_dtype) is not np.dtype(t.dtype)) and \
                    self.t_dtype not in ['float32']:
                m = t.max()
                M = np.iinfo(self.t_dtype).max
                if m > M:
                    raise ValueError("Loading of data: targets must be cast "
                                     "to %s, but %s cannot store value %g, "
                                     "maximum allowed value: %g. You may try "
                                     "to renumber targets." % (self.t_dtype,
                                                               self.t_dtype, m,
                                                               M))
            if not self.h5stream:
                d = np.ascontiguousarray(d, dtype=np.float32)
                t = np.ascontiguousarray(t, dtype=self.t_dtype)
            pbar.write('Shapes: data %s, targets %s' % (d.shape, t.shape))
            data.append(d)
            target.append(t)
            gc.collect()
            pbar.update()
        return data, target, extras
"""
3d label kinds:
  k:   number of classes
  n_c: number of categories (each of which has k classes)

normal single-category multiclass classification (e.g. bg, syn, mito, ves):
  (x,y,z)        encoded_targets==True,  n_targets==k
  (1,x,y,z)      encoded_targets==True,  n_targets==k
  (k,x,y,z)      encoded_targets==False, n_targets==k

multi-category multiclass classification (e.g. affinities):
  (n_c,  x,y,z)  encoded_targets==True,  n_targets==k
  (n_c*k,x,y,z)  encoded_targets==False, n_targets==k

single-target regression (e.g. image reconstruction of grey img):
  (x,y,z)        encoded_targets indifferent, n_targets==1
  (1,x,y,z)      encoded_targets indifferent, n_targets==1

multi-target regression (e.g. image reconstruction of rgb img):
  (k,x,y,z)      n_targets==k

--> l.shape[1] = ?
"""
##############################################################################################################
class DummySkel(object):
    """
    Minimal stand-in skeleton used as the bookkeeping target for
    'flatfield' (blank) training examples in :py:class:`AgentData`.
    """
    def __init__(self):
        self.grad = np.array([0, 0, 0], dtype=np.float32)
        self.last_slicing_params = None
        self.lost_track = True  # don't train sequentially on this
        self.debug_traces, self.debug_traces_current = [], []
        self.debug_grads, self.debug_grads_current = [], []

    def _grad(self, pred_zxy):
        """Gradient of L = 1/2(x**2 + y**2) w.r.t. the prediction: [0, x, y]."""
        return np.array([0, pred_zxy[1], pred_zxy[2]], dtype=np.float32)

    def get_loss_and_gradient(self, prediction_c, **kwargs):
        """Loss is the summed magnitude of the x,y prediction components."""
        self.debug_traces_current.append(None)
        self.debug_grads_current.append(None)
        magnitude = abs(prediction_c[1:]).sum()  # sum of x,y magnitude
        loss = np.array([magnitude], dtype=np.float32)
        return loss, self._grad(prediction_c)
class AgentData(BatchCreatorImage):
    """
    Load raw_cube, vec_prob_obj_cube and skelfiles + rel.offset
    """
    def __init__(self, input_node, side_target_node, path_prefix=None,
                 raw_files=None, skel_files=None, vec_files=None,
                 valid_skels=None, target_vec_ix=None, target_discrete_ix=None,
                 abs_offset=None, aniso_factor=2):
        assert (path_prefix and raw_files and skel_files and vec_files
                and abs_offset)
        # NOTE: ``np.int`` was removed in NumPy >= 1.24; it was an alias for
        # the builtin ``int``, so behaviour is unchanged.
        abs_offset = np.array(abs_offset, dtype=int)
        path_prefix = os.path.expanduser(path_prefix)
        self.path_prefix = path_prefix
        self.raw_files = raw_files
        self.skel_files = skel_files
        self.vec_files = vec_files
        self.valid_skels = valid_skels if valid_skels is not None else []
        self.abs_offset = abs_offset
        self.aniso_factor = aniso_factor
        self.target_vec_ix = target_vec_ix
        self.target_discrete_ix = target_discrete_ix
        # Infer geometric info from input/target shapes
        self.ndim = input_node.shape.ndim
        self.patch_size = np.array(input_node.shape.spatial_shape, dtype=int)
        self.strides = np.array(side_target_node.shape.strides, dtype=int)
        self.offsets = np.array(side_target_node.shape.offsets, dtype=int)
        self.target_ps = self.patch_size - self.offsets * 2
        self.n_f = None    # number of channels/feature in input
        # will be inferred when reading data
        self.t_n_f = None  # the shape of the returned label batch at index 1
        self.t_dtype = side_target_node.output.dtype
        # Need to infer this when reading labels and with n_target/encoded_targets
        # Setup internal stuff
        self.rng = np.random.RandomState(np.uint32((time.time() * 0.0001 -
                                                    int(time.time() * 0.0001))
                                                   * 4294967295))
        self.pid = os.getpid()
        self.gc_count = 1
        self.n_failed_warp = 0
        self.n_successful_warp = 0
        self._sampling_weight = None
        self._training_count = None
        self._valid_count = None
        # Actual data fields
        self.train_d = []
        self.train_l = []
        self.valid_s = []
        self.train_s = []
        assert len(raw_files) == len(skel_files) == len(vec_files) == 1, \
            "not supported atm, skeletons " \
            "from different cubes are concated, " \
            "only one abs offset"
        self.load_data()
        logger.info('{}\n'.format(self.__repr__()))
def __repr__(self):
s = "Data Set with {0:,d} input channel(s):\n"+\
"#train skels: {1:,d} and #valid skels: {2:,d}"
s = s.format(self.n_f, len(self.train_s),
len(self.valid_s))
return s
def _allocbatch(self, batch_size):
images, target = super(AgentData, self)._allocbatch(batch_size)
skel_i = np.zeros(batch_size, dtype='int16')
slicing_params = np.zeros([batch_size,20], dtype='float32')
return images, target, skel_i, slicing_params
[docs] def getbatch(self, batch_size=1, source='train', aniso=True,
z_shift=0, gamma=0,
grey_augment_channels=None,
r_max_scale=0.9,
tracing_dir_prior_c=0.5,
force_dense=False, flatfield_p=1e-3):
# This is especially required for multiprocessing
if grey_augment_channels is None:
grey_augment_channels = []
self._reseed()
images, target, skel_i, slicing_params = self._allocbatch(batch_size)
patch_count = 0
while patch_count < batch_size: # Loop to fill up batch with examples
if (self.rng.rand() > flatfield_p) or source!='train':
skel, i = self.getskel(source) # get skel randomly
position_s = skel.sample_tube_point(self.rng, r_max_scale=r_max_scale)
local_direc_is = skel.sample_local_direction_iso(position_s, n_neighbors=6)
tracing_direc_is = skel.sample_tracing_direction_iso(self.rng, local_direc_is, c=tracing_dir_prior_c)
position_l = position_s[::-1] # from lab to data frame (xyz) -> (zxy)
position_d = position_l - self.abs_offset
tracing_direc_il = tracing_direc_is[::-1] # from lab to data frame (xyz) -> (zxy)
try:
if config.inspection > 2: # mark position by opaque cube
self.train_d[0][:,
position_d[0] - 1:position_d[0] + 2,
position_d[1] - 2:position_d[1] + 3,
position_d[2] - 2:position_d[2] + 3] = 1.0
image, vec, M = transformations.get_tracing_slice(
self.train_d[0], self.patch_size, position_d, z_shift=z_shift,
aniso_factor=self.aniso_factor, sample_aniso=aniso,
gamma=gamma, direction_iso=tracing_direc_il,
target=self.train_l[0], target_ps=self.target_ps,
target_vec_ix=self.target_vec_ix,
target_discrete_ix=self.target_discrete_ix, rng=self.rng)
self.n_successful_warp += 1
trafo = transformations.Transform(M, position_l=position_l,
aniso_factor=self.aniso_factor)
except transformations.WarpingOOBError:
self.n_failed_warp += 1
continue
if source == "train": # no grey augmentation for testing
image = greyAugment(image, grey_augment_channels, self.rng)
else:
vec_sh = (self.train_l[0].shape[0],)+tuple(self.target_ps)
vec = np.ones(vec_sh, dtype=self.t_dtype) * np.nan
img_sh = (self.train_d[0].shape[0],)+tuple(self.patch_size)
image = np.zeros(img_sh, dtype=np.float32)
i = len(self.train_s) - 1
M = np.eye(4)
position_l = np.zeros(3, np.int)
position_d = position_l
local_direc_is = position_l
tracing_direc_il = position_l
trafo = transformations.Transform(M, position_l=position_l,
aniso_factor=self.aniso_factor)
target[patch_count] = vec
images[patch_count] = image
skel_i[patch_count] = i
slicing_params[patch_count] = trafo.to_array()
patch_count += 1
target[np.isnan(target)] = -666 # Stupid hack because theano does not support nan
if not (force_dense or np.all(self.strides==1)):
target = self._stridedtargets(target)
ret = [images, target, skel_i, slicing_params]
if config.inspection > 2:
debug_img = self.train_d[0][:,
position_d[0] - 11:position_d[0] + 12,
position_d[1] - 21:position_d[1] + 22,
position_d[2] - 21:position_d[2] + 22]
if i == len(self.train_s) - 1:
i = "DummySkel"
local_direc_il = local_direc_is[::-1]
inspection_logger.info("skeleton: %s" % i)
inspection_logger.info("pos_d: %s, pos_l: %s, ldir_il: %s, tdir_il: %s"
% (position_d.tolist(),
position_l.tolist(),
local_direc_il.tolist(),
tracing_direc_il.tolist()))
inspection_logger.info("params: %s" % (trafo.to_array().tolist()))
ret = [images, target, skel_i, slicing_params, debug_img]
self.gc_count += 1
if self.gc_count%1000==0:
gc.collect()
return tuple(ret)
    def get_newslice(self, position_l, direction_il, batch_size=1, source='train',
                     aniso=True, z_shift=0, gamma=0, grey_augment_channels=None,
                     r_max_scale=0.9, tracing_dir_prior_c=0.5, force_dense=False,
                     flatfield_p=1e-3, scale=1.0, last_ch_max_interp=False):
        """
        Cut a single tracing slice at an explicitly given position and
        direction (unlike :py:meth:`getbatch`, which samples both from
        skeletons). Only ``batch_size == 1`` and ``source == 'train'`` are
        supported; returns (images, target, trafo) plus a debug crop when
        ``config.inspection > 2``.
        """
        assert batch_size == 1
        if grey_augment_channels is None:
            grey_augment_channels = []
        if source != 'train':
            raise ValueError
        images, target, skel_i, slicing_params = self._allocbatch(batch_size)
        position_d = position_l - self.abs_offset
        if config.inspection > 2:  # mark position by opaque cube
            self.train_d[0][:,
                            position_d[0] - 2:position_d[0] + 3,
                            position_d[1] - 4:position_d[1] + 5,
                            position_d[2] - 4:position_d[2] + 5] = 1.0
        image, vec, M = transformations.get_tracing_slice(
            self.train_d[0], self.patch_size, position_d,
            z_shift=z_shift, aniso_factor=self.aniso_factor, sample_aniso=aniso,
            gamma=gamma, scale_factor=scale, direction_iso=direction_il,
            target=self.train_l[0], target_ps=self.target_ps,
            target_vec_ix=self.target_vec_ix,
            target_discrete_ix=self.target_discrete_ix, rng=self.rng,
            last_ch_max_interp=last_ch_max_interp)
        trafo = transformations.Transform(M, position_l=position_l,
                                          aniso_factor=self.aniso_factor)
        image = greyAugment(image, grey_augment_channels, self.rng)
        if vec is not None:  # allow for using this with no vec data
            target[0] = vec
        images[0] = image
        # Stupid hack because theano does not support nan
        target[np.isnan(target)] = -666
        if not (force_dense or np.all(self.strides == 1)):
            target = self._stridedtargets(target)
        ret = [images, target, trafo]
        if config.inspection > 2:
            # Debug output: crop around the position, plus a reference slice
            # at scale 1.0, both dumped to /tmp for visual comparison.
            debug_img = self.train_d[0][:,
                                        position_d[0] - 11:position_d[0] + 12,
                                        position_d[1] - 21:position_d[1] + 22,
                                        position_d[2] - 21:position_d[2] + 22]
            image_ref, _, _ = transformations.get_tracing_slice(
                self.train_d[0], self.patch_size, position_d,
                z_shift=z_shift, aniso_factor=self.aniso_factor,
                sample_aniso=aniso,
                gamma=gamma, scale_factor=1.0, direction_iso=direction_il,
                target=self.train_l[0], target_ps=self.target_ps,
                target_vec_ix=self.target_vec_ix,
                target_discrete_ix=self.target_discrete_ix, rng=self.rng)
            # NOTE(review): uses the global np.random instead of self.rng --
            # presumably fine for a debug file name, but verify.
            i = np.random.randint(0, 1014)
            utils.h5save(image, '/tmp/%s_img%i-%.2f.h5' % (user_name, i, scale))
            utils.h5save(image_ref, '/tmp/%s_img%i-ref.h5' % (user_name, i))
            ret = [images, target, trafo, debug_img]
        return tuple(ret)
[docs] def getskel(self, source):
"""
Draw an example skeleton according to sampling weight on training data,
or randomly on valid data
"""
if source=='train':
p = self.rng.rand()
i = np.flatnonzero(self._sampling_weight <= p)[-1]
skel = self.train_s[i]
elif source == "valid":
if len(self.valid_s) == 0:
logger.info("Validation Set empty. Disable testing on validation set.")
return []
i = self.rng.randint(0, len(self.valid_s))
skel = self.valid_s[i]
else:
raise ValueError("Unknown data source")
return skel, i
[docs] def load_data(self):
"""
Parameters
----------
d_path/l_path: string
Directories to load data from
d_files/l_files: list
List of data/target files in <path> directory (must be in the same order!).
Each list element is a tuple in the form
**(<Name of h5-file>, <Key of h5-dataset>)**
cube_prios: list
(not normalised) list of sampling weights to draw examples from
the respective cubes. If None the cube sizes are taken as priorities.
valid_cubes: list
List of indices for cubes (from the file-lists) to use as validation
data and exclude from training, may be empty list to skip performance
estimation on validation data.
"""
# returns lists of cubes, ll_mask is a tuple per cube
data, vec, skeletons = self.read_files()
self.train_d = data
self.train_l = vec
prios = []
# Distribute Cubes into training and valid list
for skel_list, valid_skels in zip(skeletons, self.valid_skels):
for skel_i, skel in enumerate(skel_list):
if skel_i in valid_skels:
self.valid_s.append(skel)
else:
self.train_s.append(skel)
# If no priorities are given: sample proportional to cube size
prios.append(len(skel.all_nodes))
prios = np.array(prios, dtype=np.float)
# sample example i if: batch_prob[i] < p
self._sampling_weight = np.hstack((0, np.cumsum(prios / prios.sum())))
self.train_s.append(DummySkel())
[docs] def read_files(self):
"""
Image files on disk are expected to be in order (ch,x,y,z) or (x,y,z)
But image stacks are returned as (z,ch,x,y) and target as (z,x,y,)
irrespective of the order in the file. If the image files have no
channel this dimension is extended to a singleton dimension.
"""
logger.info("Loading Data")
data, vec, skeletons = [], [], []
for (d_f, d_key), (l_f, l_key), s_f in zip(self.raw_files, self.vec_files, self.skel_files):
d = utils.h5load(os.path.join(self.path_prefix, d_f), d_key)
if d.ndim == 4:
self.n_f = d.shape[0]
elif len(d.shape) == 3: # We have no channels in data
self.n_f = 1
d = d[None] # add (empty) 0-axis
# determine normalisation depending on int or float type
if d.dtype in [np.int, np.int8, np.int16, np.int32, np.uint32,
np.uint, np.uint8, np.uint16, np.uint32, np.uint32]:
m = 255
else:
m = 1
d = np.ascontiguousarray(d, dtype=np.float32) / m
try:
t = utils.h5load(os.path.join(self.path_prefix, l_f), l_key)
if len(t.shape) == 3: # If labels not empty add first axis
t = t[None]
assert t.ndim==4
self.t_n_f = t.shape[0]
if (self.t_dtype is not t.dtype) and self.t_dtype not in ['float32']:
m = t.max()
M = np.iinfo(self.t_dtype).max
if m > M:
raise ValueError("Loading of data: targets must be cast "
"to %s, but %s cannot store value %g, "
"maximum allowed value: %g. You may try "
"to renumber targets." %(self.t_dtype,
self.t_dtype, m, M))
t = np.ascontiguousarray(t, dtype=self.t_dtype)
except:
t = None
self.t_n_f = 9
gc.collect()
data.append(d)
vec.append(t)
s = utils.pickleload(os.path.join(self.path_prefix, s_f))
skeletons.append(s)
for skel in s: # backward fixes of attributes that are not pickled
skel.lost_track = False
skel.position_s = None
skel.position_l = None
skel.direction_il = None
skel.start_new_training = True
skel.prev_batch = None
skel.trafo = None
skel.training_traces = []
skel.current_trace = None
skel.linked_data = self
skel._hull_point_bg = dict()
skel.background_processes = config.background_processes
skel.cnn_grid = None
return data, vec, skeletons
class GridData(AgentData):
    """
    :py:class:`AgentData` variant whose batches are (image, target_grid)
    pairs drawn from randomly sampled training skeletons.
    """
    def __init__(self, *args, **kwargs):
        super(GridData, self).__init__(*args, **kwargs)

    def getbatch(self, **get_batch_kwargs):
        """
        Return one (img, target_grid) training example; out-of-bounds
        warps are retried by recursing until a valid example is found.
        """
        self._reseed()
        try:
            skel_example, skel_index = self.getskel('train')
            skel_example.start_new_training = True
            batch = skel_example.getbatch(0, 0.0, **get_batch_kwargs)
            img, target_img, target_grid, target_node = batch
            target_grid = target_grid[None]  # add batch axis
            self.last_skel = skel_example  # DBG
            return img, target_grid
        except transformations.WarpingOOBError:
            return self.getbatch(**get_batch_kwargs)