Source code for elektronn.training.traindata

# -*- coding: utf-8 -*-
# ELEKTRONN - Neural Network Toolkit
#
# Copyright (c) 2014 - now
# Max-Planck-Institute for Medical Research, Heidelberg, Germany
# Authors: Marius Killinger, Gregor Urban

import numpy as np

import urllib2
import cPickle
import os, time, re

try:
    from sklearn import cross_validation
    sklearn_avail = True
except ImportError:
    sklearn_avail = False

import warping
import trainutils as ut


def sort_human(file_names):
    """Sort the given list in the way that humans expect."""
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    file_names.sort(key=alphanum_key)
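# A minimal usage sketch (hypothetical file names): natural sorting keeps
# 'img2' before 'img10', unlike plain lexicographic sorting.
#   >>> names = ['img10.png', 'img2.png', 'img1.png']
#   >>> sort_human(names)
#   >>> names
#   ['img1.png', 'img2.png', 'img10.png']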
class Data(object):
    """Load and prepare data; base class for all data sets."""

    def __init__(self, n_lab=None):
        self._pos = 0
        if isinstance(self.train_d, np.ndarray):
            self._training_count = self.train_d.shape[0]
            if n_lab is None:
                self.n_lab = np.unique(self.train_l).size
            else:
                self.n_lab = n_lab
        elif isinstance(self.train_d, list):
            self._training_count = len(self.train_d)
            if n_lab is None:
                unique = [np.unique(l) for l in self.train_l]
                self.n_lab = np.unique(np.hstack(unique)).size
            else:
                self.n_lab = n_lab

        self.example_shape = self.train_d[0].shape
        self.n_ch = self.example_shape[0]
        self.rng = np.random.RandomState(
            np.uint32((time.time() * 0.0001 - int(time.time() * 0.0001)) * 4294967295))
        self.pid = os.getpid()
        print self.__repr__()
        self._perm = self.rng.permutation(self._training_count)

    def _reseed(self):
        """Reseeds the rng if the process ID has changed!"""
        current_pid = os.getpid()
        if current_pid != self.pid:
            self.pid = current_pid
            self.rng.seed(np.uint32((time.time() * 0.0001 - int(time.time() * 0.0001))
                                    * 4294967295 + self.pid))
            print "Reseeding RNG in process with PID:", self.pid

    def __repr__(self):
        return "%i-class data set: #training examples: %i and #validation examples: %i" \
               % (self.n_lab, self._training_count, len(self.valid_d))
    def getbatch(self, batch_size, source='train'):
        if source == 'train':
            if (self._pos + batch_size) < self._training_count:
                self._pos += batch_size
                slice = self._perm[self._pos - batch_size:self._pos]
            else:  # draw a new permutation
                self._perm = self.rng.permutation(self._training_count)
                self._pos = 0
                slice = self._perm[:batch_size]

            if isinstance(self.train_d, np.ndarray):
                return (self.train_d[slice], self.train_l[slice])
            elif isinstance(self.train_d, list):
                data = np.array([self.train_d[i] for i in slice])
                label = np.array([self.train_l[i] for i in slice])
                return (data, label)
        elif source == 'valid':
            data = self.valid_d[:batch_size]
            label = self.valid_l[:batch_size]
            return (data, label)
        elif source == 'test':
            data = self.test_d[:batch_size]
            label = self.test_l[:batch_size]
            return (data, label)
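    # Usage sketch (assuming a concrete subclass has set up train_d etc.):
    #   >>> d, l = data.getbatch(64)              # random training batch
    #   >>> vd, vl = data.getbatch(100, 'valid')
    # Note: 'valid' and 'test' always return the *first* batch_size
    # examples; only 'train' draws from a shuffled permutation.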
    def createSplitPerm(self, size, subset_ratio=0.8, seed=None):
        rng = np.random.RandomState(np.uint32(time.time()))
        if seed is not None:
            rng.seed(np.uint32(seed))
        perm = rng.permutation(size)
        k = int(size * subset_ratio)
        return perm[:k]
    def createCVSplit(self, data, label, n_folds=3, use_fold=2,
                      shuffle=False, random_state=None):
        if not sklearn_avail:
            raise RuntimeError("Please install sklearn to create CV splits")
        cv = cross_validation.KFold(len(data), n_folds, shuffle=shuffle,
                                    random_state=random_state)
        for fold, (train_i, valid_i) in enumerate(cv):
            if fold == use_fold:
                self.valid_d = data[valid_i]
                self.valid_l = label[valid_i]
                self.train_d = data[train_i]
                self.train_l = label[train_i]
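    # Sketch: with n_folds=3 and use_fold=2, the third KFold fold of the
    # index range becomes the validation set and the rest the training set:
    #   >>> data.createCVSplit(X, y, n_folds=3, use_fold=2)
    #   >>> len(data.valid_d)   # ~ len(X) // 3, up to rounding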
    def splitData(self, data, label, valid_size, split_no=0):
        rng = np.random.RandomState(split_no)
        perm = rng.permutation(len(data))
        self.valid_d = data[perm[:valid_size]]
        self.valid_l = label[perm[:valid_size]]
        self.train_d = data[perm[valid_size:]]
        self.train_l = label[perm[valid_size:]]
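    # Sketch: the split is deterministic in split_no, so the same number
    # always reproduces the same train/validation partition:
    #   >>> data.splitData(X, y, valid_size=1000, split_no=0)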
    def makeExampleSubset(self, subset_ratio=0.8, seed=None):
        ix = self.createSplitPerm(len(self.train_d), subset_ratio, seed)
        self.train_d = self.train_d[ix]
        self.train_l = self.train_l[ix]
        self._training_count = len(self.train_d)
        self._perm = self.rng.permutation(self._training_count)
    def makeFeatureSubset(self, subset_ratio=0.8, seed=None):
        ix = self.createSplitPerm(self.train_d.shape[1], subset_ratio, seed)
        self.train_d = self.train_d[:, ix]
        self.valid_d = self.valid_d[:, ix]
        self.test_d = self.test_d[:, ix]
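    # Sketch: keep a random 80% of the training examples, then a random
    # 80% of the feature columns (matrix-shaped data sets only):
    #   >>> data.makeExampleSubset(subset_ratio=0.8, seed=1)
    #   >>> data.makeFeatureSubset(subset_ratio=0.8, seed=1)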
##############################################################################################################
class BalancedData(Data):
    def __init__(self):
        super(BalancedData, self).__init__()
        self._splitted = False
    def getbatch(self, batch_size, source='train', balanced=False):
        if balanced:
            return self.getbatch_balanced(batch_size)  # only for 'train'
        else:
            return super(BalancedData, self).getbatch(batch_size, source=source)
    def _init_balanced(self):
        self._b_mask = [None] * self.n_lab
        self._b_count = np.empty((self.n_lab,), dtype=np.int)
        self._b_pos = np.zeros((self.n_lab,), dtype=np.int)
        self._b_perm = [None] * self.n_lab
        for k in xrange(self.n_lab):
            mask = np.flatnonzero(self.train_l == k)
            self._b_mask[k] = mask
            self._b_count[k] = len(mask)
            self._b_perm[k] = self.rng.permutation(self._b_count[k])
        self._splitted = True

    def _get_safe_slice(self, perm, pos, batch_size):
        # Returns batch_size indices from perm, wrapping around at the end;
        # a returned pos of 0 signals that perm should be reshuffled.
        if (pos + batch_size) < len(perm):
            pos = pos + batch_size
            return perm[pos - batch_size:pos], pos
        else:
            slice = perm[pos:]
            pos = batch_size - len(slice)
            slice = np.hstack((slice, perm[:pos]))
            return slice, 0  # caller must shuffle
    def getbatch_balanced(self, batch_size):
        if not self._splitted:
            self._init_balanced()
        sub_size = batch_size // self.n_lab  # examples per class
        # Allocate only sub_size * n_lab entries: if batch_size is not
        # divisible by n_lab, the batch is truncated instead of containing
        # uninitialised trailing entries.
        data = np.empty((sub_size * self.n_lab, self.train_d.shape[1]), dtype=np.float32)
        label = np.empty((sub_size * self.n_lab,), dtype=np.int16)
        for k, (mask, perm, pos) in enumerate(zip(self._b_mask, self._b_perm, self._b_pos)):
            slice, pos = self._get_safe_slice(perm, pos, sub_size)
            d = self.train_d[mask[slice]]
            l = self.train_l[mask[slice]]
            self._b_pos[k] = pos
            if pos == 0:  # permutation exhausted, reshuffle this class
                self._b_perm[k] = self.rng.permutation(self._b_count[k])
            data[k * sub_size:(k + 1) * sub_size] = d
            label[k * sub_size:(k + 1) * sub_size] = l
        return data, label
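    # Sketch: for n_lab = 10 and batch_size = 60, each class contributes
    # 60 // 10 = 6 examples; the batch is grouped class-by-class and is
    # not shuffled internally.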
##############################################################################################################
class QueueData(Data):
    def __init__(self):
        super(QueueData, self).__init__()
        ### Queue stuff ###
        self._queue_prio = np.zeros(self._training_count)
        self._queue_last = np.zeros(self._training_count)
        self._queue_ix = np.arange(self._training_count)
        self._queue_count = np.ones(self._training_count)  # initialise to one s.t. 1/count is possible
        self._batch_ix = None
    def queueget(self, n):
        assert self._batch_ix is None, "Update priorities first before requesting a new batch"
        slice = self._queue_ix[:n]  # indices of the n highest-priority elements
        self._batch_ix = slice  # store for the subsequent update
        if isinstance(self.train_d, np.ndarray):
            ret = (self.train_d[slice], self.train_l[slice], self._queue_count[slice])
        elif isinstance(self.train_d, list):
            data = [self.train_d[i] for i in slice]
            label = [self.train_l[i] for i in slice]
            count = [self._queue_count[i] for i in slice]
            ret = (data, label, count)
        self._queue_count[slice] += 1
        return ret
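    # Sketch of the intended protocol: strictly alternate get and update
    # (train_step here is a hypothetical function returning per-example NLLs):
    #   >>> d, l, cnt = q.queueget(64)     # 64 highest-priority examples
    #   >>> nlls = train_step(d, l)
    #   >>> q.queueupdate(nlls, iteration)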
    def queueupdate(self, nlls, iteration):
        assert self._batch_ix is not None, "Request a batch before updating priorities"
        assert self._batch_ix.shape == nlls.shape, "nlls must match the indices of the last batch"
        # High-NLL examples get high priority, damped by how often they were
        # already drawn; subtracting a term in the current iteration makes
        # recently updated examples fall behind ones not seen for a while.
        self._queue_prio[self._batch_ix] = \
            nlls / (1 + 0.017 * self._queue_count[self._batch_ix]) \
            - 0.004 * (iteration - self._queue_last[self._batch_ix])
        self._queue_ix = np.argsort(self._queue_prio)[::-1]  # high priorities first
        self._batch_ix = None
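    # Worked example of the priority formula (illustrative numbers): an
    # example with nll = 2.3, drawn 5 times, last updated at iteration 100,
    # re-scored at iteration 300:
    #   prio = 2.3 / (1 + 0.017 * 5) - 0.004 * (300 - 100)
    #        = 2.3 / 1.085 - 0.8  ~  1.32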
    def queuereset(self):
        self._queue_prio = np.zeros(self._training_count)
        self._queue_last = np.zeros(self._training_count)
        self._queue_ix = np.arange(self._training_count)
        self._queue_count = np.ones(self._training_count)  # initialise to one s.t. 1/count is possible
        self._batch_ix = None
    def _getdata(self):
        return (self.train_d, self.train_l, self.valid_d, self.valid_l)

    # def swapSets(self):
    #     train_d, train_l, valid_d, valid_l = self._getdata()
    #     self.train_d, self.train_l, self.valid_d, self.valid_l = \
    #         valid_d, valid_l, train_d, train_l
    #     self._training_count = self.train_d.shape[0]
    #     self._perm = np.arange(self._training_count)
    #     self._queue_prio = np.zeros(self._training_count)
    #     self._queue_last = np.zeros(self._training_count)
    #     self._queue_ix = np.arange(self._training_count)
    #     self._queue_count = np.ones(self._training_count)  # initialise to one s.t. 1/count is possible
    def save(self, path="data"):
        f = open(path, 'wb')  # binary mode, required for pickle protocol 2
        if isinstance(self.train_d, np.ndarray):
            cPickle.dump(self._getdata(), f, protocol=2)
        elif isinstance(self.train_d, list):
            dat = self.valid_d + self.train_d
            lab = self.valid_l + self.train_l
            for (d, l) in zip(dat, lab):
                cPickle.dump(d, f, protocol=2)
                cPickle.dump(l, f, protocol=2)
        f.close()
### Toy Data Sets ############################################################################################
class AdultData(Data):
    def __init__(self, path='~/devel/data/adult.pkl', create=False):
        path = os.path.expanduser(path)
        if create:
            self._fields = ('age,workclass,fnlwgt,education,educationnum,'
                            'maritalstatus,occupation,relationship,race,sex,'
                            'capitalgain,capitalloss,hoursperweek,'
                            'nativecountry,target').split(',')
            self._kinds = 'cont,cat,cat,cat,cont,cat,cat,cat,cat,cat,cont,cont,cont,cat,cat'.split(',')
            data_socket = urllib2.urlopen('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')
            train_d = np.genfromtxt(data_socket, skip_header=1, delimiter=',',
                                    names=self._fields, dtype=None)
            train_d = self._normalise_adult(train_d)
            self.train_l = train_d[:, -1].astype('int16')
            self.train_d = train_d[:, :-1]
            test_socket = urllib2.urlopen('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test')
            valid_d = np.genfromtxt(test_socket, skip_header=1, delimiter=',',
                                    names=self._fields, dtype=None)
            valid_d = self._normalise_adult(valid_d)
            self.valid_l = valid_d[:, -1].astype('int16')
            self.valid_d = valid_d[:, :-1]
            ut.pickleSave((self.train_d, self.train_l, self.valid_d, self.valid_l), path)
        else:
            self.train_d, self.train_l, self.valid_d, self.valid_l = ut.pickleLoad(path)
        super(AdultData, self).__init__()

    def _normalise_adult(self, data):
        ret = np.zeros((data.size, len(self._fields)), dtype='float32')
        for i, (name, kind) in enumerate(zip(self._fields, self._kinds)):
            if kind == 'cat':  # integer-code categorical values, then scale to [0, 1]
                unique = np.unique(data[name])
                for k, val in enumerate(unique):
                    ret[(data[name] == val), i] = k
                ret[:, i] *= 1.0 / ret[:, i].max()
            elif kind == 'cont':  # scale continuous values by their maximum
                ret[:, i] = data[name].astype('float32')
                ret[:, i] *= 1.0 / ret[:, i].max()
            else:
                print name + ' has no kind specified'
        return ret
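    # Sketch of the encoding above: categorical columns are integer-coded
    # by their sorted unique values and scaled to [0, 1]; e.g. a two-valued
    # column such as sex ends up as {0.0, 1.0}.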
##########################################################################################
class MNISTData(Data):
    def __init__(self, path=None, convert2image=True, warp_on=False,
                 shift_augment=True, center=True):
        if path is None:
            (self.train_d, self.train_l), (self.valid_d, self.valid_l), \
                (self.test_d, self.test_l) = self.download()
        else:
            path = os.path.expanduser(path)
            (self.train_d, self.train_l), (self.valid_d, self.valid_l), \
                (self.test_d, self.test_l) = ut.pickleLoad(path)

        self.warp_on = warp_on
        self.shift_augment = shift_augment
        self.return_flat = not convert2image
        self.test_l = self.test_l.astype(np.int16)
        self.train_l = self.train_l.astype(np.int16)
        self.valid_l = self.valid_l.astype(np.int16)
        if center:
            self.test_d -= self.test_d.mean()
            self.train_d -= self.train_d.mean()
            self.valid_d -= self.valid_d.mean()

        self.convert_to_image()
        if self.shift_augment:
            self._stripborder(1)
            self.train_d, self.train_l = self._augmentMNIST(self.train_d, self.train_l,
                                                            crop=2, factor=4)
        super(MNISTData, self).__init__()
        if not convert2image:
            self.example_shape = self.train_d[0].size
        print "MNIST data is converted/augmented to shape", self.example_shape
    def download(self):
        if os.name == 'nt':
            dest = os.path.join(os.environ['APPDATA'], 'ELEKTRONN')
        else:
            dest = os.path.join(os.path.expanduser('~'), '.ELEKTRONN')
        if not os.path.exists(dest):
            os.makedirs(dest)
        dest = os.path.join(dest, 'mnist.pkl.gz')
        if os.path.exists(dest):
            print "Found existing MNIST data"
            return ut.pickleLoad(dest)
        else:
            print "Downloading MNIST data from"
            print "http://www.elektronn.org/downloads/mnist.pkl.gz"
            f = urllib2.urlopen("http://www.elektronn.org/downloads/mnist.pkl.gz")
            data = f.read()
            print "Saving data to %s" % (dest,)
            with open(dest, "wb") as code:
                code.write(data)
            return ut.pickleLoad(dest)
    def convert_to_image(self):
        """For MNIST / flattened 2d, single-layer, square images"""
        valid_size = self.valid_l.size
        test_size = self.test_l.size
        data = np.vstack((self.valid_d, self.test_d, self.train_d))
        size = data[0].size
        n = int(np.sqrt(size))
        assert n * n == size, '<convert_to_image> data is not square'
        count = data.shape[0]
        data = data.reshape((count, 1, n, n))
        self.valid_d = data[:valid_size]
        self.test_d = data[valid_size:valid_size + test_size]
        self.train_d = data[valid_size + test_size:]
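    # Sketch: for the standard MNIST pickle the 784-dimensional rows become
    # (1, 28, 28) images, e.g. train_d: (50000, 784) -> (50000, 1, 28, 28).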
    def getbatch(self, batch_size, source='train'):
        if source == 'valid':
            ret = super(MNISTData, self).getbatch(batch_size, 'valid')
        elif source == 'test':
            ret = super(MNISTData, self).getbatch(batch_size, 'test')
        else:
            d, l = super(MNISTData, self).getbatch(batch_size, source)
            if self.warp_on:
                d = self._warpaugment(d)
            ret = d, l
        if self.return_flat:
            ret = (ret[0].reshape((batch_size, -1)), ret[1])
        return ret
    def _stripborder(self, pix=1):
        # Only valid/test are cropped here; train_d is shrunk to the same
        # size by the cropping inside _augmentMNIST.
        s = self.train_d.shape[-1]
        self.valid_d = self.valid_d[:, :, pix:s - pix, pix:s - pix]
        self.test_d = self.test_d[:, :, pix:s - pix, pix:s - pix]

    def _warpaugment(self, d, amount=1):
        rot_max = 5 * amount
        shear_max = 7 * amount
        scale_max = 1.15 * amount
        stretch_max = 0.25 * amount
        shear = shear_max * 2 * (np.random.rand() - 0.5)
        twist = rot_max * 2 * (np.random.rand() - 0.5)
        rot = 0  # min(rot_max - abs(twist), rot_max * (np.random.rand()))
        scale = 1 + (scale_max - 1) * np.random.rand(2)
        stretch = stretch_max * 2 * (np.random.rand(4) - 0.5)
        ps = (d.shape[0],) + d.shape[2:]
        w = warping.warp3dFast(d, ps, rot, shear, (scale[0], scale[1], 1),
                               stretch, twist)
        return w

    def _augmentMNIST(self, data, label, crop=2, factor=4):
        """
        Creates new data by cropping/shifting the data.
        Control the blow-up with ``factor`` and the maximum offset with ``crop``.
        """
        n = data.shape[-1]
        new_size = n - crop
        new_data = np.zeros((0, 1, new_size, new_size), dtype=np.float32)  # store new data in here
        new_label = np.zeros((0,), dtype=np.int16)
        pos = [(i % crop, int(i / crop) % crop) for i in xrange(crop**2)]  # offsets of the crop positions
        perm = np.random.permutation(xrange(crop**2))
        for i in xrange(factor):  # create <factor> new versions of the data
            ix = pos[perm[i]]
            new = data[:, :, ix[0]:ix[0] + new_size, ix[1]:ix[1] + new_size]
            new_data = np.concatenate((new_data, new), axis=0)
            new_label = np.concatenate((new_label, label), axis=0)
        return new_data, new_label
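    # Sketch of the shift augmentation: with crop=2 and factor=4, every
    # 28x28 digit yields four 26x26 crops at the offsets (0,0), (1,0),
    # (0,1), (1,1) (in shuffled order), quadrupling the training set;
    # valid/test are cropped to the matching 26x26 by _stripborder.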
class BuzzData(Data):
    def __init__(self, path='~/devel/data/Buzz/Twitter/twitter.pkl',
                 norm_targets=True, target_scale=9999, fold_no=0):
        path = os.path.expanduser(path)
        data, target = ut.pickleLoad(path)
        # N = len(data)
        # data = data.reshape((N, -1))
        # data = data[:8000]
        # target = target[:8000]
        if norm_targets:
            target /= target.max()
        if target_scale is not None:
            target = np.log10(target * target_scale + 1)

        super(BuzzData, self).createCVSplit(data, target, use_fold=fold_no)
        super(BuzzData, self).__init__()
        self.example_shape = data.shape[-1]
        self.n_taps = data.shape[-2]
        self.n_lab = 1
    def getbatch(self, batch_size, source='train'):
        d, l = super(BuzzData, self).getbatch(batch_size, source=source)
        l = l[:, None]  # add a singleton label axis for regression targets
        return d, l
class PianoData(Data):
    def __init__(self, path='~/devel/data/PianoRoll/Nottingham_enc.pkl',
                 n_tap=20, n_lab=58):
        path = os.path.expanduser(path)
        (self.train_d, self.valid_d, self.test_d) = ut.pickleLoad(path)
        super(PianoData, self).__init__(n_lab=n_lab)
        self.example_shape = self.train_d[0].shape[-1]
        self.n_taps = n_tap
        self.n_lab = n_lab
    def getbatch(self, batch_size, source='train'):
        if source == 'train':
            if (self._pos + batch_size) < self._training_count:
                self._pos += batch_size
                slice = self._perm[self._pos - batch_size:self._pos]
            else:  # draw a new permutation
                self._perm = self.rng.permutation(self._training_count)
                self._pos = 0
                slice = self._perm[:batch_size]
            data = [self.train_d[i] for i in slice]
        elif source == 'valid':
            data = self.valid_d[:batch_size]
        elif source == 'test':
            data = self.test_d[:batch_size]

        lengths = np.array(map(len, data))
        # Pick a random window of n_taps steps per song; the step right
        # after the window is the prediction target.
        start_t = np.round(np.random.rand(batch_size) * (lengths - self.n_taps - 1)).astype(np.int)
        x = np.array([d[t:t + self.n_taps].astype(np.float32) for d, t in zip(data, start_t)])
        y = np.array([d[t + self.n_taps] for d, t in zip(data, start_t)])
        return x, y
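    # Sketch (assuming each time step of the encoded piano rolls is a
    # 58-dimensional pitch vector): for n_tap=20,
    #   >>> x, y = piano.getbatch(32)
    #   >>> x.shape, y.shape    # (32, 20, 58), (32, 58)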
class GeneData(Data):
    def __init__(self, path='~/devel/data/GEMLeR_GeneExpression/Breast_Colon.pkl',
                 fold_no=0):
        path = os.path.expanduser(path)
        data, target = ut.pickleLoad(path)
        super(GeneData, self).createCVSplit(data, target, use_fold=fold_no)
        super(GeneData, self).__init__()
        self.example_shape = data.shape[-1]
        self.n_lab = 1
    def getbatch(self, batch_size, source='train'):
        d, l = super(GeneData, self).getbatch(batch_size, source=source)
        l = l[:, None]
        return d, l
if __name__ == "__main__":
    # from matplotlib import pyplot as plt
    # from Net.introspection import embedMatricesInGray
    # data = MNISTData(warp_on=True)
    # d, l = data.getbatch(200, 'train')
    # m = embedMatricesInGray(d[:, 0])
    # plt.imshow(m, interpolation='none', cmap='gray')
    data = MNISTData(path=None, convert2image=False, shift_augment=False)
    # data = PianoData(n_tap=20, n_lab=58)
    d, l = data.getbatch(10)
    data = AdultData()