--- a +++ b/ensembling/NeuralNet.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Aug 15 18:18:11 2015 + +@author: rc, alex +""" + +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import roc_auc_score + +from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker + +from preprocessing.aux import delay_preds, delay_preds_2d +from utils.nn import buildNN + + +class NeuralNet(BaseEstimator,ClassifierMixin): + + """ Ensembling with a Neural Network """ + + def __init__(self,ensemble,architecture,training_params, + partsTrain=1,partsTest=1, + delay=4000,skip=100,jump=None,subsample=1, + smallEpochs=2,majorEpochs=20,checkEveryEpochs=2, + verbose=True): + """Init.""" + ### timecourse history parameters ### + # how many past time samples to include along with the most recent sample + self.delay = delay + # subsample above samples + self.skip = skip + # here can be set a custom subsampling scheme, it overrides previous params + self.jump = jump + + ### RAM saving ### + # due to RAM limitations the model is interchangeably trained on 'partsTrain' portions of the data + self.partsTrain = partsTrain + # also due to RAM limitations testing data has to be split into 'partsTest' parts + self.partsTest = partsTest + + ### training ### + # amounts of epochs to perform on the current portion of the training data + self.smallEpochs = smallEpochs + # amounts of major epochs to perform, + # i.e. on each major epoch a new portion of training data is obtained + self.majorEpochs = majorEpochs + # print AUC computed on test set every major epochs + self.checkEveryEpochs = checkEveryEpochs + + # whether to calculate and print results during training + self.verbose = verbose + + # used in bagging to set different starting points when subsampling the data + self.mdlNr = 0 + + self.subsample = subsample + + self.architecture = architecture + self.ensemble = ensemble + self.training_params = training_params + + def fit(self,X,y,Xtest=None,ytest=None): + """Fit.""" + input_dim = X.shape[1] + # set different data preparation schemes basing on what kind of NN is it + layers = [i.keys()[0] for i in self.architecture] + self.isCNN = 'Conv' in layers + self.isRecurrent = 'GRU' in layers or 'LSTM' in layers + if self.isCNN: + self.addDelay = delay_preds + self.training_params['num_strides'] = self.delay//self.skip + elif self.isRecurrent: + self.addDelay = delay_preds_2d + else: + input_dim *= self.delay/self.skip + input_dim = int( input_dim ) + self.addDelay = delay_preds + + # create the model + self.model = buildNN(self.architecture, self.training_params, input_dim) + + widgets = ['Training : ', Percentage(), ' ', Bar(marker=RotatingMarker()), + ' ', ETA(), ' '] + pbar = ProgressBar(widgets=widgets, maxval=self.majorEpochs) + pbar.start() + + # train the model on a portion of training data; that portion is changed each majorEpoch + for majorEpoch in range(self.majorEpochs): + startingPoint = majorEpoch%self.partsTrain or self.mdlNr%self.partsTrain + if self.jump is not None: + trainData = self.addDelay(X, delay=self.delay, skip=self.skip, + subsample=self.partsTrain,start=startingPoint, jump=self.jump) + else: + trainData = self.addDelay(X, delay=self.delay, skip=self.skip, + subsample=self.partsTrain,start=startingPoint) + + if self.isCNN: + trainData = trainData.reshape((trainData.shape[0],1,trainData.shape[1],1)) + targets = y[startingPoint::self.partsTrain] + + trainData = trainData[::self.subsample] + targets = targets[::self.subsample] + + self.model.fit(trainData, targets, nb_epoch=self.smallEpochs, + batch_size=512,verbose=0,show_accuracy=True) + + trainData=None + + pbar.update(majorEpoch) + + if self.verbose and majorEpoch%self.checkEveryEpochs == 0: + print("Total epochs: %d" % (self.smallEpochs*(majorEpoch+1))) + if Xtest is not None and ytest is not None: + pred = self._predict_proba_train(Xtest) + score = np.mean(roc_auc_score(ytest[0::self.partsTest],pred)) + print("Test AUC : %.5f" % (score)) + pred = None + + if self.verbose: + print('Training finished after %d epochs'% (self.smallEpochs*(majorEpoch+1))) + + def predict_proba(self,X): + """Get predictions.""" + pred = [] + for part in range(self.partsTest): + start = part*len(X)//self.partsTest-self.delay*(part>0) + stop = (part+1)*len(X)//self.partsTest + testData = self.addDelay(X[slice(start,stop)], delay=self.delay, skip=self.skip, + jump=self.jump)[self.delay*(part>0):] + if self.isCNN: + testData = testData.reshape((testData.shape[0],1,testData.shape[1],1)) + pred.append(self.model.predict_proba(testData, batch_size=512,verbose=0)) + testData = None + pred = np.concatenate(pred) + return pred + + def _predict_proba_train(self,X): + """ Only used internally during training - subsamples test data for speed """ + testData = self.addDelay(X, delay=self.delay, skip=self.skip,subsample=self.partsTest,start=0,jump=self.jump) + if self.isCNN: + testData = testData.reshape((testData.shape[0],1,testData.shape[1],1)) + pred = self.model.predict_proba(testData, batch_size=512,verbose=0) + testData = None + return pred \ No newline at end of file