Diff of /ensembling/NeuralNet.py [000000] .. [21363a]

Switch to unified view

a b/ensembling/NeuralNet.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Sat Aug 15 18:18:11 2015
4
5
@author: rc, alex
6
"""
7
8
import numpy as np
9
from sklearn.base  import BaseEstimator, ClassifierMixin
10
from sklearn.metrics import roc_auc_score
11
12
from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
13
14
from preprocessing.aux import delay_preds, delay_preds_2d
15
from utils.nn import buildNN
16
17
18
class NeuralNet(BaseEstimator,ClassifierMixin):

    """Ensembling with a Neural Network.

    The network itself is created by ``buildNN`` from ``architecture`` and
    ``training_params``. Before being fed to the network, the input is
    expanded with past time samples by ``delay_preds`` / ``delay_preds_2d``.

    Parameters
    ----------
    ensemble :
        Stored on the instance; not read by any method of this class.
    architecture :
        Sequence of dicts describing the layers. ``fit`` reads the first key
        of each dict as the layer type; 'Conv' selects the CNN data layout,
        'GRU' / 'LSTM' select the recurrent one.
    training_params : dict
        Passed to ``buildNN``. NOTE: ``fit`` writes the 'num_strides' key
        into this dict when the architecture contains a 'Conv' layer.
    partsTrain : int
        Due to RAM limitations the model is interchangeably trained on
        'partsTrain' portions of the data.
    partsTest : int
        Also due to RAM limitations testing data is split into 'partsTest'
        parts.
    delay : int
        How many past time samples to include along with the most recent one.
    skip : int
        Subsampling step applied to the above past samples.
    jump :
        Custom subsampling scheme; per the original authors it overrides
        ``delay`` / ``skip``. Only forwarded to the delay helper.
    subsample : int
        Additional stride applied to the training rows and targets.
    smallEpochs : int
        Number of epochs performed on the current portion of training data.
    majorEpochs : int
        Number of major epochs; each one trains on a new portion of the data.
    checkEveryEpochs : int
        Print progress (and test AUC, if test data is given) every this many
        major epochs.
    verbose : bool
        Whether to calculate and print results during training.
    """

    def __init__(self,ensemble,architecture,training_params,
                 partsTrain=1,partsTest=1,
                 delay=4000,skip=100,jump=None,subsample=1,
                 smallEpochs=2,majorEpochs=20,checkEveryEpochs=2,
                 verbose=True):
        """Store the hyper-parameters; no model is built until ``fit``."""
        ### timecourse history parameters ###
        # how many past time samples to include along with the most recent sample
        self.delay = delay
        # subsample above samples
        self.skip = skip
        # here can be set a custom subsampling scheme, it overrides previous params
        self.jump = jump

        ### RAM saving ###
        # due to RAM limitations the model is interchangeably trained on
        # 'partsTrain' portions of the data
        self.partsTrain = partsTrain
        # also due to RAM limitations testing data has to be split into
        # 'partsTest' parts
        self.partsTest = partsTest

        ### training ###
        # amount of epochs to perform on the current portion of the training data
        self.smallEpochs = smallEpochs
        # amount of major epochs to perform,
        # i.e. on each major epoch a new portion of training data is obtained
        self.majorEpochs = majorEpochs
        # print AUC computed on test set every 'checkEveryEpochs' major epochs
        self.checkEveryEpochs = checkEveryEpochs

        # whether to calculate and print results during training
        self.verbose = verbose

        # used in bagging to set different starting points when subsampling the data
        self.mdlNr = 0

        self.subsample = subsample

        self.architecture = architecture
        self.ensemble = ensemble
        self.training_params = training_params

    def _reshape_for_model(self, data):
        """Return ``data`` in the layout the network expects.

        For CNN architectures a 2D array (rows, features) is reshaped to
        (rows, 1, features, 1); otherwise ``data`` is returned unchanged.
        """
        if self.isCNN:
            return data.reshape((data.shape[0],1,data.shape[1],1))
        return data

    def fit(self,X,y,Xtest=None,ytest=None):
        """Build the network and train it on rotating portions of ``X``.

        Parameters
        ----------
        X :
            Training data; ``X.shape[1]`` is used as the base input dimension.
        y :
            Training targets, subsampled with the same start/stride as ``X``.
        Xtest, ytest : optional
            When both are given and ``verbose`` is set, the AUC on a
            subsampled version of the test data is printed every
            ``checkEveryEpochs`` major epochs.

        Returns
        -------
        self
        """
        input_dim = X.shape[1]
        # set different data preparation schemes basing on what kind of NN is it
        # next(iter(d)) yields the first key on both Python 2 and 3
        # (dict.keys() is not subscriptable on Python 3).
        layers = [next(iter(layer)) for layer in self.architecture]
        self.isCNN = 'Conv' in layers
        self.isRecurrent = 'GRU' in layers or 'LSTM' in layers
        if self.isCNN:
            self.addDelay = delay_preds
            self.training_params['num_strides'] = self.delay//self.skip
        elif self.isRecurrent:
            self.addDelay = delay_preds_2d
        else:
            # floor division keeps the Python 2 integer semantics and matches
            # the 'num_strides' computation of the CNN branch
            input_dim = int(input_dim * (self.delay//self.skip))
            self.addDelay = delay_preds

        # create the model
        self.model = buildNN(self.architecture, self.training_params, input_dim)

        widgets = ['Training : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                   ' ', ETA(), ' ']
        pbar = ProgressBar(widgets=widgets, maxval=self.majorEpochs)
        pbar.start()

        # train the model on a portion of training data; that portion is
        # changed each majorEpoch
        for majorEpoch in range(self.majorEpochs):
            # offset of the current portion; whenever the epoch-based offset
            # is 0 the `or` falls through to the mdlNr-based offset
            startingPoint = majorEpoch%self.partsTrain or self.mdlNr%self.partsTrain

            delayArgs = dict(delay=self.delay, skip=self.skip,
                             subsample=self.partsTrain, start=startingPoint)
            # 'jump' is only forwarded when a custom scheme was requested
            if self.jump is not None:
                delayArgs['jump'] = self.jump
            trainData = self._reshape_for_model(self.addDelay(X, **delayArgs))
            targets = y[startingPoint::self.partsTrain]

            trainData = trainData[::self.subsample]
            targets = targets[::self.subsample]

            self.model.fit(trainData, targets, nb_epoch=self.smallEpochs,
                           batch_size=512,verbose=0,show_accuracy=True)

            # drop the reference so the memory can be reclaimed
            trainData = None

            # report the number of *completed* major epochs
            pbar.update(majorEpoch + 1)

            if self.verbose and majorEpoch%self.checkEveryEpochs == 0:
                print("Total epochs: %d" % (self.smallEpochs*(majorEpoch+1)))
                if Xtest is not None and ytest is not None:
                    pred = self._predict_proba_train(Xtest)
                    # ytest is subsampled like Xtest in _predict_proba_train
                    score = np.mean(roc_auc_score(ytest[0::self.partsTest],pred))
                    print("Test AUC : %.5f" % (score))
                    pred = None

        pbar.finish()

        if self.verbose:
            # computed from the configuration rather than from the loop
            # variable, which is undefined if the loop never ran
            print('Training finished after %d epochs'% (self.smallEpochs*self.majorEpochs))

        return self

    def predict_proba(self,X):
        """Get predictions for ``X``, processed in ``partsTest`` chunks.

        Each chunk after the first is extended backwards by up to ``delay``
        rows so the delay helper sees the preceding history; the rows
        belonging to that overlap are dropped again before prediction.
        Assumes the delay helper returns one row per input row (the original
        code relies on the same) - TODO confirm against preprocessing.aux.
        """
        pred = []
        for part in range(self.partsTest):
            begin = part*len(X)//self.partsTest
            stop = (part+1)*len(X)//self.partsTest
            # never reach back past the start of the data: a negative start
            # would be interpreted as an index from the end of X
            start = max(begin-self.delay, 0) if part > 0 else begin
            overlap = begin - start
            testData = self.addDelay(X[start:stop], delay=self.delay, skip=self.skip,
                                     jump=self.jump)[overlap:]
            testData = self._reshape_for_model(testData)
            pred.append(self.model.predict_proba(testData, batch_size=512,verbose=0))
            testData = None
        pred = np.concatenate(pred)
        return pred

    def _predict_proba_train(self,X):
        """ Only used internally during training - subsamples test data for speed """
        testData = self.addDelay(X, delay=self.delay, skip=self.skip,
                                 subsample=self.partsTest,start=0,jump=self.jump)
        testData = self._reshape_for_model(testData)
        pred = self.model.predict_proba(testData, batch_size=512,verbose=0)
        testData = None
        return pred