lvl1/genPreds_CNN_Tim.py

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 30 21:28:32 2015.

Script written by Tim Hochberg with parameter tweaks by Bluefool.
https://www.kaggle.com/bitsofbits/grasp-and-lift-eeg-detection/naive-nnet

Modifications: rc, alex
"""
import os
import sys
if __name__ == '__main__' and __package__ is None:
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)

import yaml
from glob import glob
import numpy as np
import pandas as pd
from time import time

from sklearn.metrics import roc_auc_score

# Lasagne (& friends) imports
import theano
from nolearn.lasagne import BatchIterator, NeuralNet, TrainSplit
from lasagne.objectives import aggregate, binary_crossentropy
from lasagne.layers import (InputLayer, DropoutLayer, DenseLayer, Conv1DLayer,
                            Conv2DLayer)
from lasagne.updates import nesterov_momentum, adam
from theano.tensor.nnet import sigmoid

from mne import concatenate_raws, pick_types

from preprocessing.aux import creat_mne_raw_object
from preprocessing.filterBank import FilterBank

# Silence some warnings from lasagne
import warnings
warnings.filterwarnings('ignore', '.*topo.*')
warnings.filterwarnings('ignore', module='.*lasagne.init.*')
warnings.filterwarnings('ignore', module='.*nolearn.lasagne.*')

####
yml = yaml.load(open(sys.argv[1]))
fileName = yml['Meta']['file']

filt2Dsize = yml['filt2Dsize'] if 'filt2Dsize' in yml else 0
filters = yml['filters']
delay = yml['delay']
skip = yml['skip']

if 'bags' in yml:
    bags = yml['bags']
else:
    bags = 3

mode = sys.argv[2]
if mode == 'val':
    test = False
elif mode == 'test':
    test = True
else:
    raise ValueError('Invalid mode. Please specify either val or test')
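
# The config passed as sys.argv[1] is expected to look roughly like this
# (illustrative values only, not taken from any real config in the repo):
#
#   Meta:
#     file: cnn_tim_example
#   filters: [...]          # passed straight to FilterBank below
#   delay: 512              # window length in samples (SAMPLE_SIZE)
#   skip: 8                 # downsampling factor (DOWNSAMPLE)
#   filt2Dsize: 5           # optional; enables the Conv2D variant
#   bags: 3                 # optional; number of bagged nets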


###########
SUBJECTS = list(range(1, 13))
TRAIN_SERIES = list(range(1, 9))
TEST_SERIES = [9, 10]

N_ELECTRODES = 32
N_EVENTS = 6

SAMPLE_SIZE = delay
DOWNSAMPLE = skip
TIME_POINTS = SAMPLE_SIZE // DOWNSAMPLE

TRAIN_SIZE = 5120

# We encapsulate the event / electrode data in a Source object.


def preprocessData(data):
    """Preprocess data with filterbank."""
    fb = FilterBank(filters)
    return fb.transform(data)
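
# FilterBank is a project-specific transformer (preprocessing/filterBank.py);
# the `filters` list comes verbatim from the YAML config, so the exact
# filtering applied here is whatever that config specifies.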


class Source:

    """Loads, preprocesses and holds data."""

    mean = None
    std = None

    def load_raw_data(self, subject, series):
        """Load data for a subject / series."""
        test = series == TEST_SERIES
        if not test:
            fnames = [glob('../data/train/subj%d_series%d_data.csv' %
                      (subject, i)) for i in series]
        else:
            fnames = [glob('../data/test/subj%d_series%d_data.csv' %
                      (subject, i)) for i in series]
        fnames = list(np.concatenate(fnames))
        fnames.sort()
        raw_train = [creat_mne_raw_object(fname, read_events=not test)
                     for fname in fnames]
        raw_train = concatenate_raws(raw_train)
        # pick eeg signal
        picks = pick_types(raw_train.info, eeg=True)

        self.data = raw_train._data[picks].transpose()

        self.data = preprocessData(self.data)

        if not test:
            # Event channels come after the 32 EEG channels.
            self.events = raw_train._data[32:].transpose()

    def normalize(self):
        """Normalize data using the stored per-channel mean and std."""
        self.data -= self.mean
        self.data /= self.std


class TrainSource(Source):

    """Source for training data."""

    def __init__(self, subject, series_list):
        """Init."""
        self.load_raw_data(subject, series_list)
        self.mean = self.data.mean(axis=0)
        self.std = self.data.std(axis=0)
        self.normalize()


# Note that Test/Submit sources use the mean/std from the training data.
# This is both standard practice and avoids using future data from the
# test set.


class TestSource(Source):

    """Source for test data."""

    def __init__(self, subject, series, train_source):
        """Init."""
        self.load_raw_data(subject, series)
        self.mean = train_source.mean
        self.std = train_source.std
        self.normalize()


# Lay out the Neural net.


class LayerFactory:

    """Helper class that makes laying out Lasagne layers more pleasant."""

    def __init__(self):
        """Init."""
        self.layer_cnt = 0
        self.kwargs = {}

    def __call__(self, layer, layer_name=None, **kwargs):
        """Call."""
        self.layer_cnt += 1
        name = layer_name or "layer{0}".format(self.layer_cnt)
        for k, v in kwargs.items():
            self.kwargs["{0}_{1}".format(name, k)] = v
        return (name, layer)
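
# For example (illustrative, mirroring how create_net uses LF below): if the
# fourth call to LF is LF(DenseLayer, num_units=1024), the layer is named
# "layer4" and LF.kwargs gains {"layer4_num_units": 1024}, which create_net
# later splats into the NeuralNet constructor via **LF.kwargs.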


class IndexBatchIterator(BatchIterator):

    """Generate BatchData from indices.

    Rather than passing the data into the fit function, instead we just pass
    in indices to the data.  The actual data is then grabbed from a Source
    object that is passed in at the creation of the IndexBatchIterator.
    Passing in a '-1' grabs a random value from the Source.

    As a result, an "epoch" here isn't a traditional epoch, which looks at
    all the time points. Instead a random subsample of 0.8*TRAIN_SIZE points
    from the training data is used each "epoch" and 0.2*TRAIN_SIZE points are
    used for validation.

    """

    def __init__(self, source, *args, **kwargs):
        """Init."""
        super(IndexBatchIterator, self).__init__(*args, **kwargs)
        self.source = source
        if source is not None:
            # Tack on (SAMPLE_SIZE - 1) copies of the first value so that it
            # is easy to grab SAMPLE_SIZE points even from the first location.
            x = source.data
            input_shape = [len(x) + (SAMPLE_SIZE - 1), N_ELECTRODES]
            self.augmented = np.zeros(input_shape, dtype=np.float32)
            self.augmented[SAMPLE_SIZE-1:] = x
            self.augmented[:SAMPLE_SIZE-1] = x[0]
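            # Illustration (hypothetical sizes, not the config's actual
            # delay/skip): with SAMPLE_SIZE = 4 and 10 raw samples, augmented
            # has 13 rows; rows 0-2 repeat x[0] and rows 3-12 hold x, so
            # augmented[t:t + SAMPLE_SIZE] is always a full window ending at
            # raw sample t.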
        if filt2Dsize:
            input_shape = [self.batch_size, 1, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        else:
            input_shape = [self.batch_size, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        self.Ybuf = np.zeros([self.batch_size, N_EVENTS], np.float32)

    def transform(self, X_indices, y_indices):
        """Transform."""
        X_indices, y_indices = super(IndexBatchIterator,
                                     self).transform(X_indices, y_indices)
        [count] = X_indices.shape
        # Use preallocated space
        X = self.Xbuf[:count]
        Y = self.Ybuf[:count]
        for i, ndx in enumerate(X_indices):
            if ndx == -1:
                ndx = np.random.randint(len(self.source.events))
            sample = self.augmented[ndx:ndx+SAMPLE_SIZE]
            # Reverse so we get the most recent point; otherwise downsampling
            # drops the last DOWNSAMPLE-1 points.
            if filt2Dsize:
                X[i][0] = sample[::-1][::DOWNSAMPLE].transpose()
            else:
                X[i] = sample[::-1][::DOWNSAMPLE].transpose()
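            # Illustration (hypothetical values): with SAMPLE_SIZE = 8 and
            # DOWNSAMPLE = 2, sample[::-1][::2] keeps rows 7, 5, 3, 1 (newest
            # first), whereas sample[::2] would keep 0, 2, 4, 6 and drop the
            # most recent point.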

            if y_indices is not None:
                Y[i] = self.source.events[ndx]
        Y = None if (y_indices is None) else Y
        return X, Y


# Simple / Naive net. Borrows from Daniel Nouri's Facial Keypoint Detection
# tutorial.


def create_net(train_source, test_source, batch_size=128, max_epochs=100,
               train_val_split=False):
    """Create NN."""
    if train_val_split:
        train_val_split = TrainSplit(eval_size=0.2)
    else:
        train_val_split = TrainSplit(eval_size=False)

    batch_iter_train = IndexBatchIterator(train_source, batch_size=batch_size)
    batch_iter_test = IndexBatchIterator(test_source, batch_size=batch_size)
    LF = LayerFactory()

    dense = 1024  # number of units in each fully connected layer

    if filt2Dsize:
        inputLayer = LF(InputLayer, shape=(None, 1, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv2DLayer, num_filters=8,
                       filter_size=(N_ELECTRODES, filt2Dsize))
    else:
        inputLayer = LF(InputLayer, shape=(None, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv1DLayer, num_filters=8, filter_size=1)

    layers = [
        inputLayer,
        LF(DropoutLayer, p=0.5),
        convLayer,
        # Standard fully connected net from now on
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, layer_name="output", num_units=N_EVENTS,
           nonlinearity=sigmoid)
    ]

    def loss(x, t):
        return aggregate(binary_crossentropy(x, t))
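
    # Resulting architecture: input -> dropout(0.5) -> conv (8 filters; a
    # length-1 convolution over time for the 1D case, or a kernel spanning
    # all electrodes for the 2D case) -> dense(1024) -> dropout ->
    # dense(1024) -> dropout -> dense(N_EVENTS, sigmoid). The loss is the
    # mean binary cross-entropy over the six event outputs.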

    if filt2Dsize:
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=adam,
                         update_learning_rate=0.001,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)
    else:
        # Same as above, but with Nesterov momentum instead of adam for the
        # 1D-convolution variant.
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=nesterov_momentum,
                         update_learning_rate=0.02,
                         update_momentum=0.9,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)

    return nnet


# Do the training.
print('Running in mode %s, saving to file %s' % (mode, fileName))
report = pd.DataFrame(index=[fileName])
start_time = time()

# All indices are -1, so IndexBatchIterator.transform draws a random training
# window for each entry; one "epoch" therefore sees TRAIN_SIZE random windows.
train_indices = np.zeros([TRAIN_SIZE], dtype=int) - 1

np.random.seed(67534)

valid_series = [7, 8]
max_epochs = 100
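
# For each bag and each subject, a fresh net is trained and used to predict
# the held-out data: in 'val' mode series 7-8 are held out and scored with
# AUC, in 'test' mode series 9-10 are predicted. Per-bag probability
# matrices are averaged over bags before being saved.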

if test is False:
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        lbls_tot = []
        for subject in range(1, 13):
            tseries = sorted(set(TRAIN_SERIES) - set(valid_series))
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, valid_series, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            # X and y are both index arrays; the batch iterator fetches the
            # actual windows and labels.
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)

            auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                          zip(test_source.events.T, probs.T)])
            print('Bag %d, subject %d, AUC: %.5f' % (bag, subject, auc))
            probs_tot.append(probs)
            lbls_tot.append(test_source.events)

        probs_tot = np.concatenate(probs_tot)
        lbls_tot = np.concatenate(lbls_tot)
        auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                      zip(lbls_tot.transpose(), probs_tot.transpose())])
        print('Bag %d, AUC over all subjects: %.5f' % (bag, auc))
        probs_bags.append(probs_tot)

    probs_bags = np.mean(probs_bags, axis=0)
    np.save('val/val_%s.npy' % fileName, [probs_bags])

else:
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        for subject in range(1, 13):
            tseries = set(TRAIN_SERIES)
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, TEST_SERIES, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)
            print('Bag %d, subject %d' % (bag, subject))
            probs_tot.append(probs)
        probs_tot = np.concatenate(probs_tot)
        probs_bags.append(probs_tot)

    probs_bags = np.mean(probs_bags, axis=0)
    np.save('test/test_%s.npy' % fileName, [probs_bags])


prefix = 'test_' if test else 'val_'
end_time = time()
report['Time'] = end_time - start_time
report.to_csv("report/%s_%s.csv" % (prefix, fileName))
print(report)