lvl1/genPreds_CNN_Tim.py

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 30 21:28:32 2015.
Script written by Tim Hochberg with parameter tweaks by Bluefool.
https://www.kaggle.com/bitsofbits/grasp-and-lift-eeg-detection/naive-nnet
Modifications: rc, alex
"""
import os
import sys
if __name__ == '__main__' and __package__ is None:
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)
import yaml
from glob import glob
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import roc_auc_score
# Lasagne (& friends) imports
import theano
from nolearn.lasagne import BatchIterator, NeuralNet, TrainSplit
from lasagne.objectives import aggregate, binary_crossentropy
from lasagne.layers import (InputLayer, DropoutLayer, DenseLayer, Conv1DLayer,
                            Conv2DLayer)
from lasagne.updates import nesterov_momentum, adam
from theano.tensor.nnet import sigmoid
from mne import concatenate_raws, pick_types
from preprocessing.aux import creat_mne_raw_object
from preprocessing.filterBank import FilterBank
# Silence some warnings from lasagne
import warnings
warnings.filterwarnings('ignore', '.*topo.*')
warnings.filterwarnings('ignore', module='.*lasagne.init.*')
warnings.filterwarnings('ignore', module='.*nolearn.lasagne.*')
####
yml = yaml.load(open(sys.argv[1]))
fileName = yml['Meta']['file']
filt2Dsize = yml['filt2Dsize'] if 'filt2Dsize' in yml.keys() else 0
filters = yml['filters']
delay = yml['delay']
skip = yml['skip']
if 'bags' in yml.keys():
    bags = yml['bags']
else:
    bags = 3
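# Illustrative YAML layout, inferred only from the keys read above; the values
# and the exact 'filters' format expected by FilterBank are placeholders:
#
#     Meta:
#         file: cnn_example     # names the output .npy and report files
#     filters: ...              # filter-bank specification for FilterBank
#     delay: 1024               # window length in samples (becomes SAMPLE_SIZE)
#     skip: 4                   # downsampling factor (becomes DOWNSAMPLE)
#     bags: 3                   # optional, defaults to 3
#     filt2Dsize: 0             # optional; nonzero selects the Conv2D variant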
mode = sys.argv[2]
if mode == 'val':
    test = False
elif mode == 'test':
    test = True
else:
    raise ValueError('Invalid mode. Please specify either val or test')
###########
SUBJECTS = list(range(1, 13))
TRAIN_SERIES = list(range(1, 9))
TEST_SERIES = [9, 10]
N_ELECTRODES = 32
N_EVENTS = 6
SAMPLE_SIZE = delay
DOWNSAMPLE = skip
TIME_POINTS = SAMPLE_SIZE // DOWNSAMPLE
TRAIN_SIZE = 5120
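# With the illustrative config values above (delay=1024, skip=4), each
# training window covers SAMPLE_SIZE=1024 raw samples per electrode,
# downsampled to TIME_POINTS=256 points. Each call to net.fit() below passes
# TRAIN_SIZE=5120 index placeholders (-1), each replaced by a random window
# position by IndexBatchIterator.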
# We encapsulate the event / electrode data in a Source object.
def preprocessData(data):
    """Preprocess data with filterbank."""
    fb = FilterBank(filters)
    return fb.transform(data)
class Source:
    """Loads, preprocesses and holds data."""

    mean = None
    std = None

    def load_raw_data(self, subject, series):
        """Load data for a subject / series."""
        test = series == TEST_SERIES
        if not test:
            fnames = [glob('../data/train/subj%d_series%d_data.csv' %
                           (subject, i)) for i in series]
        else:
            fnames = [glob('../data/test/subj%d_series%d_data.csv' %
                           (subject, i)) for i in series]
        fnames = list(np.concatenate(fnames))
        fnames.sort()
        raw_train = [creat_mne_raw_object(fname, read_events=not test)
                     for fname in fnames]
        raw_train = concatenate_raws(raw_train)
        # pick eeg signal
        picks = pick_types(raw_train.info, eeg=True)
        self.data = raw_train._data[picks].transpose()
        self.data = preprocessData(self.data)
        if not test:
            self.events = raw_train._data[32:].transpose()

    def normalize(self):
        """Normalize data."""
        self.data -= self.mean
        self.data /= self.std
class TrainSource(Source):
    """Source for training data."""

    def __init__(self, subject, series_list):
        """Init."""
        self.load_raw_data(subject, series_list)
        self.mean = self.data.mean(axis=0)
        self.std = self.data.std(axis=0)
        self.normalize()
# Note that Test/Submit sources use the mean/std from the training data.
# This is both standard practice and avoids using future data in the
# test set.
class TestSource(Source):
    """Source for test data."""

    def __init__(self, subject, series, train_source):
        """Init."""
        self.load_raw_data(subject, series)
        self.mean = train_source.mean
        self.std = train_source.std
        self.normalize()
# Lay out the Neural net.
class LayerFactory:
    """Helper class that makes laying out Lasagne layers more pleasant."""

    def __init__(self):
        """Init."""
        self.layer_cnt = 0
        self.kwargs = {}

    def __call__(self, layer, layer_name=None, **kwargs):
        """Call."""
        self.layer_cnt += 1
        name = layer_name or "layer{0}".format(self.layer_cnt)
        for k, v in kwargs.items():
            self.kwargs["{0}_{1}".format(name, k)] = v
        return (name, layer)
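# For illustration: LF(DenseLayer, num_units=1024) returns the tuple
# ('layer1', DenseLayer) for the nolearn layers list and records
# {'layer1_num_units': 1024} in LF.kwargs, which is later handed to the
# NeuralNet constructor via **LF.kwargs.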
class IndexBatchIterator(BatchIterator):
    """Generate BatchData from indices.

    Rather than passing the data into the fit function, we just pass in
    indices to the data. The actual data is then grabbed from a Source object
    that is passed in at the creation of the IndexBatchIterator. Passing in
    '-1' grabs a random value from the Source.

    As a result, an "epoch" here isn't a traditional epoch, which looks at all
    the time points. Instead a random subsample of 0.8*TRAIN_SIZE points from
    the training data is used each "epoch" and 0.2*TRAIN_SIZE points are used
    for validation.
    """

    def __init__(self, source, *args, **kwargs):
        """Init."""
        super(IndexBatchIterator, self).__init__(*args, **kwargs)
        self.source = source
        if source is not None:
            # Tack on (SAMPLE_SIZE-1) copies of the first value so that it is
            # easy to grab SAMPLE_SIZE points even from the first location.
            x = source.data
            input_shape = [len(x) + (SAMPLE_SIZE - 1), N_ELECTRODES]
            self.augmented = np.zeros(input_shape, dtype=np.float32)
            self.augmented[SAMPLE_SIZE-1:] = x
            self.augmented[:SAMPLE_SIZE-1] = x[0]
        if filt2Dsize:
            input_shape = [self.batch_size, 1, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        else:
            input_shape = [self.batch_size, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        self.Ybuf = np.zeros([self.batch_size, N_EVENTS], np.float32)
    def transform(self, X_indices, y_indices):
        """Transform."""
        X_indices, y_indices = super(IndexBatchIterator,
                                     self).transform(X_indices, y_indices)
        [count] = X_indices.shape
        # Use preallocated space
        X = self.Xbuf[:count]
        Y = self.Ybuf[:count]
        for i, ndx in enumerate(X_indices):
            if ndx == -1:
                ndx = np.random.randint(len(self.source.events))
            sample = self.augmented[ndx:ndx+SAMPLE_SIZE]
            # Reverse so we get the most recent point; otherwise downsampling
            # drops the last DOWNSAMPLE-1 points.
            if filt2Dsize:
                X[i][0] = sample[::-1][::DOWNSAMPLE].transpose()
            else:
                X[i] = sample[::-1][::DOWNSAMPLE].transpose()
            if y_indices is not None:
                Y[i] = self.source.events[ndx]
        Y = None if (y_indices is None) else Y
        return X, Y
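
# For illustration (window sizes depend on the YAML config): with delay=1024
# and skip=4, transform() takes a (1024, 32) slice of the augmented data,
# reverses it, keeps every 4th row and transposes, yielding a (32, 256) input
# per index (or (1, 32, 256) when filt2Dsize is set), matching the InputLayer
# shapes declared in create_net() below.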
# Simple / Naive net. Borrows from Daniel Nouri's Facial Keypoint Detection
# Tutorial
def create_net(train_source, test_source, batch_size=128, max_epochs=100,
               train_val_split=False):
    """Create NN."""
    if train_val_split:
        train_val_split = TrainSplit(eval_size=0.2)
    else:
        train_val_split = TrainSplit(eval_size=False)
    batch_iter_train = IndexBatchIterator(train_source, batch_size=batch_size)
    batch_iter_test = IndexBatchIterator(test_source, batch_size=batch_size)
    LF = LayerFactory()

    dense = 1024  # number of units in the fully connected layers

    if filt2Dsize:
        inputLayer = LF(InputLayer,
                        shape=(None, 1, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv2DLayer, num_filters=8,
                       filter_size=(N_ELECTRODES, filt2Dsize))
    else:
        inputLayer = LF(InputLayer, shape=(None, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv1DLayer, num_filters=8, filter_size=1)

    layers = [
        inputLayer,
        LF(DropoutLayer, p=0.5),
        convLayer,
        # Standard fully connected net from now on
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, layer_name="output", num_units=N_EVENTS,
           nonlinearity=sigmoid)
    ]

    def loss(x, t):
        return aggregate(binary_crossentropy(x, t))

    if filt2Dsize:
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=adam,
                         update_learning_rate=0.001,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)
    else:
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=nesterov_momentum,
                         update_learning_rate=0.02,
                         update_momentum=0.9,
                         # update=adam,
                         # update_learning_rate=0.001,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)
    return nnet
# Do the training.
print('Running in mode %s, saving to file %s' % (mode, fileName))
report = pd.DataFrame(index=[fileName])
start_time = time()

train_indices = np.zeros([TRAIN_SIZE], dtype=int) - 1
np.random.seed(67534)
valid_series = [7, 8]
max_epochs = 100

if test is False:
    # Validation mode: hold out series 7 and 8 from each subject and
    # predict them.
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        lbls_tot = []
        for subject in range(1, 13):
            tseries = sorted(set(TRAIN_SERIES) - set(valid_series))
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, valid_series, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)
            auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                           zip(test_source.events.T, probs.T)])
            print('Bag %d, subject %d, AUC: %.5f' % (bag, subject, auc))
            probs_tot.append(probs)
            lbls_tot.append(test_source.events)
        probs_tot = np.concatenate(probs_tot)
        lbls_tot = np.concatenate(lbls_tot)
        auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                       zip(lbls_tot.transpose(), probs_tot.transpose())])
        print(auc)
        probs_bags.append(probs_tot)
    # Average the per-bag predictions.
    probs_bags = np.mean(probs_bags, axis=0)
    np.save('val/val_%s.npy' % fileName, [probs_bags])
else:
    # Test mode: train on all training series and predict series 9 and 10.
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        for subject in range(1, 13):
            tseries = set(TRAIN_SERIES)
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, TEST_SERIES, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)
            print('Bag %d, subject %d' % (bag, subject))
            probs_tot.append(probs)
        probs_tot = np.concatenate(probs_tot)
        probs_bags.append(probs_tot)
    probs_bags = np.mean(probs_bags, axis=0)
    np.save('test/test_%s.npy' % fileName, [probs_bags])

prefix = 'test_' if test else 'val_'
end_time = time()
report['Time'] = end_time - start_time
report.to_csv("report/%s_%s.csv" % (prefix, fileName))
print(report)