lvl1/genPreds_CNN_Tim.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 30 21:28:32 2015.

Script written by Tim Hochberg with parameter tweaks by Bluefool.
https://www.kaggle.com/bitsofbits/grasp-and-lift-eeg-detection/naive-nnet

Modifications: rc, alex
"""
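
# Usage (inferred from the argument handling below):
#   python lvl1/genPreds_CNN_Tim.py <config.yml> val   # predict on validation series 7-8
#   python lvl1/genPreds_CNN_Tim.py <config.yml> test  # predict on test series 9-10
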
import os
import sys
if __name__ == '__main__' and __package__ is None:
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)

import yaml
from glob import glob
import numpy as np
import pandas as pd
from time import time

from sklearn.metrics import roc_auc_score

# Lasagne (& friends) imports
import theano
from nolearn.lasagne import BatchIterator, NeuralNet, TrainSplit
from lasagne.objectives import aggregate, binary_crossentropy
from lasagne.layers import (InputLayer, DropoutLayer, DenseLayer, Conv1DLayer,
                            Conv2DLayer)
from lasagne.updates import nesterov_momentum, adam
from theano.tensor.nnet import sigmoid

from mne import concatenate_raws, pick_types

from preprocessing.aux import creat_mne_raw_object
from preprocessing.filterBank import FilterBank

# Silence some warnings from lasagne
import warnings
warnings.filterwarnings('ignore', '.*topo.*')
warnings.filterwarnings('ignore', module='.*lasagne.init.*')
warnings.filterwarnings('ignore', module='.*nolearn.lasagne.*')

####
yml = yaml.load(open(sys.argv[1]))
fileName = yml['Meta']['file']

filt2Dsize = yml['filt2Dsize'] if 'filt2Dsize' in yml.keys() else 0
filters = yml['filters']
delay = yml['delay']
skip = yml['skip']

if 'bags' in yml.keys():
    bags = yml['bags']
else:
    bags = 3

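# Illustrative config layout. The keys are the ones read above; the values
# shown are made-up examples, not settings shipped with the repository, and
# the exact structure of 'filters' is whatever preprocessing.filterBank.
# FilterBank expects:
#
#   Meta:
#     file: cnn_tim        # base name for the saved .npy / report files
#   filters: <filter-bank spec passed to FilterBank>
#   delay: 2000            # window length in samples (SAMPLE_SIZE)
#   skip: 20               # downsampling factor (DOWNSAMPLE)
#   filt2Dsize: 5          # optional; switches the net to a 2D convolution
#   bags: 3                # optional; number of bagged runs (default 3)
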
mode = sys.argv[2]
if mode == 'val':
    test = False
elif mode == 'test':
    test = True
else:
    raise ValueError('Invalid mode. Please specify either val or test')

###########
SUBJECTS = list(range(1, 13))
TRAIN_SERIES = list(range(1, 9))
TEST_SERIES = [9, 10]

N_ELECTRODES = 32
N_EVENTS = 6

SAMPLE_SIZE = delay
DOWNSAMPLE = skip
TIME_POINTS = SAMPLE_SIZE // DOWNSAMPLE

TRAIN_SIZE = 5120

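# Each training example is a window of the SAMPLE_SIZE most recent samples for
# every electrode, downsampled by DOWNSAMPLE, so the net sees TIME_POINTS time
# steps per channel (see IndexBatchIterator.transform below).
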
# We encapsulate the event / electrode data in a Source object.


def preprocessData(data):
    """Preprocess data with filterbank."""
    fb = FilterBank(filters)
    return fb.transform(data)


class Source:

    """Loads, preprocesses and holds data."""

    mean = None
    std = None

    def load_raw_data(self, subject, series):
        """Load data for a subject / series."""
        test = series == TEST_SERIES
        if not test:
            fnames = [glob('../data/train/subj%d_series%d_data.csv' %
                           (subject, i)) for i in series]
        else:
            fnames = [glob('../data/test/subj%d_series%d_data.csv' %
                           (subject, i)) for i in series]
        fnames = list(np.concatenate(fnames))
        fnames.sort()
        raw_train = [creat_mne_raw_object(fname, read_events=not test)
                     for fname in fnames]
        raw_train = concatenate_raws(raw_train)
        # pick eeg signal
        picks = pick_types(raw_train.info, eeg=True)

        self.data = raw_train._data[picks].transpose()

        self.data = preprocessData(self.data)

        if not test:
            self.events = raw_train._data[32:].transpose()

    def normalize(self):
        """Normalize data."""
        self.data -= self.mean
        self.data /= self.std


class TrainSource(Source):

    """Source for training data."""

    def __init__(self, subject, series_list):
        """Init."""
        self.load_raw_data(subject, series_list)
        self.mean = self.data.mean(axis=0)
        self.std = self.data.std(axis=0)
        self.normalize()


# Note that Test/Submit sources use the mean/std from the training data.
# This is both standard practice and avoids using future data from the
# test set.

class TestSource(Source):

    """Source for test data."""

    def __init__(self, subject, series, train_source):
        """Init."""
        self.load_raw_data(subject, series)
        self.mean = train_source.mean
        self.std = train_source.std
        self.normalize()


# Lay out the Neural net.


class LayerFactory:

    """Helper class that makes laying out Lasagne layers more pleasant."""

    def __init__(self):
        """Init."""
        self.layer_cnt = 0
        self.kwargs = {}

    def __call__(self, layer, layer_name=None, **kwargs):
        """Call."""
        self.layer_cnt += 1
        name = layer_name or "layer{0}".format(self.layer_cnt)
        for k, v in kwargs.items():
            self.kwargs["{0}_{1}".format(name, k)] = v
        return (name, layer)

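# For example (hypothetical call, mirroring how LayerFactory is used in
# create_net below):
#     LF = LayerFactory()
#     LF(DenseLayer, num_units=1024)   # -> ('layer1', DenseLayer)
# leaves LF.kwargs == {'layer1_num_units': 1024}, which is the
# '<layername>_<param>' keyword convention nolearn's NeuralNet expects when it
# is given layers as a list of (name, layer_class) pairs.

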
class IndexBatchIterator(BatchIterator):

    """Generate BatchData from indices.

    Rather than passing the data into the fit function, we just pass in
    indices to the data. The actual data is then grabbed from a Source object
    that is passed in at the creation of the IndexBatchIterator. Passing in a
    '-1' grabs a random value from the Source.

    As a result, an "epoch" here isn't a traditional epoch, which looks at all
    the time points. Instead, a random subsample of 0.8*TRAIN_SIZE points from
    the training data is used each "epoch" and 0.2*TRAIN_SIZE points are used
    for validation.

    """

    def __init__(self, source, *args, **kwargs):
        """Init."""
        super(IndexBatchIterator, self).__init__(*args, **kwargs)
        self.source = source
        if source is not None:
            # Tack on (SAMPLE_SIZE-1) copies of the first value so that it is
            # easy to grab SAMPLE_SIZE points even from the first location.
            x = source.data
            input_shape = [len(x) + (SAMPLE_SIZE - 1), N_ELECTRODES]
            self.augmented = np.zeros(input_shape, dtype=np.float32)
            self.augmented[SAMPLE_SIZE-1:] = x
            self.augmented[:SAMPLE_SIZE-1] = x[0]
        if filt2Dsize:
            input_shape = [self.batch_size, 1, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        else:
            input_shape = [self.batch_size, N_ELECTRODES, TIME_POINTS]
            self.Xbuf = np.zeros(input_shape, np.float32)
        self.Ybuf = np.zeros([self.batch_size, N_EVENTS], np.float32)

    def transform(self, X_indices, y_indices):
        """Transform."""
        X_indices, y_indices = super(IndexBatchIterator,
                                     self).transform(X_indices, y_indices)
        [count] = X_indices.shape
        # Use preallocated space
        X = self.Xbuf[:count]
        Y = self.Ybuf[:count]
        for i, ndx in enumerate(X_indices):
            if ndx == -1:
                ndx = np.random.randint(len(self.source.events))
            sample = self.augmented[ndx:ndx+SAMPLE_SIZE]
            # Reverse so we keep the most recent point; otherwise downsampling
            # drops the last DOWNSAMPLE-1 points.
            if filt2Dsize:
                X[i][0] = sample[::-1][::DOWNSAMPLE].transpose()
            else:
                X[i] = sample[::-1][::DOWNSAMPLE].transpose()

            if y_indices is not None:
                Y[i] = self.source.events[ndx]
        Y = None if (y_indices is None) else Y
        return X, Y

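# As allocated above, transform() yields X with shape
# (batch_size, N_ELECTRODES, TIME_POINTS), or
# (batch_size, 1, N_ELECTRODES, TIME_POINTS) when the 2D convolution is used,
# and Y with shape (batch_size, N_EVENTS).
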

# Simple / Naive net. Borrows from Daniel Nouri's Facial Keypoint Detection
# Tutorial.

def create_net(train_source, test_source, batch_size=128, max_epochs=100,
               train_val_split=False):
    """Create NN."""
    if train_val_split:
        train_val_split = TrainSplit(eval_size=0.2)
    else:
        train_val_split = TrainSplit(eval_size=False)

    batch_iter_train = IndexBatchIterator(train_source, batch_size=batch_size)
    batch_iter_test = IndexBatchIterator(test_source, batch_size=batch_size)
    LF = LayerFactory()

    dense = 1024  # number of units in each fully connected layer

    if filt2Dsize:
        inputLayer = LF(InputLayer, shape=(None, 1, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv2DLayer, num_filters=8,
                       filter_size=(N_ELECTRODES, filt2Dsize))
    else:
        inputLayer = LF(InputLayer, shape=(None, N_ELECTRODES, TIME_POINTS))
        convLayer = LF(Conv1DLayer, num_filters=8, filter_size=1)

    layers = [
        inputLayer,
        LF(DropoutLayer, p=0.5),
        convLayer,
        # Standard fully connected net from now on
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, num_units=dense),
        LF(DropoutLayer, p=0.5),
        LF(DenseLayer, layer_name="output", num_units=N_EVENTS,
           nonlinearity=sigmoid)
    ]

    def loss(x, t):
        return aggregate(binary_crossentropy(x, t))

    if filt2Dsize:
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=adam,
                         update_learning_rate=0.001,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)
    else:
        nnet = NeuralNet(y_tensor_type=theano.tensor.matrix,
                         layers=layers,
                         batch_iterator_train=batch_iter_train,
                         batch_iterator_test=batch_iter_test,
                         max_epochs=max_epochs,
                         verbose=0,
                         update=nesterov_momentum,
                         update_learning_rate=0.02,
                         update_momentum=0.9,
                         # update=adam,
                         # update_learning_rate=0.001,
                         objective_loss_function=loss,
                         regression=True,
                         train_split=train_val_split,
                         **LF.kwargs)

    return nnet

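# Bagging: each bag below trains a fresh net per subject (the bags differ only
# through the random weight initialisation and the random index sampling in
# IndexBatchIterator), and the per-sample probabilities are averaged across
# bags with np.mean(probs_bags, axis=0).
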

# Do the training.
print 'Running in mode %s, saving to file %s' % (mode, fileName)
report = pd.DataFrame(index=[fileName])
start_time = time()

train_indices = np.zeros([TRAIN_SIZE], dtype=int) - 1

np.random.seed(67534)

valid_series = [7, 8]
max_epochs = 100

if test is False:
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        lbls_tot = []
        for subject in range(1, 13):
            tseries = sorted(set(TRAIN_SERIES) - set(valid_series))
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, valid_series, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)

            auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                           zip(test_source.events.T, probs.T)])
            print 'Bag %d, subject %d, AUC: %.5f' % (bag, subject, auc)
            probs_tot.append(probs)
            lbls_tot.append(test_source.events)

        probs_tot = np.concatenate(probs_tot)
        lbls_tot = np.concatenate(lbls_tot)
        auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                       zip(lbls_tot.transpose(), probs_tot.transpose())])
        print auc
        probs_bags.append(probs_tot)

    probs_bags = np.mean(probs_bags, axis=0)
    np.save('val/val_%s.npy' % fileName, [probs_bags])

else:
    probs_bags = []
    for bag in range(bags):
        probs_tot = []
        for subject in range(1, 13):
            tseries = set(TRAIN_SERIES)
            train_source = TrainSource(subject, tseries)
            test_source = TestSource(subject, TEST_SERIES, train_source)
            net = create_net(train_source, test_source, max_epochs=max_epochs,
                             train_val_split=False)
            dummy = net.fit(train_indices, train_indices)
            indices = np.arange(len(test_source.data))
            probs = net.predict_proba(indices)
            print 'Bag %d, subject %d' % (bag, subject)
            probs_tot.append(probs)
        probs_tot = np.concatenate(probs_tot)
        probs_bags.append(probs_tot)

    probs_bags = np.mean(probs_bags, axis=0)
    np.save('test/test_%s.npy' % fileName, [probs_bags])


prefix = 'test_' if test else 'val_'
end_time = time()
report['Time'] = end_time - start_time
report.to_csv("report/%s_%s.csv" % (prefix, fileName))
print report