Diff of /lvl1/genPreds.py [000000] .. [21363a]

Switch to unified view

a b/lvl1/genPreds.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Wed Jul  8 21:56:55 2015.
4
5
@author: alex, rc
6
7
This script contain code to generate lvl1 model prediction.
8
usage : python genPreds.py model_name mode
9
with mode = val for validation and val = test for test.
10
11
This script will read the model description from the yaml file, load
12
dependencies, create preprocessing and classification pipeline and apply them
13
on raw data independently on each subjects.
14
15
This script support caching of preprocessed data, in order to allow reuse of
16
preprocessing pipeline across model.
17
"""
18
import os
19
import sys
20
if __name__ == '__main__' and __package__ is None:
    # When run directly as a script, add the repository root (parent of this
    # file's directory) to sys.path so sibling packages such as
    # preprocessing.aux can be imported.
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)
23
24
import numpy as np
25
import pandas as pd
26
from time import time
27
from copy import deepcopy
28
import yaml
29
from sklearn.pipeline import make_pipeline, Pipeline
30
from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
31
32
from sklearn.metrics import roc_auc_score
33
34
from preprocessing.aux import getEventNames, load_raw_data
35
36
from multiprocessing import Pool
37
# Event names — one prediction column per event; drives the per-event loops.
cols = getEventNames()
38
39
40
def _from_yaml_to_func(method, params):
    """Instantiate the object described by one yaml pipeline entry.

    `method` is the name of a class/function that must already be in scope
    (e.g. imported through the yaml 'imports' section); `params` is a
    mapping of keyword arguments whose values are themselves evaluated.

    Needs to be defined here so eval() can resolve names in this module.

    NOTE(security): eval() on configuration content — only run trusted
    yaml files through this script.
    """
    prm = dict()
    if params is not None:
        # dict.items() works on both Python 2 and 3 (iteritems is Py2-only)
        for key, val in params.items():
            prm[key] = eval(str(val))
    return eval(method)(**prm)
50
51
52
def doCols(col):
    """Fit every classifier on one event column; return test probabilities.

    Reads the module-level globals `clfs`, `trainPreprocessed`,
    `labels_train` and `testPreprocessed` (presumably shared with worker
    processes via fork when multiprocessing is used — verify on non-POSIX
    platforms). Returns one probability vector per classifier.
    """
    target = labels_train[:, col]
    probas = []
    for model in clfs:
        model.fit(trainPreprocessed, target)
        probas.append(model.predict_proba(testPreprocessed)[:, 1])
    return probas
59
60
61
# Load the model description. Use a context manager so the file handle is
# closed (the original passed a bare open() and leaked it).
# NOTE(security): yaml.load can execute arbitrary python tags; prefer
# yaml.safe_load unless the configs rely on such tags — TODO confirm.
with open(sys.argv[1]) as ymlfile:
    yml = yaml.load(ymlfile)

# Import the packages/functions listed in the yaml 'imports' section into
# this module's namespace so _from_yaml_to_func's eval can resolve them.
# NOTE(security): exec on configuration content — trusted configs only.
for pkg, functions in yml['imports'].items():
    stri = 'from ' + pkg + ' import ' + ','.join(functions)
    exec(stri)
67
68
# meta settings
fileName = yml['Meta']['file']                        # base name for outputs
cores = yml['Meta']['cores']                          # worker processes
subsample = yml['Meta']['subsample']                  # train subsampling factor
cache_preprocessed = yml['Meta']['cachePreprocessed'] # cache .npy per subject

# Optional settings with defaults — dict.get replaces the original
# key-membership checks with if/else fallbacks.
subsample_test = yml['Meta'].get('subsample_test', 1)
addPreprocessed = yml['Meta'].get('addPreprocessed', [])
83
84
# preprocessing pipeline: each yaml entry is a {method: params} mapping
# turned into an estimator by _from_yaml_to_func.
pipe = []
for item in yml['Preprocessing']:
    # dict.items() works on both Python 2 and 3 (iteritems is Py2-only)
    for method, params in item.items():
        pipe.append(_from_yaml_to_func(method, params))
preprocess_base = make_pipeline(*pipe)

# optional post-preprocessing pipeline (applied after feature merging)
postpreprocess_base = None
if 'PostPreprocessing' in yml:
    pipe = []
    for item in yml['PostPreprocessing']:
        for method, params in item.items():
            pipe.append(_from_yaml_to_func(method, params))
    postpreprocess_base = make_pipeline(*pipe)

# models: each yaml entry is the textual body of a sklearn Pipeline step
# list, evaluated into an actual Pipeline object.
clfs = []
for mdl in yml['Models']:
    clfs.append('Pipeline([ %s ])' % mdl)

for i, clf in enumerate(clfs):
    clfs[i] = eval(clf)
107
108
# ## read arguments ###

mode = sys.argv[2]
if mode == 'val':
    test = False
elif mode == 'test':
    test = True
else:
    # Bug fix: the original did raise('...'), which raises a plain string —
    # itself a TypeError at runtime — instead of a proper exception.
    raise ValueError('Invalid mode. Please specify either val or test')

# Output locations depend on the mode.
if test:
    folder = 'test/'
    prefix = 'test_'
else:
    folder = 'val/'
    prefix = 'val_'


print('Running %s, to be saved in file %s' % (mode, fileName))

# per-model output folder, e.g. val/<fileName>/ — created if missing
saveFolder = folder + fileName
if not os.path.exists(saveFolder):
    os.makedirs(saveFolder)
131
132
# #### define lists #####
# Subjects 1..12 (range upper bound is exclusive).
subjects = range(1, 13)
# Console progress bar, one tick per subject.
widgets = ['Cross Val : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
           ' ', ETA(), ' ']
pbar = ProgressBar(widgets=widgets, maxval=len(subjects))
pbar.start()

# One-row report (AUC + timing) indexed by the model file name.
report = pd.DataFrame(index=[fileName])
start_time = time()
141
# #### generate predictions #####
# Main per-subject loop: preprocess (with optional .npy caching), merge any
# externally precomputed features, train all classifiers per event, and save
# per-subject prediction matrices.
for subject in subjects:
    print 'Loading data for subject %d...' % subject
    # ############### READ DATA ###############################################
    # NOTE(review): when test is True, labels_test is presumably unused —
    # the AUC branch below is skipped; confirm against load_raw_data.
    data_train, labels_train, data_test, labels_test = load_raw_data(subject,
                                                                     test)

    trainPreprocessed = None
    testPreprocessed = None
    cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)
    # copy processing pipeline to start fresh for each subject (pipeline
    # steps are fitted per subject)
    preprocess = deepcopy(preprocess_base)
    if postpreprocess_base is not None:
        postpreprocess = deepcopy(postpreprocess_base)
    else:
        postpreprocess = None

    # ### preprocessing ####
    print 'Preprocessing Training data...'

    if cache_preprocessed and os.path.isfile(cacheFile):
        # if cache activated + file exists, load file
        trainPreprocessed = np.load(cacheFile)
    else:
        # if not, do preprocessing
        trainPreprocessed = preprocess.fit_transform(data_train, labels_train)
        # if cache activated but no file, save, then drop the in-memory
        # copy — it is reloaded from the cache file further below
        if cache_preprocessed:
            np.save(cacheFile, trainPreprocessed)
            trainPreprocessed = None
    data_train = None  # release raw data

    print 'Preprocessing Test data...'
    cacheFile = '%s/test_sub%d.npy' % (saveFolder, subject)

    # update subsampling for test Preprocessing (steps exposing
    # update_subsample switch from the train to the test subsample rate)
    for name, step in preprocess.steps:
        if hasattr(step, 'update_subsample'):
            step.update_subsample(subsample, subsample_test)

    if cache_preprocessed and os.path.isfile(cacheFile):
        # if cache activated + file exists, load file
        testPreprocessed = np.load(cacheFile)
    else:
        # if not, do preprocessing
        testPreprocessed = preprocess.transform(data_test)
        # if cache activated but no file, save
        if cache_preprocessed:
            np.save(cacheFile, testPreprocessed)
    data_test = None  # release raw data

    print 'Post Preprocessing data...'
    if cache_preprocessed and (trainPreprocessed is None):
        # cache activated and train features were dropped above — reload
        cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)
        trainPreprocessed = np.load(cacheFile)

    # Add precomputed train features of other models listed in the config
    # file (loaded from their own cache folders) as extra columns.
    for feat_name in addPreprocessed:
        featFile = '%s/%s/train_sub%d.npy' % (folder, feat_name, subject)
        if os.path.isfile(featFile):
            feat = np.load(featFile)
            if trainPreprocessed is None:
                trainPreprocessed = feat
            else:
                trainPreprocessed = np.c_[trainPreprocessed, feat]
            feat = None
        else:
            raise ValueError("File %s does not exist" % featFile)

    # Same merge for the test features.
    for feat_name in addPreprocessed:
        featFile = '%s/%s/test_sub%d.npy' % (folder, feat_name, subject)
        if os.path.isfile(featFile):
            feat = np.load(featFile)
            if testPreprocessed is None:
                testPreprocessed = feat
            else:
                testPreprocessed = np.c_[testPreprocessed, feat]
            feat = None
        else:
            raise ValueError('File %s does not exist' % featFile)

    # Replace NaNs by 0 so the classifiers don't choke on missing values.
    trainPreprocessed[np.isnan(trainPreprocessed)] = 0
    testPreprocessed[np.isnan(testPreprocessed)] = 0

    if postpreprocess is not None:
        trainPreprocessed = postpreprocess.fit_transform(trainPreprocessed,
                                                         labels_train)
        # switch subsample rate before transforming the test set
        for name, step in postpreprocess.steps:
            if hasattr(step, 'update_subsample'):
                step.update_subsample(subsample, subsample_test)

        testPreprocessed = postpreprocess.transform(testPreprocessed)

    print 'Training models...'
    # NOTE(review): assumes the preprocessing pipeline subsampled the train
    # epochs by the same factor, so labels must be subsampled to match —
    # confirm against the preprocessing steps.
    labels_train = labels_train[::subsample]
    # doCols reads trainPreprocessed / testPreprocessed / labels_train as
    # module globals; with cores > 1 they reach the workers through Pool
    # (presumably via fork — verify on non-POSIX platforms).
    if cores == 1:
        preds = [doCols(i) for i in range(len(cols))]
    else:
        pool = Pool(processes=cores)
        preds = pool.map(doCols, range(len(cols)))
        pool.close()
    # ### results #####
    print 'Aggregating results...'
    # preds is indexed [event][classifier]; regroup per classifier into an
    # (n_samples, n_events) matrix and save it per subject.
    for i in range(len(clfs)):
        pred_i = [j[i] for j in preds]
        pred_i = np.array(np.vstack(pred_i)).transpose()
        np.save('%s/sub%d_clf%d.npy' % (saveFolder, subject, i), pred_i)
        if not test:
            # mean column-wise AUC for this subject/classifier
            auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                          zip(labels_test[::subsample_test].T, pred_i.T)])
            print '%d, clf %d: %.5f' % (subject, i, auc)

    # clear memory
    preds = None
    trainPreprocessed = None
    testPreprocessed = None

    # update progress Bar
    pbar.update(subject)
262
263
264
if not test:
    # ground-truth labels for the whole validation set (last two info
    # columns dropped)
    labels = np.load('../infos_val.npy')[:, :-2]

# ## AGGREGATE HERE
# Concatenate the per-subject prediction matrices of each classifier and,
# in validation mode, report the mean column-wise AUC.
preds_tot = []

for i in range(len(clfs)):
    per_subject = [np.load('%s/sub%d_clf%d.npy' % (saveFolder, subject, i))
                   for subject in subjects]
    preds_tot.append(np.concatenate(per_subject))
    if not test:
        auc = [roc_auc_score(trueVals, p) for trueVals, p in
               zip(labels[::subsample_test].T, preds_tot[i].T)]
        report['AUC'] = np.mean(auc)
        print(np.mean(auc))
281
282
# ## save the model: stacked predictions feed the next ensemble level ###
outFile = '%s%s%s.npy' % (folder, prefix, fileName)
np.save(outFile, preds_tot)

# ## save timing/AUC report as csv and echo it to the console
end_time = time()
report['Time'] = end_time - start_time
report.to_csv("report/%s_%s.csv" % (prefix, fileName))
print(report)