Diff of /lvl1/genPreds_RNN.py [000000] .. [21363a]

Switch to unified view

a b/lvl1/genPreds_RNN.py
1
# -*- coding: utf-8 -*-
"""
Created on Wed Jul  8 21:56:55 2015.

@author: rc, alex
"""
import os
import sys
if __name__ == '__main__' and __package__ is None:
    # When run directly as a script, put the repository root (one level
    # above this file's directory) on sys.path so that sibling packages
    # (preprocessing, ensembling) are importable below.
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)
import pandas as pd
14
import numpy as np
15
import yaml
16
from time import time
17
from copy import deepcopy
18
from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
19
from sklearn.metrics import roc_auc_score
20
from sklearn.pipeline import make_pipeline
21
22
from preprocessing.aux import load_raw_data
23
from ensembling.NeuralNet import NeuralNet
24
25
26
def _from_yaml_to_func(method, params):
27
    """go from yaml to method.
28
29
    Need to be here for accesing local variables.
30
    """
31
    prm = dict()
32
    if params is not None:
33
        for key, val in params.iteritems():
34
            prm[key] = eval(str(val))
35
    return eval(method)(**prm)
36
37
# ## read model parameters ###
# Load the experiment description from the YAML file given as argv[1].
# FIX: use a with-block so the config file handle is closed (the original
# left it open via yaml.load(open(...))).
# SECURITY: yaml.load without an explicit Loader can construct arbitrary
# Python objects, and the exec below runs config-driven import statements
# -- only feed trusted config files.
with open(sys.argv[1]) as ymlFile:
    yml = yaml.load(ymlFile)

# Dynamically import the names listed in the config, e.g.
#   imports: {sklearn.preprocessing: [StandardScaler]}
# turns into "from sklearn.preprocessing import StandardScaler".
for pkg, functions in yml['imports'].items():
    stri = 'from ' + pkg + ' import ' + ','.join(functions)
    exec(stri)

fileName = yml['Meta']['file']
training_params = yml['Training']
architecture = yml['Architecture']

# Training hyper-parameters (semantics defined by ensembling.NeuralNet)
delay = training_params['delay']
skip = training_params['skip']
parts_train = training_params['parts_train']
parts_test = training_params['parts_test']
smallEpochs = training_params['smallEpochs']
majorEpochs = training_params['majorEpochs']
checkEveryEpochs = training_params['checkEveryEpochs']
subsample = training_params['subsample']

# meta settings
cache_preprocessed = yml['Meta']['cachePreprocessed']
# preprocessing pipeline
# Each entry of yml['Preprocessing'] is a one-key mapping
# {TransformerName: params}; instantiate them in config order and wrap
# them into a single sklearn pipeline.
pipe = []
for item in yml['Preprocessing']:
    # .items() instead of py2-only .iteritems()
    for method, params in item.items():
        pipe.append(_from_yaml_to_func(method, params))
preprocess_base = make_pipeline(*pipe)

# post preprocessing: optional second pipeline applied after the (cached)
# base preprocessing output.
postpreprocess_base = None
if 'PostPreprocessing' in yml:
    pipe = []
    for item in yml['PostPreprocessing']:
        for method, params in item.items():
            pipe.append(_from_yaml_to_func(method, params))
    postpreprocess_base = make_pipeline(*pipe)
78
# Select validation or test mode from argv[2].
mode = sys.argv[2]
if mode == 'val':
    test = False
elif mode == 'test':
    test = True
else:
    # BUG FIX: the original used raise('...'), i.e. raising a bare
    # string, which is itself a TypeError at runtime; raise a proper
    # exception type instead.
    raise ValueError('Invalid mode. Please specify either val or test')

# Output location and filename prefix follow the chosen mode.
if test:
    folder = 'test/'
    prefix = 'test_'
else:
    folder = 'val/'
    prefix = 'val_'
# required transformers

print 'Running %s, to be saved in file %s' % (mode, fileName)

# Per-model output directory, e.g. val/<fileName>/
saveFolder = folder + fileName
if not os.path.exists(saveFolder):
    os.makedirs(saveFolder)

# #### define lists #####
# Subjects are numbered 1..12 in this dataset.
subjects = range(1, 13)
widgets = ['Cross Val : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
           ' ', ETA(), ' ']
pbar = ProgressBar(widgets=widgets, maxval=len(subjects))
pbar.start()

# One-row report indexed by the model file name; AUC/Time columns are
# filled in at the end of the run.
report = pd.DataFrame(index=[fileName])
start_time = time()

# Fixed seed for reproducible initialisation/subsampling.
np.random.seed(4234521)
113
# #### generate predictions #####
# For each subject: preprocess train/test data (with optional .npy
# caching), train a NeuralNet, and save its test-set probabilities.
for subject in subjects:
    print 'Loading data for subject %d...' % subject
    # ############### READ DATA ###############################################
    # When test is True the "test" part is presumably the unlabeled
    # submission set -- TODO confirm against preprocessing.aux.
    data_train, labels_train, data_test, labels_test = load_raw_data(subject,
                                                                     test)
    cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)

    # copy processing pipeline to start fresh
    preprocess = deepcopy(preprocess_base)
    if postpreprocess_base is not None:
        postpreprocess = deepcopy(postpreprocess_base)
    else:
        postpreprocess = None

    # ### preprocessing ####
    print 'Preprocessing Training data...'

    if cache_preprocessed and os.path.isfile(cacheFile):
        # if cache activated + file exist, load file
        # NOTE(review): on a train-cache hit, `preprocess` is never
        # fitted, so the transform() call on the test data below only
        # works if the test cache exists too -- confirm.
        trainPreprocessed = np.load(cacheFile)
    else:
        # if not, do preprocessing
        trainPreprocessed = preprocess.fit_transform(data_train, labels_train)
        # if cache activated but no file, save
        if cache_preprocessed:
            np.save(cacheFile, trainPreprocessed)

    # Post-preprocessing is applied on top of the (possibly cached)
    # base-pipeline output.
    if postpreprocess is not None:
        trainPreprocessed = postpreprocess.fit_transform(trainPreprocessed,
                                                         labels_train)

    # Replace NaNs from the transformers with zeros before training.
    trainPreprocessed[np.isnan(trainPreprocessed)] = 0
    # update subsampling for test Preprocessing
    # Presumably update_subsample(subsample, 1) switches a step from the
    # training subsample rate to full-rate output for the test pass --
    # verify in the transformer implementations.
    for name, step in preprocess.steps:
        if hasattr(step, 'update_subsample'):
            step.update_subsample(subsample, 1)

    if postpreprocess is not None:
        for name, step in postpreprocess.steps:
            if hasattr(step, 'update_subsample'):
                step.update_subsample(subsample, 1)

    print 'Preprocessing Test data...'
    cacheFile = '%s/test_sub%d.npy' % (saveFolder, subject)

    if cache_preprocessed and os.path.isfile(cacheFile):
        # if cache activated + file exist, load file
        testPreprocessed = np.load(cacheFile)
    else:
        # if not, do preprocessing
        testPreprocessed = preprocess.transform(data_test)
        # if cache activated but no file, save
        if cache_preprocessed:
            np.save(cacheFile, testPreprocessed)

    if postpreprocess is not None:
        testPreprocessed = postpreprocess.transform(testPreprocessed)
    testPreprocessed[np.isnan(testPreprocessed)] = 0

    # Build and train the RNN; hyper-parameters come from the YAML config.
    model = NeuralNet(None, architecture, training_params,
                      partsTrain=parts_train,partsTest=parts_test,
                      delay=delay,skip=skip,subsample=subsample,
                      majorEpochs=majorEpochs,smallEpochs=smallEpochs,
                      checkEveryEpochs=checkEveryEpochs)

    model.fit(trainPreprocessed,labels_train,testPreprocessed,labels_test)

    preds = model.predict_proba(testPreprocessed)

    # In validation mode report the per-subject mean column-wise AUC
    # (one score per output column, averaged).
    if not test:
        auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
                      zip(labels_test.T, preds.T)])
        print("%d, test AUC : %.5f" % (subject, auc))

    # Persist per-subject predictions; aggregated again after the loop.
    np.save('%s/sub%d.npy' % (saveFolder, subject), preds)

    # clear memory
    preds = None
    trainPreprocessed = None
    testPreprocessed = None

    # update progress Bar
    pbar.update(subject)
if not test:
    # Ground-truth labels for the whole validation set; the last two
    # columns of infos_val.npy are dropped -- presumably metadata
    # (e.g. subject/series ids), TODO confirm.
    labels = np.load('../infos_val.npy')[:, :-2]

# ## AGGREGATE HERE
# Re-load the per-subject prediction files and stack them, in subject
# order, into one array covering the full set.
preds_tot = []
for subject in subjects:
    preds_tot.append(np.load('%s/sub%d.npy' % (saveFolder, subject)))
preds_tot = np.concatenate(preds_tot)
if not test:
    # Column-wise AUC (one score per output column), then the mean.
    auc = [roc_auc_score(trueVals, p) for trueVals, p in
                  zip(labels.transpose(), preds_tot.transpose())]
    print np.mean(auc)
    report['AUC'] = np.mean(auc)
# Wrapped in a one-element list before saving -- presumably downstream
# ensembling code expects this extra leading dimension; verify.
preds_tot = [preds_tot]

# ## save the model ###
np.save(folder + prefix + fileName + '.npy', preds_tot)
end_time = time()
report['Time'] = end_time - start_time
# NOTE(review): prefix already ends with '_', so this writes e.g.
# "report/val__<file>.csv" with a double underscore -- confirm intended.
report.to_csv("report/%s_%s.csv" % (prefix, fileName))
print report