Diff of /lvl1/genPreds_RNN.py [000000] .. [21363a]

--- a
+++ b/lvl1/genPreds_RNN.py
@@ -0,0 +1,219 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  8 21:56:55 2015.
+
+@author: rc, alex
+"""
+import os
+import sys
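+# when run as a script, add the repository root (the parent of lvl1/) to the
+# path so that the preprocessing and ensembling packages can be imported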
+if __name__ == '__main__' and __package__ is None:
+    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    sys.path.append(filePath)
+
+import pandas as pd
+import numpy as np
+import yaml
+from time import time
+from copy import deepcopy
+from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
+from sklearn.metrics import roc_auc_score
+from sklearn.pipeline import make_pipeline
+
+from preprocessing.aux import load_raw_data
+from ensembling.NeuralNet import NeuralNet
+
+
+def _from_yaml_to_func(method, params):
+    """go from yaml to method.
+
+    Need to be here for accesing local variables.
+    """
+    prm = dict()
+    if params is not None:
+        for key, val in params.iteritems():
+            prm[key] = eval(str(val))
+    return eval(method)(**prm)
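+
+# Illustrative example (the class and parameter names are hypothetical; the
+# real ones come from the YAML file passed on the command line): an entry
+#     - SomeFilter: {order: 4, cutoff: 1.0}
+# is turned into the call SomeFilter(order=4, cutoff=1.0), provided SomeFilter
+# was brought into scope by the 'imports' section of the YAML file.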
+
+# ## read model parameters ###
+yml = yaml.load(open(sys.argv[1]))
+
+# Import package
+for pkg, functions in yml['imports'].iteritems():
+    stri = 'from ' + pkg + ' import ' + ','.join(functions)
+    exec(stri)
+
+fileName = yml['Meta']['file']
+training_params = yml['Training']
+architecture = yml['Architecture']
+
+delay = training_params['delay']
+skip = training_params['skip']
+parts_train = training_params['parts_train']
+parts_test = training_params['parts_test']
+smallEpochs = training_params['smallEpochs']
+majorEpochs = training_params['majorEpochs']
+checkEveryEpochs = training_params['checkEveryEpochs']
+subsample = training_params['subsample']
+
+# meta settings
+cache_preprocessed = yml['Meta']['cachePreprocessed']
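+
+# The keys read above imply a YAML layout along these lines (class names and
+# values are placeholders, not taken from any real config):
+#   Meta:          {file: ..., cachePreprocessed: ...}
+#   Training:      {delay: ..., skip: ..., parts_train: ..., parts_test: ...,
+#                   subsample: ..., smallEpochs: ..., majorEpochs: ...,
+#                   checkEveryEpochs: ...}
+#   Architecture:  passed through unchanged to NeuralNet
+#   imports:       {some.module: [SomeFilter, SomeScaler]}
+#   Preprocessing: [{SomeFilter: {order: 4}}, {SomeScaler: null}]
+#   PostPreprocessing (optional): same format as Preprocessing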
+
+# preprocessing pipeline
+pipe = []
+for item in yml['Preprocessing']:
+    for method, params in item.iteritems():
+        pipe.append(_from_yaml_to_func(method, params))
+preprocess_base = make_pipeline(*pipe)
+
+# post preprocessing
+postpreprocess_base = None
+if 'PostPreprocessing' in yml.keys():
+    pipe = []
+    for item in yml['PostPreprocessing']:
+        for method, params in item.iteritems():
+            pipe.append(_from_yaml_to_func(method, params))
+    postpreprocess_base = make_pipeline(*pipe)
+
+
+mode = sys.argv[2]
+if mode == 'val':
+    test = False
+elif mode == 'test':
+    test = True
+else:
+    raise ValueError('Invalid mode. Please specify either val or test')
+
+if test:
+    folder = 'test/'
+    prefix = 'test_'
+else:
+    folder = 'val/'
+    prefix = 'val_'
+
+
+# required transformers
+
+print 'Running %s, to be saved in file %s' % (mode, fileName)
+
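+# per-subject predictions (and optional preprocessing caches) are written
+# under this folder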
+saveFolder = folder + fileName
+if not os.path.exists(saveFolder):
+    os.makedirs(saveFolder)
+
+# #### define lists #####
+subjects = range(1, 13)
+widgets = ['Cross Val : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
+           ' ', ETA(), ' ']
+pbar = ProgressBar(widgets=widgets, maxval=len(subjects))
+pbar.start()
+
+report = pd.DataFrame(index=[fileName])
+start_time = time()
+
+np.random.seed(4234521)
+# #### generate predictions #####
+for subject in subjects:
+    print 'Loading data for subject %d...' % subject
+    # ############### READ DATA ###############################################
+    data_train, labels_train, data_test, labels_test = load_raw_data(subject,
+                                                                     test)
+    cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)
+
+    # copy processing pipeline to start fresh
+    preprocess = deepcopy(preprocess_base)
+    if postpreprocess_base is not None:
+        postpreprocess = deepcopy(postpreprocess_base)
+    else:
+        postpreprocess = None
+
+    # ### preprocessing ####
+    print 'Preprocessing Training data...'
+
+    if cache_preprocessed and os.path.isfile(cacheFile):
+        # if cache activated + file exist, load file
+        trainPreprocessed = np.load(cacheFile)
+    else:
+        # if not, do preprocessing
+        trainPreprocessed = preprocess.fit_transform(data_train, labels_train)
+        # if cache activated but no file, save
+        if cache_preprocessed:
+            np.save(cacheFile, trainPreprocessed)
+
+    if postpreprocess is not None:
+        trainPreprocessed = postpreprocess.fit_transform(trainPreprocessed,
+                                                         labels_train)
+
+    trainPreprocessed[np.isnan(trainPreprocessed)] = 0
+    # update subsampling for test Preprocessing
+    for name, step in preprocess.steps:
+        if hasattr(step, 'update_subsample'):
+            step.update_subsample(subsample, 1)
+
+    if postpreprocess is not None:
+        for name, step in postpreprocess.steps:
+            if hasattr(step, 'update_subsample'):
+                step.update_subsample(subsample, 1)
+
+    print 'Preprocessing Test data...'
+    cacheFile = '%s/test_sub%d.npy' % (saveFolder, subject)
+
+    if cache_preprocessed and os.path.isfile(cacheFile):
+        # if cache activated + file exist, load file
+        testPreprocessed = np.load(cacheFile)
+    else:
+        # if not, do preprocessing
+        testPreprocessed = preprocess.transform(data_test)
+        # if cache activated but no file, save
+        if cache_preprocessed:
+            np.save(cacheFile, testPreprocessed)
+
+    if postpreprocess is not None:
+        testPreprocessed = postpreprocess.transform(testPreprocessed)
+    testPreprocessed[np.isnan(testPreprocessed)] = 0
+
+    model = NeuralNet(None, architecture, training_params,
+                      partsTrain=parts_train, partsTest=parts_test,
+                      delay=delay, skip=skip, subsample=subsample,
+                      majorEpochs=majorEpochs, smallEpochs=smallEpochs,
+                      checkEveryEpochs=checkEveryEpochs)
+
+    model.fit(trainPreprocessed, labels_train, testPreprocessed, labels_test)
+
+    preds = model.predict_proba(testPreprocessed)
+
+    if not test:
+        auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
+                      zip(labels_test.T, preds.T)])
+        print("%d, test AUC : %.5f" % (subject, auc))
+
+    np.save('%s/sub%d.npy' % (saveFolder, subject), preds)
+
+    # clear memory
+    preds = None
+    trainPreprocessed = None
+    testPreprocessed = None
+
+    # update progress Bar
+    pbar.update(subject)
+
+
+if not test:
+    # validation labels: drop the two trailing non-label columns
+    labels = np.load('../infos_val.npy')[:, :-2]
+
+# ## AGGREGATE HERE
+preds_tot = []
+for subject in subjects:
+    preds_tot.append(np.load('%s/sub%d.npy' % (saveFolder, subject)))
+preds_tot = np.concatenate(preds_tot)
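+# predictions are stacked in subject order, which is assumed to match the row
+# order of infos_val.npy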
+if not test:
+    auc = [roc_auc_score(trueVals, p) for trueVals, p in
+           zip(labels.transpose(), preds_tot.transpose())]
+    print 'Mean AUC : %.5f' % np.mean(auc)
+    report['AUC'] = np.mean(auc)
+preds_tot = [preds_tot]
+
+# ## save the model ###
+np.save(folder + prefix + fileName + '.npy', preds_tot)
+end_time = time()
+report['Time'] = end_time - start_time
+report.to_csv("report/%s_%s.csv" % (prefix, fileName))
+print report
\ No newline at end of file