Diff of /lvl1/genPreds.py [000000] .. [21363a]

--- a
+++ b/lvl1/genPreds.py
@@ -0,0 +1,289 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul  8 21:56:55 2015.
+
+@author: alex, rc
+
+This script contains code to generate lvl1 model predictions.
+usage: python genPreds.py model_name mode
+with mode = val for validation and mode = test for test.
+
+This script reads the model description from the yaml file, loads
+dependencies, creates the preprocessing and classification pipelines and
+applies them to the raw data independently for each subject.
+
+This script supports caching of preprocessed data, in order to allow reuse of
+the preprocessing pipeline across models.
+"""
+import os
+import sys
+if __name__ == '__main__' and __package__ is None:
+    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    sys.path.append(filePath)
+
+import numpy as np
+import pandas as pd
+from time import time
+from copy import deepcopy
+import yaml
+from sklearn.pipeline import make_pipeline, Pipeline
+from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
+
+from sklearn.metrics import roc_auc_score
+
+from preprocessing.aux import getEventNames, load_raw_data
+
+from multiprocessing import Pool
+cols = getEventNames()
+
+
+def _from_yaml_to_func(method, params):
+    """go from yaml to method.
+
+    Need to be here for accesing local variables.
+    """
+    prm = dict()
+    if params is not None:
+        for key, val in params.iteritems():
+            prm[key] = eval(str(val))
+    return eval(method)(**prm)
+
+
+def doCols(col):
+    """Train and Predict for one event."""
+    p = []
+    for clf in clfs:
+        clf.fit(trainPreprocessed, labels_train[:, col])
+        p.append(clf.predict_proba(testPreprocessed)[:, 1])
+    return p
+
+
+yml = yaml.load(open(sys.argv[1]))
+
+# Import packages listed in the yaml file
+for pkg, functions in yml['imports'].iteritems():
+    stri = 'from ' + pkg + ' import ' + ','.join(functions)
+    exec(stri)
+
+# meta settings
+fileName = yml['Meta']['file']
+cores = yml['Meta']['cores']
+subsample = yml['Meta']['subsample']
+cache_preprocessed = yml['Meta']['cachePreprocessed']
+
+if 'subsample_test' in yml['Meta'].keys():
+    subsample_test = yml['Meta']['subsample_test']
+else:
+    subsample_test = 1
+
+if 'addPreprocessed' in yml['Meta']:
+    addPreprocessed = yml['Meta']['addPreprocessed']
+else:
+    addPreprocessed = []
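+# addPreprocessed lists the names of other models whose cached preprocessed
+# features (train_sub*.npy / test_sub*.npy) are concatenated to this model's
+# features further below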
+
+# preprocessing pipeline
+pipe = []
+for item in yml['Preprocessing']:
+    for method, params in item.iteritems():
+        pipe.append(_from_yaml_to_func(method, params))
+preprocess_base = make_pipeline(*pipe)
+
+# post preprocessing
+postpreprocess_base = None
+if 'PostPreprocessing' in yml.keys():
+    pipe = []
+    for item in yml['PostPreprocessing']:
+        for method, params in item.iteritems():
+            pipe.append(_from_yaml_to_func(method, params))
+    postpreprocess_base = make_pipeline(*pipe)
+
+# models
+clfs = []
+for mdl in yml['Models']:
+    clfs.append('Pipeline([ %s ])' % mdl)
+
+for i, clf in enumerate(clfs):
+    clfs[i] = eval(clf)
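+# Example (hypothetical): a Models entry written as "('lr', LogisticRegression())"
+# in the yaml file is expanded to Pipeline([ ('lr', LogisticRegression()) ]).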
+
+# ## read arguments ###
+
+mode = sys.argv[2]
+if mode == 'val':
+    test = False
+elif mode == 'test':
+    test = True
+else:
+    raise ValueError('Invalid mode. Please specify either val or test')
+
+if test:
+    folder = 'test/'
+    prefix = 'test_'
+else:
+    folder = 'val/'
+    prefix = 'val_'
+
+
+print 'Running %s, to be saved in file %s' % (mode, fileName)
+
+saveFolder = folder + fileName
+if not os.path.exists(saveFolder):
+    os.makedirs(saveFolder)
+
+# #### define lists #####
+subjects = range(1, 13)
+widgets = ['Cross Val : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
+           ' ', ETA(), ' ']
+pbar = ProgressBar(widgets=widgets, maxval=len(subjects))
+pbar.start()
+
+report = pd.DataFrame(index=[fileName])
+start_time = time()
+# #### generate predictions #####
+for subject in subjects:
+    print 'Loading data for subject %d...' % subject
+    # ############### READ DATA ###############################################
+    data_train, labels_train, data_test, labels_test = load_raw_data(subject,
+                                                                     test)
+
+    trainPreprocessed = None
+    testPreprocessed = None
+    cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)
+    # copy processing pipeline to start fresh
+    preprocess = deepcopy(preprocess_base)
+    if postpreprocess_base is not None:
+        postpreprocess = deepcopy(postpreprocess_base)
+    else:
+        postpreprocess = None
+
+    # ### preprocessing ####
+    print 'Preprocessing Training data...'
+
+    if cache_preprocessed and os.path.isfile(cacheFile):
+        # if cache activated + file exist, load file
+        trainPreprocessed = np.load(cacheFile)
+    else:
+        # if not, do preprocessing
+        trainPreprocessed = preprocess.fit_transform(data_train, labels_train)
+        # if cache activated but no file, save
+        if cache_preprocessed:
+            np.save(cacheFile, trainPreprocessed)
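+            # free memory here; the array is reloaded from the cache file
+            # after the test data has been preprocessed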
+            trainPreprocessed = None
+    data_train = None
+
+    print 'Preprocessing Test data...'
+    cacheFile = '%s/test_sub%d.npy' % (saveFolder, subject)
+
+    # update subsampling for test Preprocessing
+    for name, step in preprocess.steps:
+        if hasattr(step, 'update_subsample'):
+            step.update_subsample(subsample, subsample_test)
+
+    if cache_preprocessed and os.path.isfile(cacheFile):
+        # if cache activated + file exist, load file
+        testPreprocessed = np.load(cacheFile)
+    else:
+        # if not, do preprocessing
+        testPreprocessed = preprocess.transform(data_test)
+        # if cache activated but no file, save
+        if cache_preprocessed:
+            np.save(cacheFile, testPreprocessed)
+    data_test = None
+
+    print 'Post Preprocessing data...'
+    if cache_preprocessed and (trainPreprocessed is None):
+        # if cache activated load file
+        cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject)
+        trainPreprocessed = np.load(cacheFile)
+
+    # Add preprocessed train features if they have been set in the config file
+    for feat_name in addPreprocessed:
+        featFile = '%s/%s/train_sub%d.npy' % (folder, feat_name, subject)
+        if os.path.isfile(featFile):
+            feat = np.load(featFile)
+            if trainPreprocessed is None:
+                trainPreprocessed = feat
+            else:
+                trainPreprocessed = np.c_[trainPreprocessed, feat]
+            feat = None
+        else:
+            raise ValueError("File %s does not exist" % featFile)
+
+    # Add preprocessed test features if they have been set in the config file
+    for feat_name in addPreprocessed:
+        featFile = '%s/%s/test_sub%d.npy' % (folder, feat_name, subject)
+        if os.path.isfile(featFile):
+            feat = np.load(featFile)
+            if testPreprocessed is None:
+                testPreprocessed = feat
+            else:
+                testPreprocessed = np.c_[testPreprocessed, feat]
+            feat = None
+        else:
+            raise ValueError('File %s does not exist' % featFile)
+
+    trainPreprocessed[np.isnan(trainPreprocessed)] = 0
+    testPreprocessed[np.isnan(testPreprocessed)] = 0
+
+    if postpreprocess is not None:
+        trainPreprocessed = postpreprocess.fit_transform(trainPreprocessed,
+                                                         labels_train)
+        for name, step in postpreprocess.steps:
+            if hasattr(step, 'update_subsample'):
+                step.update_subsample(subsample, subsample_test)
+
+        testPreprocessed = postpreprocess.transform(testPreprocessed)
+
+    print 'Training models...'
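+    # labels are subsampled here to stay aligned with the preprocessed
+    # (subsampled) training data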
+    labels_train = labels_train[::subsample]
+    if cores == 1:
+        preds = [doCols(i) for i in range(len(cols))]
+    else:
+        pool = Pool(processes=cores)
+        preds = pool.map(doCols, range(len(cols)))
+        pool.close()
+    # ### results #####
+    print 'Aggregating results...'
+    for i in range(len(clfs)):
+        pred_i = [j[i] for j in preds]
+        pred_i = np.array(np.vstack(pred_i)).transpose()
+        np.save('%s/sub%d_clf%d.npy' % (saveFolder, subject, i), pred_i)
+        if not test:
+            auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in
+                          zip(labels_test[::subsample_test].T, pred_i.T)])
+            print '%d, clf %d: %.5f' % (subject, i, auc)
+
+    # clear memory
+    preds = None
+    trainPreprocessed = None
+    testPreprocessed = None
+
+    # update progress Bar
+    pbar.update(subject)
+
+
+if not test:
+    labels = np.load('../infos_val.npy')[:, :-2]
+
+# ## AGGREGATE HERE
+preds_tot = []
+
+for i in range(len(clfs)):
+    preds_tot.append([])
+    for subject in subjects:
+        preds_tot[i].append(np.load('%s/sub%d_clf%d.npy' % (saveFolder,
+                                                            subject, i)))
+    preds_tot[i] = np.concatenate(preds_tot[i])
+    if not test:
+        auc = [roc_auc_score(trueVals, p) for trueVals, p in
+               zip(labels[::subsample_test].T, preds_tot[i].T)]
+        report['AUC'] = np.mean(auc)
+        print np.mean(auc)
+
+# ## save the model ###
+np.save(folder + prefix + fileName + '.npy', preds_tot)
+
+# ## save report
+end_time = time()
+report['Time'] = end_time - start_time
+report.to_csv("report/%s_%s.csv" % (prefix, fileName))
+print report