--- a +++ b/lvl1/genPreds_RNN.py @@ -0,0 +1,219 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jul 8 21:56:55 2015. + +@author: rc, alex +""" +import os +import sys +if __name__ == '__main__' and __package__ is None: + filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.append(filePath) + +import pandas as pd +import numpy as np +import yaml +from time import time +from copy import deepcopy +from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import make_pipeline + +from preprocessing.aux import load_raw_data +from ensembling.NeuralNet import NeuralNet + + +def _from_yaml_to_func(method, params): + """go from yaml to method. + + Need to be here for accesing local variables. + """ + prm = dict() + if params is not None: + for key, val in params.iteritems(): + prm[key] = eval(str(val)) + return eval(method)(**prm) + +# ## read model parameters ### +yml = yaml.load(open(sys.argv[1])) + +# Import package +for pkg, functions in yml['imports'].iteritems(): + stri = 'from ' + pkg + ' import ' + ','.join(functions) + exec(stri) + +fileName = yml['Meta']['file'] +training_params = yml['Training'] +architecture = yml['Architecture'] + +delay = training_params['delay'] +skip = training_params['skip'] +parts_train = training_params['parts_train'] +parts_test = training_params['parts_test'] +smallEpochs = training_params['smallEpochs'] +majorEpochs = training_params['majorEpochs'] +checkEveryEpochs = training_params['checkEveryEpochs'] +subsample = training_params['subsample'] + +# meta settings +cache_preprocessed = yml['Meta']['cachePreprocessed'] + +# preprocessing pipeline +pipe = [] +for item in yml['Preprocessing']: + for method, params in item.iteritems(): + pipe.append(_from_yaml_to_func(method, params)) +preprocess_base = make_pipeline(*pipe) + +# post preprocessing +postpreprocess_base = None +if 'PostPreprocessing' in yml.keys(): + pipe = [] + for item in yml['PostPreprocessing']: + for method, params in item.iteritems(): + pipe.append(_from_yaml_to_func(method, params)) + postpreprocess_base = make_pipeline(*pipe) + + +mode = sys.argv[2] +if mode == 'val': + test = False +elif mode == 'test': + test = True +else: + raise('Invalid mode. Please specify either val or test') + +if test: + folder = 'test/' + prefix = 'test_' +else: + folder = 'val/' + prefix = 'val_' + + +# required transformers + +print 'Running %s, to be saved in file %s' % (mode, fileName) + +saveFolder = folder + fileName +if not os.path.exists(saveFolder): + os.makedirs(saveFolder) + +# #### define lists ##### +subjects = range(1, 13) +widgets = ['Cross Val : ', Percentage(), ' ', Bar(marker=RotatingMarker()), + ' ', ETA(), ' '] +pbar = ProgressBar(widgets=widgets, maxval=len(subjects)) +pbar.start() + +report = pd.DataFrame(index=[fileName]) +start_time = time() + +np.random.seed(4234521) +# #### generate predictions ##### +for subject in subjects: + print 'Loading data for subject %d...' % subject + # ############### READ DATA ############################################### + data_train, labels_train, data_test, labels_test = load_raw_data(subject, + test) + cacheFile = '%s/train_sub%d.npy' % (saveFolder, subject) + + # copy processing pipeline to start fresh + preprocess = deepcopy(preprocess_base) + if postpreprocess_base is not None: + postpreprocess = deepcopy(postpreprocess_base) + else: + postpreprocess = None + + # ### preprocessing #### + print 'Preprocessing Training data...' + + if cache_preprocessed and os.path.isfile(cacheFile): + # if cache activated + file exist, load file + trainPreprocessed = np.load(cacheFile) + else: + # if not, do preprocessing + trainPreprocessed = preprocess.fit_transform(data_train, labels_train) + # if cache activated but no file, save + if cache_preprocessed: + np.save(cacheFile, trainPreprocessed) + + if postpreprocess is not None: + trainPreprocessed = postpreprocess.fit_transform(trainPreprocessed, + labels_train) + + trainPreprocessed[np.isnan(trainPreprocessed)] = 0 + # update subsampling for test Preprocessing + for name, step in preprocess.steps: + if hasattr(step, 'update_subsample'): + step.update_subsample(subsample, 1) + + if postpreprocess is not None: + for name, step in postpreprocess.steps: + if hasattr(step, 'update_subsample'): + step.update_subsample(subsample, 1) + + print 'Preprocessing Test data...' + cacheFile = '%s/test_sub%d.npy' % (saveFolder, subject) + + if cache_preprocessed and os.path.isfile(cacheFile): + # if cache activated + file exist, load file + testPreprocessed = np.load(cacheFile) + else: + # if not, do preprocessing + testPreprocessed = preprocess.transform(data_test) + # if cache activated but no file, save + if cache_preprocessed: + np.save(cacheFile, testPreprocessed) + + if postpreprocess is not None: + testPreprocessed = postpreprocess.transform(testPreprocessed) + testPreprocessed[np.isnan(testPreprocessed)] = 0 + + model = NeuralNet(None, architecture, training_params, + partsTrain=parts_train,partsTest=parts_test, + delay=delay,skip=skip,subsample=subsample, + majorEpochs=majorEpochs,smallEpochs=smallEpochs, + checkEveryEpochs=checkEveryEpochs) + + model.fit(trainPreprocessed,labels_train,testPreprocessed,labels_test) + + preds = model.predict_proba(testPreprocessed) + + if not test: + auc = np.mean([roc_auc_score(trueVals, p) for trueVals, p in + zip(labels_test.T, preds.T)]) + print("%d, test AUC : %.5f" % (subject, auc)) + + np.save('%s/sub%d.npy' % (saveFolder, subject), preds) + + # clear memory + preds = None + trainPreprocessed = None + testPreprocessed = None + + # update progress Bar + pbar.update(subject) + + +if not test: + labels = np.load('../infos_val.npy')[:, :-2] + +# ## AGGREGATE HERE +preds_tot = [] +for subject in subjects: + preds_tot.append(np.load('%s/sub%d.npy' % (saveFolder, subject))) +preds_tot = np.concatenate(preds_tot) +if not test: + auc = [roc_auc_score(trueVals, p) for trueVals, p in + zip(labels.transpose(), preds_tot.transpose())] + print np.mean(auc) + report['AUC'] = np.mean(auc) +preds_tot = [preds_tot] + +# ## save the model ### +np.save(folder + prefix + fileName + '.npy', preds_tot) +end_time = time() +report['Time'] = end_time - start_time +report.to_csv("report/%s_%s.csv" % (prefix, fileName)) +print report \ No newline at end of file