lvl3/genFinal.py

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 15 14:12:12 2015
@author: rc, alex
"""
import os
import sys
# make the repository root importable when run as a standalone script
if __name__ == '__main__' and __package__ is None:
    filePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.append(filePath)
import pandas as pd
import numpy as np
import yaml
from collections import OrderedDict
from sklearn.metrics import roc_auc_score
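# NB: sklearn.cross_validation is the pre-0.18 module name; in newer
# scikit-learn the equivalent splitter is sklearn.model_selection.LeaveOneGroupOut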
from sklearn.cross_validation import LeaveOneLabelOut
from preprocessing.aux import getEventNames
from utils.ensembles import createEnsFunc, loadPredictions
from ensembling.WeightedMean import WeightedMeanClassifier
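
# usage: python genFinal.py <config.yml>
# A minimal, hypothetical config illustrating the fields read below
# (keys mirror the lookups in this script; values are placeholders only):
#
#   fileName: lvl3_wmean
#   ensemble: ['model_a', 'model_b']
#   mean_type: ['arithmetic', 'geometric']
#   subsample: 10
#   seed: 4234521
#   verbose: True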
yml = yaml.load(open(sys.argv[1]))
fileName = yml['fileName']
ensemble = yml['ensemble']
subsample = yml['subsample'] if 'subsample' in yml else 1
seed = yml['seed'] if 'seed' in yml else 4234521
mean_type = yml['mean_type'] if 'mean_type' in yml else ['arithmetic']
verbose = yml['verbose'] if 'verbose' in yml else True
print mean_type
print ensemble
np.random.seed(seed)
print 'Running weighted mean ensemble, results will be saved in submissions/%s.csv' % fileName
# one WeightedMeanClassifier per requested mean type
models = []
for m in mean_type:
    models.append(WeightedMeanClassifier(ensemble, mean=m, verbose=verbose))
# ## load metadata ###
cols = getEventNames()
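
# test-set metadata: column 0 holds the sample id, 1 the subject, 2 the series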
ids = np.load('../infos_test.npy')
subjects_test = ids[:, 1]
series_test = ids[:, 2]
ids = ids[:, 0]
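
# validation labels, with subject and series appended as the last two columns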
labels = np.load('../infos_val.npy')
subjects = labels[:, -2]
series = labels[:, -1]
labels = labels[:, :-2]
allCols = range(len(cols))
# ## loading predictions ###
files = ensemble
preds_val = OrderedDict()
for f in files:
    loadPredictions(preds_val, f, [f], lvl=2)
# ## train/test ###
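# aggregate the loaded level-2 predictions into a single training matrix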
aggr = createEnsFunc(ensemble)
dataTrain = aggr(preds_val)
preds_val = None  # free memory
# cross-validation: hold out one series at a time
aucs = []
cv = LeaveOneLabelOut(series)
p = np.zeros(labels.shape)
for train, test in cv:
    currentSeries = np.unique(series[test])[0]
    for m in range(len(models)):
        models[m].fit(dataTrain[train][::subsample], labels[train][::subsample])
        p[test] += models[m].predict_proba(dataTrain[test]) / len(mean_type)
    aucs.append(roc_auc_score(labels[test], p[test]))
    print 'score on series %d: %.5f' % (currentSeries, aucs[-1])
print 'CV score: %.5f / %.6f' % (np.mean(aucs), np.std(aucs))
np.save('val/val_%s.npy' % fileName, [p])
# train WMs on all training data
models = []
for m in mean_type:
    wm = WeightedMeanClassifier(ensemble, mean=m, verbose=verbose)
    wm.fit(dataTrain[::subsample], labels[::subsample])
    models.append(wm)
dataTrain = None  # free memory
# load test data
preds_test = OrderedDict()
for f in files:
    loadPredictions(preds_test, f, [f], lvl=2, test=True)
dataTest = aggr(preds_test)
preds_test = None  # free memory
# get predictions
p = 0
for m in range(len(models)):
    p += models[m].predict_proba(dataTest) / len(models)
# generate submission
sub = pd.DataFrame(data=p, index=ids, columns=cols)
sub.to_csv('submissions/%s.csv' % fileName, index_label='id', float_format='%.8f')