ensembling/WeightedMean.py

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 15 14:12:12 2015.

@author: rc, alex
"""
import numpy as np
from collections import OrderedDict
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp

from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker


class WeightedMeanClassifier(BaseEstimator, ClassifierMixin):

    """Weighted mean classifier with AUC optimization."""

    def __init__(self, ensemble, step=0.025, max_evals=100, mean='arithmetic',
                 verbose=True):
        """Init."""
        self.ensemble = ensemble
        self.step = step
        self.max_evals = max_evals
        self.mean = mean
        self.count = -1
        self.verbose = verbose

        # one weight per model in the ensemble, searched on a discrete grid
        self.param_space = OrderedDict()
        for model in ensemble:
            self.param_space[model] = hp.quniform(model, 0, 3, self.step)

        # input data are arranged in a particular order, whereas hyperopt uses
        # unordered lists when optimizing. The model has to keep track
        # of the initial order so that correct weights are applied to columns
        self.sorting = dict()
        for i, m in enumerate(self.ensemble):
            self.sorting[m] = i

    def fit(self, X, y):
        """Fit."""
        self.best_params = None
        if self.mean != 'simple':
            if self.verbose:
                widgets = ['Training : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                           ' ', ETA(), ' ']
                self.pbar = ProgressBar(widgets=widgets, maxval=(self.max_evals * len(self.param_space)))
                self.pbar.start()

            # X stacks the predictions of each model side by side (6 label
            # columns per model), so X[:, col::6] picks every model's
            # prediction for label `col`. fmin minimizes, hence the negated
            # mean AUC over the 6 labels.
            objective = lambda w: -np.mean([roc_auc_score(y[:, col],
                                            self.calcMean(X[:, col::6], w, training=True))
                                            for col in range(6)])

            self.best_params = fmin(objective, self.param_space, algo=tpe.suggest,
                                    max_evals=self.max_evals)

            if self.verbose:
                print(self.best_params)
        else:
            self.best_params = None

    def predict_proba(self, X):
        """Get predictions."""
        return np.c_[[self.calcMean(X[:, col::6], self.best_params)
                      for col in range(6)]].transpose()

    def calcMean(self, X, w, training=False):
        """Calculate mean according to weights."""
        self.count += 1
        if self.verbose and self.count <= (self.max_evals * len(self.param_space)) and not self.count % 10 and training:
            self.pbar.update(self.count)

        if self.mean == 'simple':
            return np.sum(X, axis=1) / X.shape[1]
        else:
            # reorder the weights to match the original column order of the ensemble
            w = [w[k] for k in sorted(self.sorting, key=self.sorting.get)]
            if self.mean == 'arithmetic':
                return np.sum(X * w, axis=1) / np.sum(w)
            elif self.mean == 'geometric':
                return np.exp(np.sum(np.log(X) * w, axis=1) / np.sum(w))
            elif self.mean == 'power':
                return 1 / (1 + np.exp(-np.sum(X ** w, axis=1)))
            else:
                print('Mean should be either "simple", "arithmetic", "geometric" or "power"')
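

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module). It
# assumes the column layout implied by the `X[:, col::6]` slicing above:
# the level-one predictions of each model are stacked side by side, six label
# columns per model. Model names, shapes, and the random data are made up.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    ensemble = ['modelA', 'modelB']   # one weight is optimized per model
    n_samples, n_labels = 1000, 6

    # fake binary targets, one column per label
    y = rng.randint(0, 2, size=(n_samples, n_labels))
    # fake probabilistic predictions: 6 columns from modelA, then 6 from modelB
    X = np.hstack([rng.rand(n_samples, n_labels) for _ in ensemble])

    clf = WeightedMeanClassifier(ensemble, max_evals=10, mean='arithmetic',
                                 verbose=False)
    clf.fit(X, y)                     # searches the per-model weights
    probs = clf.predict_proba(X)      # shape (n_samples, n_labels)
    print(probs.shape)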