Diff of /ensembling/XGB.py [000000] .. [21363a]

Switch to unified view

a b/ensembling/XGB.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Sat Aug 15 21:19:51 2015.
4
5
@author: rc, alex
6
"""
7
8
import numpy as np
9
from sklearn.base import BaseEstimator, ClassifierMixin
10
11
from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
12
13
from preprocessing.aux import delay_preds
14
import xgboost as xgb
15
16
17
class XGB(BaseEstimator, ClassifierMixin):

    """Ensembling with eXtreme Gradient Boosting.

    One independent ``xgb.XGBClassifier`` is trained per event column of
    ``y``; ``predict_proba`` returns the positive-class probability of each
    of those classifiers side by side, one column per event.
    """

    # Number of event columns in ``y``; one binary classifier is trained for
    # each of them.
    _N_EVENTS = 6

    def __init__(self, ensemble, n_estimators=100, max_depth=5, subsample=0.7,
                 nthread=12, delay=None, skip=None, subsample_data=1,
                 partsTest=1, jump=None):
        """Store the hyper-parameters; no training happens here.

        Parameters
        ----------
        ensemble :
            Stored as-is; not read by any method of this class.
        n_estimators, max_depth, subsample, nthread :
            Forwarded unchanged to every ``xgb.XGBClassifier``.
        delay : int or None
            How many past time samples to include along with the most recent
            sample.
        skip : int or None
            Subsampling step applied to those past samples.
        subsample_data : int
            Keep only every ``subsample_data``-th training row.
        partsTest : int
            Number of chunks the test data is split into in ``predict_proba``.
        jump :
            Custom subsampling scheme that overrides ``delay``/``skip``.
            NOTE(review): exact semantics live in
            ``preprocessing.aux.delay_preds`` - confirm there.
        """
        self.ensemble = ensemble
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.subsample = subsample
        self.nthread = nthread

        ### timecourse history parameters ###
        # history features are only built when both delay and skip are given
        self.applyPreds = delay is not None and skip is not None
        # how many past time samples to include along with the most recent sample
        self.delay = delay
        # subsample above samples
        self.skip = skip
        # here can be set a custom subsampling scheme, it overrides previous params
        self.jump = jump

        # due to RAM limitations testing data has to be split into 'partsTest' parts
        self.partsTest = partsTest

        # subsampling input data as an efficient form of regularization
        self.subsample_data = subsample_data

        # used in bagging to set different starting points when subsampling the data
        self.mdlNr = 0

        # one fitted classifier per event, filled in by fit()
        self.clf = []

    def fit(self, X, y):
        """Train one classifier per event column of ``y``.

        ``X`` and ``y`` are first thinned to every ``subsample_data``-th row
        (starting at an offset derived from ``mdlNr``); if history features
        are enabled, ``X`` is then passed through ``delay_preds``.

        Returns
        -------
        self
        """
        step = self.subsample_data
        # different mdlNr values start the subsampling at different rows
        offset = (self.mdlNr * 5) % step
        X = X[offset::step]
        y = y[offset::step]

        if self.applyPreds:
            # The rows were thinned by `step`, so the history parameters are
            # rescaled by the same factor.  Floor division keeps them integral
            # sample counts on Python 3 as well as Python 2.
            history = {'delay': self.delay // step, 'skip': self.skip // step}
            if self.jump is not None:
                history['jump'] = self.jump // step
            X = delay_preds(X, **history)

        widgets = ['Training : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                   ' ', ETA(), ' ']
        pbar = ProgressBar(widgets=widgets, maxval=self._N_EVENTS)
        pbar.start()

        # training separate models for each event
        self.clf = []
        for col in range(self._N_EVENTS):
            model = xgb.XGBClassifier(n_estimators=self.n_estimators,
                                      max_depth=self.max_depth,
                                      subsample=self.subsample,
                                      nthread=self.nthread)
            model.fit(X, y[:, col])
            self.clf.append(model)
            # report the number of *completed* models so the bar reaches 100%
            pbar.update(col + 1)
        pbar.finish()
        return self

    def _predict_proba(self, X):
        """Predict probability for each event separately, then concatenate results.

        Returns an array with one row per row of ``X`` and one column per
        fitted classifier, holding column 1 of each ``predict_proba`` output.
        """
        per_event = [model.predict_proba(X)[:, 1] for model in self.clf]
        return np.vstack(per_event).transpose()

    def predict_proba(self, X):
        """Predict probability.

        Without history features this is a plain per-event prediction on
        ``X``.  With them, ``X`` is processed in ``partsTest`` consecutive
        chunks to limit memory use; each chunk after the first is extended
        backwards by up to ``delay`` rows so ``delay_preds`` sees the
        preceding history, and the rows added for that purpose are dropped
        again before predicting.
        """
        if not self.applyPreds:
            return self._predict_proba(X)

        n_rows = X.shape[0]
        p = np.zeros((n_rows, self._N_EVENTS))
        for part in range(self.partsTest):
            begin = part * n_rows // self.partsTest
            stop = (part + 1) * n_rows // self.partsTest
            # Rows of history borrowed from before the chunk.  Clamped to the
            # rows that actually exist so the slice start never goes negative
            # (a negative start would wrap around to the end of X).
            lead = min(self.delay, begin)
            X_delayed = delay_preds(X[begin - lead:stop], delay=self.delay,
                                    skip=self.skip, jump=self.jump)[lead:]
            p[begin:stop] += self._predict_proba(X_delayed)
            # release the chunk before building the next one
            del X_delayed
        return p
100