Diff of /ensembling/XGB.py [000000] .. [21363a]

--- /dev/null
+++ b/ensembling/XGB.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 15 21:19:51 2015.
+
+@author: rc, alex
+"""
+
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker
+
+from preprocessing.aux import delay_preds
+import xgboost as xgb
+
+
+class XGB(BaseEstimator, ClassifierMixin):
+
+    """Ensembling with eXtreme Gradient Boosting."""
+
+    def __init__(self, ensemble, n_estimators=100, max_depth=5, subsample=0.7,
+                 nthread=12, delay=None, skip=None, subsample_data=1,
+                 partsTest=1, jump=None):
+        """Init."""
+        self.ensemble = ensemble
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.nthread = nthread
+
+        ### timecourse history parameters ###
+        # delayed predictions are applied only when both delay and skip are set
+        self.applyPreds = delay is not None and skip is not None
+        # how many past time samples to include along with the most recent sample
+        self.delay = delay
+        # stride used when subsampling those past samples
+        self.skip = skip
+        # an optional custom subsampling scheme; when set, it overrides the
+        # delay/skip parameters above
+        self.jump = jump
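+        # (as a sketch of the intended semantics: assuming delay_preds
+        #  concatenates lagged copies of its input, delay=300 with skip=30
+        #  would stack features at lags 0, 30, 60, ..., 270; the values are
+        #  illustrative, not defaults)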
+
+        # due to RAM limitations, test data has to be split into 'partsTest' parts
+        self.partsTest = partsTest
+
+        # subsampling the input data as an efficient form of regularization,
+        # e.g. subsample_data=2 keeps every other sample
+        self.subsample_data = subsample_data
+
+        # used in bagging to set different starting points when subsampling the data
+        self.mdlNr = 0
+
+        self.clf = []
+
+    def fit(self, X, y):
+        """Fit."""
+        
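+        # stagger the subsampling start point between bagged models
+        # (mdlNr is set externally; see __init__)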
+        X = X[(self.mdlNr * 5 % self.subsample_data)::self.subsample_data]
+        y = y[(self.mdlNr * 5 % self.subsample_data)::self.subsample_data]
+        
+        if self.applyPreds:
+            # delay/skip/jump are given in original samples, so rescale them
+            # to match the decimated data
+            if self.jump is not None:
+                X = delay_preds(X, delay=self.delay // self.subsample_data,
+                                skip=self.skip // self.subsample_data,
+                                jump=self.jump // self.subsample_data)
+            else:
+                X = delay_preds(X, delay=self.delay // self.subsample_data,
+                                skip=self.skip // self.subsample_data)
+        self.clf = []
+
+        widgets = ['Training : ', Percentage(), ' ',
+                   Bar(marker=RotatingMarker()), ' ', ETA(), ' ']
+        pbar = ProgressBar(widgets=widgets, maxval=6)
+        pbar.start()
+        
+        # training separate models for each event
+        for col in range(6):
+            self.clf.append(xgb.XGBClassifier(n_estimators=self.n_estimators,
+                                              max_depth=self.max_depth,
+                                              subsample=self.subsample,
+                                              nthread=self.nthread))
+            self.clf[col].fit(X, y[:, col])
+            pbar.update(col + 1)
+        pbar.finish()
+
+        return self
+
+    def _predict_proba(self, X):
+        """Predict probability for each event separately, then concatenate the results."""
+        pred = []
+        for col in range(6):
+            pred.append(self.clf[col].predict_proba(X)[:, 1])
+        pred = np.vstack(pred).transpose()
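+        # pred now has shape (n_samples, 6): one probability column per event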
+        return pred
+
+    def predict_proba(self, X):
+        """Predict probability."""
+        if self.applyPreds:
+            p = np.zeros((X.shape[0], 6))
+            # process the test data in 'partsTest' chunks to limit RAM usage;
+            # every chunk after the first is extended backwards by 'delay'
+            # samples so its lagged features exist, and those extra rows are
+            # dropped again after delay_preds
+            for part in range(self.partsTest):
+                start = part * X.shape[0] // self.partsTest - self.delay * (part > 0)
+                stop = (part + 1) * X.shape[0] // self.partsTest
+                X_delayed = delay_preds(X[start:stop], delay=self.delay,
+                                        skip=self.skip,
+                                        jump=self.jump)[self.delay * (part > 0):]
+                start += self.delay * (part > 0)
+                p[start:stop] += self._predict_proba(X_delayed)
+                X_delayed = None  # drop the reference so memory can be freed
+            return p
+        else:
+            return self._predict_proba(X)
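+
+
+# Example usage (a minimal sketch, not from the original pipeline; the level-1
+# prediction matrix 'preds_lvl1', the 6-column event matrix 'labels', and the
+# parameter values below are hypothetical stand-ins):
+#
+#     clf = XGB(ensemble=None, n_estimators=100, max_depth=5,
+#               delay=300, skip=30, partsTest=2)
+#     clf.fit(preds_lvl1, labels)
+#     probs = clf.predict_proba(preds_lvl1)  # shape: (n_samples, 6)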
+