ensembling/XGB.py

# -*- coding: utf-8 -*-
"""
Created on Sat Aug 15 21:19:51 2015.

@author: rc, alex
"""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from progressbar import Bar, ETA, Percentage, ProgressBar, RotatingMarker

from preprocessing.aux import delay_preds
import xgboost as xgb


class XGB(BaseEstimator, ClassifierMixin):

    """Ensembling with eXtreme Gradient Boosting."""

    def __init__(self, ensemble, n_estimators=100, max_depth=5, subsample=0.7,
                 nthread=12, delay=None, skip=None, subsample_data=1,
                 partsTest=1, jump=None):
        """Init."""
        self.ensemble = ensemble
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.subsample = subsample
        self.nthread = nthread

        ### timecourse history parameters ###
        # apply the time-delay embedding only when both delay and skip are set
        self.applyPreds = delay is not None and skip is not None
        # how many past time samples to include along with the most recent sample
        self.delay = delay
        # subsample the above samples
        self.skip = skip
        # a custom subsampling scheme can be set here; it overrides the previous params
        self.jump = jump
        # due to RAM limitations, testing data has to be split into 'partsTest' parts
        self.partsTest = partsTest
        # subsampling the input data as an efficient form of regularization
        self.subsample_data = subsample_data
        # used in bagging to set different starting points when subsampling the data
        self.mdlNr = 0
        self.clf = []
    def fit(self, X, y):
        """Fit one XGBoost model per event column."""
        # bagging: each model starts its subsampling at a different offset
        X = X[(self.mdlNr * 5 % self.subsample_data)::self.subsample_data]
        y = y[(self.mdlNr * 5 % self.subsample_data)::self.subsample_data]

        if self.applyPreds:
            # integer division keeps the sample counts integral under subsampling
            if self.jump is not None:
                X = delay_preds(X, delay=self.delay // self.subsample_data,
                                skip=self.skip // self.subsample_data,
                                jump=self.jump // self.subsample_data)
            else:
                X = delay_preds(X, delay=self.delay // self.subsample_data,
                                skip=self.skip // self.subsample_data)

        self.clf = []
        widgets = ['Training : ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                   ' ', ETA(), ' ']
        pbar = ProgressBar(widgets=widgets, maxval=6)
        pbar.start()

        # train a separate model for each of the 6 events
        for col in range(6):
            self.clf.append(xgb.XGBClassifier(n_estimators=self.n_estimators,
                                              max_depth=self.max_depth,
                                              subsample=self.subsample,
                                              nthread=self.nthread))
            self.clf[col].fit(X, y[:, col])
            pbar.update(col)
        return self
    def _predict_proba(self, X):
        """Predict probabilities for each event separately, then stack the results."""
        pred = []
        for col in range(6):
            pred.append(self.clf[col].predict_proba(X)[:, 1])
        pred = np.vstack(pred).transpose()
        return pred
    def predict_proba(self, X):
        """Predict probability."""
        if self.applyPreds:
            p = np.zeros((X.shape[0], 6))
            # process the test data in chunks to stay within RAM limits;
            # chunks overlap by `delay` samples so the embedding stays valid
            # at each chunk boundary
            for part in range(self.partsTest):
                start = part * X.shape[0] // self.partsTest - self.delay * (part > 0)
                stop = (part + 1) * X.shape[0] // self.partsTest
                X_delayed = delay_preds(X[slice(start, stop)], delay=self.delay,
                                        skip=self.skip,
                                        jump=self.jump)[self.delay * (part > 0):]
                start += self.delay * (part > 0)
                p[slice(start, stop)] += self._predict_proba(X_delayed)
                X_delayed = None
            return p
        else:
            return self._predict_proba(X)
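

if __name__ == '__main__':
    # Minimal usage sketch on synthetic stand-in data. `level1_preds` and
    # `labels` are hypothetical arrays invented for illustration, not data
    # shipped with the repository; X is assumed to hold stacked level-1
    # model predictions of shape (n_samples, n_features) and y the 6 binary
    # event columns, with delay_preds returning one row per input sample.
    rng = np.random.RandomState(42)
    level1_preds = rng.rand(20000, 12).astype(np.float32)
    labels = (rng.rand(20000, 6) > 0.9).astype(int)

    clf = XGB(ensemble=None, n_estimators=10, max_depth=3,
              delay=200, skip=40, partsTest=2)
    clf.fit(level1_preds, labels)
    probs = clf.predict_proba(level1_preds)
    print(probs.shape)  # expected: (20000, 6)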