--- a
+++ b/ispy1/predictive_statistics.py
@@ -0,0 +1,255 @@
+from sklearn import preprocessing
+from sklearn import metrics, linear_model
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.ensemble import RandomForestClassifier as RFC
+from sklearn.ensemble import RandomForestRegressor as RFR
+from sklearn.linear_model import ElasticNet as ENet
+from sklearn.svm import SVR
+from imblearn import over_sampling
+import statsmodels.api as sm
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
+
+RANDOM_STATE = 42  # for reproducibility
+
+
+def labels_to_numbers(DataFrame, Variable):
+    '''Encode the categorical column `Variable` of `DataFrame` as integers.'''
+    le = preprocessing.LabelEncoder()
+    numbers_ = le.fit_transform(DataFrame[Variable].values)
+    return numbers_
+
+
+def binary_classifier_metrics(classifier, Xtrain, Ytrain, Xtest, Ytest):
+    # predicted labels and class probabilities on the test set
+    predicted_class = classifier.predict(Xtest)
+    probability = classifier.predict_proba(Xtest)
+
+    kappa = metrics.cohen_kappa_score(Ytest, predicted_class)
+    kappa = np.round(kappa, 3)
+
+    # AUC is computed from the predicted probabilities, not the hard labels
+    auc = metrics.roc_auc_score(Ytest, probability[:, 1])
+    auc = np.round(auc, 3)
+
+    # ROC curve
+    fpr, tpr, _ = metrics.roc_curve(Ytest, probability[:, 1], drop_intermediate=False)
+
+    # report
+    print(metrics.classification_report(Ytest, predicted_class))
+    print('The estimated Cohen kappa is ' + str(kappa))
+    print('The estimated AUC is ' + str(auc))
+    print('==' * 30)
+    print('\n' * 2)
+
+    return auc, kappa, fpr, tpr
+
+
+def split_data(Xdata, Ydata, oversample, K_neighbors=4):
+    X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
+                                                        train_size=0.70,
+                                                        random_state=RANDOM_STATE)
+    if oversample:
+        # oversample the training set only, so the test set stays untouched
+        print('Data was oversampled using the ADASYN method')
+        adasyn = over_sampling.ADASYN(random_state=RANDOM_STATE, n_neighbors=K_neighbors)
+        X_train, y_train = adasyn.fit_resample(X_train, y_train)
+
+    return X_train, X_test, y_train, y_test
+
+
+# Logistic Regression, with optional oversampling to correct for class imbalance
+def Logistic_Regression(Xdata, Ydata, oversample=False, K_neighbors=4):
+    '''
+    Perform Logistic Regression, optimizing C, penalty, and fit_intercept to
+    maximize Cohen kappa (range -1 to 1; 1 means perfect agreement).
+    '''
+    # split data
+    X_train, X_test, y_train, y_test = split_data(Xdata, Ydata, oversample, K_neighbors)
+
+    # train and tune parameters using GridSearchCV
+    pars = dict(C=np.arange(.01, 100, .1),
+                penalty=['l2', 'l1'],
+                fit_intercept=[True, False])
+
+    grid = GridSearchCV(linear_model.LogisticRegression(), param_grid=pars,
+                        scoring=metrics.make_scorer(metrics.cohen_kappa_score),
+                        cv=5, verbose=0, n_jobs=-1)
+
+    # fit
+    grid.fit(X_train, y_train)
+
+    # metrics
+    auc, kappa, fpr, tpr = binary_classifier_metrics(grid, X_train, y_train, X_test, y_test)
+
+    # output
+    return auc, kappa, fpr, tpr
+
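+# Usage sketch (not part of the original module): the DataFrame `df` and its
+# columns 'age', 'MRI_volume', and 'PCR' are hypothetical stand-ins; any
+# numeric feature matrix and binary outcome column should work the same way.
+#
+#   import pandas as pd
+#   df = pd.read_csv('clinical_data.csv')            # hypothetical file
+#   X = df[['age', 'MRI_volume']].values
+#   y = labels_to_numbers(df, 'PCR')                 # encode labels as integers
+#   auc, kappa, fpr, tpr = Logistic_Regression(X, y, oversample=True)
+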
+# Random Forest Classifier
+def RandomForest_Classifier(Xdata, Ydata, oversample=False, K_neighbors=4, calibrate_prob=True):
+
+    # split data
+    X_train, X_test, y_train, y_test = split_data(Xdata, Ydata, oversample, K_neighbors)
+
+    # define parameter grid search
+    pars = dict(n_estimators=np.arange(1, 10, 1),
+                max_features=np.arange(1, Xdata.shape[1], 1),
+                max_depth=[None, 1, 2, 3, 4, 5])
+
+    # perform grid search
+    grid = GridSearchCV(RFC(random_state=RANDOM_STATE), param_grid=pars,
+                        scoring=metrics.make_scorer(metrics.cohen_kappa_score),
+                        cv=3, verbose=0, n_jobs=-1)
+
+    # fit
+    grid.fit(X_train, y_train)
+
+    # optionally calibrate the probabilities of the best classifier
+    if calibrate_prob:
+        classifier = CalibratedClassifierCV(base_estimator=grid.best_estimator_, method='sigmoid', cv=3)
+        classifier.fit(X_train, y_train)
+    else:
+        classifier = grid
+
+    # metrics
+    auc, kappa, fpr, tpr = binary_classifier_metrics(classifier, X_train, y_train, X_test, y_test)
+
+    # output
+    return auc, kappa, fpr, tpr, grid.best_estimator_
+
+
+def plot_forest_feature_importances_(forest, features_legend, title=''):
+    importances = forest.feature_importances_
+    importances = importances / np.max(importances)
+    sorted_index = np.argsort(importances)
+
+    x = range(len(importances))
+
+    plt.figure()
+    plt.barh(x, importances[sorted_index], color="b", align="center")
+
+    # label each bar with the name of the feature it belongs to
+    plt.yticks(x, np.asarray(features_legend)[sorted_index])
+    plt.title(title)
+    plt.xlabel('RELATIVE IMPORTANCE')
+    plt.ylabel('PREDICTOR')
+    plt.show()
+
+
+def plot_compare_roc(fpr1_, tpr1_, fpr2_, tpr2_, auc1, auc2, title=''):
+    plt.figure()
+    plt.plot(fpr1_, tpr1_, fpr2_, tpr2_)
+    plt.legend(['Unbalanced | AUC = ' + str(auc1), 'Oversampled | AUC = ' + str(auc2)])
+    plt.xlabel('False-positive rate')
+    plt.ylabel('True-positive rate')
+    plt.title(title)
+
+
+## ============== Continuous Outcomes ============== ##
+
+# metrics
+mae = metrics.median_absolute_error
+
+
+def mae_report(Ytest, Yhat, outcome_):
+    error = mae(Ytest, Yhat)
+    error = np.round(error, decimals=3)
+    # report
+    print('==' * 40)
+    print('The median absolute error for the testing data set of ' + outcome_ + ' is: ' + str(error))
+    print('==' * 40)
+
+
+def train_test_report(predictor, Xtrain, Ytrain, Xtest, Ytest, outcome):
+    # train
+    predictor.fit(Xtrain, Ytrain)
+    # test
+    Yhat = predictor.predict(Xtest)
+    # report
+    mae_report(Ytest, Yhat, outcome)
+
+    # observed vs. predicted on the training set
+    ax = sns.regplot(x=Ytrain, y=predictor.predict(Xtrain))
+    ax.set_ylabel('Predicted ' + outcome)
+    ax.set_xlabel('Observed ' + outcome)
+
+
+# ordinary least squares
+def lsq(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # train (note: statsmodels' OLS does not add an intercept automatically)
+    OLS = sm.OLS(Ytrain, Xtrain).fit()
+    print(OLS.summary())
+    # test
+    Yhat = OLS.predict(Xtest)
+    # report
+    mae_report(Ytest, Yhat, outcome)
+
+
+# GridSearchCV utility; median absolute error is a loss, so it must be minimized
+def gridsearch(regressor, grid):
+    optimized_regressor = GridSearchCV(regressor,
+                                       param_grid=grid,
+                                       cv=3, verbose=0, n_jobs=-1,
+                                       scoring=metrics.make_scorer(metrics.median_absolute_error,
+                                                                   greater_is_better=False))
+    return optimized_regressor
+
+
+# Support Vector Regression
+def svr(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor
+    regressor = SVR()
+    # define parameter grid search
+    grid = dict(kernel=['rbf', 'linear', 'sigmoid'],
+                C=np.arange(1, 11, .1),
+                epsilon=np.arange(1, 11, .1),
+                gamma=np.linspace(1 / 10, 10, 10))
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
+
+
+# ElasticNet
+def ElasticNet(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor
+    regressor = ENet(max_iter=5000)
+    # define parameter grid search
+    grid = dict(alpha=np.arange(1, 20, .5), l1_ratio=np.arange(.1, 1, .05))
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
+
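+# Usage sketch for the continuous-outcome helpers (assumptions: `X_train`,
+# `y_train`, `X_test`, `y_test` come from split_data() above with
+# oversample=False, and 'tumor volume' is a hypothetical outcome name):
+#
+#   svr_fit = svr(X_train, y_train, X_test, y_test, outcome='tumor volume')
+#   enet_fit = ElasticNet(X_train, y_train, X_test, y_test, outcome='tumor volume')
+#   print(enet_fit.best_params_)                     # inspect the winning grid point
+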
+# Random Forest Regressor
+def RandomForestRegressor(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor
+    regressor = RFR(criterion='mse', random_state=RANDOM_STATE)
+
+    # number of available predictors
+    num_features = Xtrain.shape[1]
+
+    # define parameter grid search
+    grid = dict(n_estimators=np.arange(5, 100, 5),
+                max_features=np.arange(1, num_features, 1),
+                max_depth=[None, 1, 2, 3, 4, 5])
+
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
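+
+# Usage sketch combining the random-forest wrapper with the importance plot
+# (feature names are hypothetical). RandomForestRegressor() returns the fitted
+# GridSearchCV object; its best_estimator_ exposes feature_importances_.
+#
+#   rf_fit = RandomForestRegressor(X_train, y_train, X_test, y_test, outcome='SUV')
+#   plot_forest_feature_importances_(rf_fit.best_estimator_,
+#                                    ['age', 'MRI_volume'],
+#                                    title='Random forest predictors')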