--- a
+++ b/ispy1/predictive_statistics.py
@@ -0,0 +1,255 @@
+from sklearn import preprocessing, metrics, linear_model
+from sklearn.model_selection import train_test_split, GridSearchCV
+from imblearn import over_sampling
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.ensemble import RandomForestClassifier as RFC
+import numpy as np
+import matplotlib.pyplot as plt
+
+RANDOM_STATE = 42  # for reproducibility
+
+
+def labels_to_numbers(DataFrame, Variable):
+    '''Encode the categorical column `Variable` of `DataFrame` as integer labels.'''
+    le = preprocessing.LabelEncoder()
+    return le.fit_transform(DataFrame[Variable].values)
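+
+# Example usage (hypothetical DataFrame df with a categorical column 'response'):
+#   df['response_numeric'] = labels_to_numbers(df, 'response')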
+
+
+def binary_classifier_metrics(classifier, Xtest, Ytest):
+    '''Print a classification report and return (auc, kappa, fpr, tpr) for the test set.'''
+    # hard class predictions and class-1 probabilities
+    predicted_class = classifier.predict(Xtest)
+    probability = classifier.predict_proba(Xtest)
+
+    kappa = np.round(metrics.cohen_kappa_score(Ytest, predicted_class), 3)
+
+    # AUC is computed from the predicted probabilities, not from the hard labels
+    auc = np.round(metrics.roc_auc_score(Ytest, probability[:, 1]), 3)
+
+    # ROC curve
+    fpr, tpr, _ = metrics.roc_curve(Ytest, probability[:, 1], drop_intermediate=False)
+
+    # report
+    print(metrics.classification_report(Ytest, predicted_class))
+    print('The estimated Cohen kappa is ' + str(kappa))
+    print('The estimated AUC is ' + str(auc))
+    print('==' * 30)
+    print('\n' * 2)
+
+    return auc, kappa, fpr, tpr
+
+
+def split_data(Xdata, Ydata, oversample, K_neighbors=4):
+    '''70/30 train/test split; optionally oversample the training set with ADASYN.'''
+    X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata,
+                                                        train_size=0.70,
+                                                        random_state=RANDOM_STATE)
+    if oversample:
+        print('Training data was oversampled using the ADASYN method')
+        adasyn = over_sampling.ADASYN(random_state=RANDOM_STATE, n_neighbors=K_neighbors)
+        # resample only the training set so the test set stays untouched
+        # (fit_resample was named fit_sample in imblearn < 0.4)
+        X_train, y_train = adasyn.fit_resample(X_train, y_train)
+
+    return X_train, X_test, y_train, y_test
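+
+# Example usage (hypothetical feature matrix X and binary labels y):
+#   X_train, X_test, y_train, y_test = split_data(X, y, oversample=True, K_neighbors=4)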
+
+
+# logistic regression with optional ADASYN oversampling
+def Logistic_Regression(Xdata, Ydata, oversample=False, K_neighbors=4):
+    '''
+    Perform logistic regression, tuning C, penalty, and fit_intercept to maximize
+    Cohen's kappa (range: -1 to 1, where 1 is perfect agreement).
+    '''
+
+    # split data
+    X_train, X_test, y_train, y_test = split_data(Xdata, Ydata, oversample, K_neighbors)
+
+    # train and tune parameters using GridSearchCV
+    pars = dict(C=np.arange(.01, 100, .1),
+                penalty=['l2', 'l1'],
+                fit_intercept=[True, False])
+
+    # the liblinear solver supports both the l1 and l2 penalties
+    grid = GridSearchCV(linear_model.LogisticRegression(solver='liblinear'),
+                        param_grid=pars,
+                        scoring=metrics.make_scorer(metrics.cohen_kappa_score),
+                        cv=5, verbose=0, n_jobs=-1)
+
+    # fit
+    grid.fit(X_train, y_train)
+
+    # metrics
+    auc, kappa, fpr, tpr = binary_classifier_metrics(grid, X_test, y_test)
+
+    # output
+    return auc, kappa, fpr, tpr
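+
+# Example usage (hypothetical X, y):
+#   auc, kappa, fpr, tpr = Logistic_Regression(X, y, oversample=False)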
+
+# Random Forest Classifier
+def RandomForest_Classifier(Xdata, Ydata, oversample=False, K_neighbors=4, calibrate_prob=True):
+
+    # split data
+    X_train, X_test, y_train, y_test = split_data(Xdata, Ydata, oversample, K_neighbors)
+
+    # define parameter grid search
+    pars = dict(n_estimators=np.arange(1, 10, 1),
+                max_features=np.arange(1, Xdata.shape[1], 1),
+                max_depth=[None, 1, 2, 3, 4, 5])
+
+    # perform grid search
+    grid = GridSearchCV(RFC(random_state=RANDOM_STATE), param_grid=pars,
+                        scoring=metrics.make_scorer(metrics.cohen_kappa_score),
+                        cv=3, verbose=0, n_jobs=-1)
+
+    # fit
+    grid.fit(X_train, y_train)
+
+    # optionally calibrate the predicted probabilities of the best classifier
+    # (the estimator keyword was named base_estimator in sklearn < 1.2)
+    if calibrate_prob:
+        clf = CalibratedClassifierCV(estimator=grid.best_estimator_, method='sigmoid', cv=3)
+        clf.fit(X_train, y_train)
+    else:
+        clf = grid
+
+    # metrics
+    auc, kappa, fpr, tpr = binary_classifier_metrics(clf, X_test, y_test)
+
+    # output
+    return auc, kappa, fpr, tpr, grid.best_estimator_
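+
+# Example usage (hypothetical X, y):
+#   auc, kappa, fpr, tpr, best_forest = RandomForest_Classifier(X, y, oversample=True)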
+
+def plot_forest_feature_importances_(forest, features_legend, title=''):
+    '''Horizontal bar plot of a fitted forest's relative feature importances.'''
+    importances = forest.feature_importances_
+    importances = importances / np.max(importances)
+    sorted_index = np.argsort(importances)
+
+    x = range(len(importances))
+
+    plt.figure()
+    plt.barh(x, importances[sorted_index], color='b', align='center')
+
+    # reorder the labels so they match the sorted bars
+    plt.yticks(x, np.asarray(features_legend)[sorted_index])
+    plt.title(title)
+    plt.xlabel('RELATIVE IMPORTANCE')
+    plt.ylabel('PREDICTOR')
+    plt.show()
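+
+# Example usage (with the fitted forest returned by RandomForest_Classifier and a
+# list of feature names):
+#   plot_forest_feature_importances_(best_forest, feature_names, title='Feature importances')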
+
+
+def plot_compare_roc(fpr1_, tpr1_, fpr2_, tpr2_, auc1, auc2, title=''):
+    '''Overlay the ROC curves of the unbalanced and oversampled classifiers.'''
+    plt.figure()
+    plt.plot(fpr1_, tpr1_, fpr2_, tpr2_)
+    plt.legend(['Unbalanced | AUC = ' + str(auc1), 'Oversampled | AUC = ' + str(auc2)])
+    plt.xlabel('False-positive rate')
+    plt.ylabel('True-positive rate')
+    plt.title(title)
+
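+# Example usage (ROC outputs from an unbalanced and an oversampled run):
+#   auc1, kappa1, fpr1, tpr1 = Logistic_Regression(X, y, oversample=False)
+#   auc2, kappa2, fpr2, tpr2 = Logistic_Regression(X, y, oversample=True)
+#   plot_compare_roc(fpr1, tpr1, fpr2, tpr2, auc1, auc2, title='Logistic regression')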
+
+
+##  ============== Continuous Outcomes  ============== ##
+
+# metric: median absolute error (robust to outliers)
+mae = metrics.median_absolute_error
+
+def mae_report(Ytest, Yhat, outcome_):
+    error = mae(Ytest, Yhat)
+    error = np.round(error, decimals=3)
+    # report
+    print('==' * 40)
+    print('The median absolute error for the testing set of ' + outcome_ + ' is: ' + str(error))
+    print('==' * 40)
+
+import seaborn as sns  # seaborn.apionly was removed in seaborn >= 0.9
+
+def train_test_report(predictor, Xtrain, Ytrain, Xtest, Ytest, outcome):
+    # train
+    predictor.fit(Xtrain, Ytrain)
+    # test
+    Yhat = predictor.predict(Xtest)
+    # report
+    mae_report(Ytest, Yhat, outcome)
+
+    # observed vs. predicted values on the training set
+    ax = sns.regplot(x=Ytrain, y=predictor.predict(Xtrain))
+    ax.set_ylabel('Predicted ' + outcome)
+    ax.set_xlabel('Observed ' + outcome)
+
+# lsq
+import statsmodels.api as sm
+
+def lsq(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # train (note: sm.OLS does not add an intercept; use sm.add_constant if one is needed)
+    OLS = sm.OLS(Ytrain, Xtrain).fit()
+    print(OLS.summary())
+    # test
+    Yhat = OLS.predict(Xtest)
+    # report
+    mae_report(Ytest, Yhat, outcome)
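+
+# Example usage (hypothetical train/test split and outcome name):
+#   lsq(X_train, y_train, X_test, y_test, outcome='outcome name')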
+
+# SVR (GridSearchCV is already imported at the top of the module)
+from sklearn.svm import SVR
+
+# GridSearchCV utility
+def gridsearch(regressor, grid):
+    # median absolute error is a loss, so the scorer must be minimized
+    # (make_scorer defaults to greater_is_better=True, which would select the worst model)
+    optimized_regressor = GridSearchCV(regressor,
+                                       param_grid=grid,
+                                       cv=3, verbose=0, n_jobs=-1,
+                                       scoring=metrics.make_scorer(metrics.median_absolute_error,
+                                                                   greater_is_better=False))
+
+    return optimized_regressor
+
+
+
+def svr(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor
+    regressor = SVR()
+    # define parameter grid (large: ~300,000 combinations; coarsen the ranges for a faster search)
+    grid = dict(kernel=['rbf', 'linear', 'sigmoid'],
+                C=np.arange(1, 11, .1),
+                epsilon=np.arange(1, 11, .1),
+                gamma=np.linspace(1/10, 10, 10))
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
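+
+# Example usage (hypothetical train/test split):
+#   svr_model = svr(X_train, y_train, X_test, y_test, outcome='outcome name')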
+
+
+# ElasticNet
+from sklearn.linear_model import ElasticNet as ENet
+
+def ElasticNet(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor
+    regressor = ENet(max_iter=5000)
+    # define parameter grid search
+    grid = dict(alpha=np.arange(1, 20, .5), l1_ratio=np.arange(.1, 1, .05))
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
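+
+# Example usage (hypothetical train/test split):
+#   enet_model = ElasticNet(X_train, y_train, X_test, y_test, outcome='outcome name')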
+
+# Random Forest Regressor
+from sklearn.ensemble import RandomForestRegressor as RFR
+
+def RandomForestRegressor(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
+    # define regressor ('squared_error' replaced the 'mse' criterion in sklearn >= 1.0)
+    regressor = RFR(criterion='squared_error', random_state=RANDOM_STATE)
+
+    # number of candidate features
+    num_features = Xtrain.shape[1]
+
+    # define parameter grid search
+    grid = dict(n_estimators=np.arange(5, 100, 5),
+                max_features=np.arange(1, num_features, 1),
+                max_depth=[None, 1, 2, 3, 4, 5])
+
+    # perform grid search
+    grid_search = gridsearch(regressor, grid)
+
+    # train, test, and report
+    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
+
+    return grid_search
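+
+# Example usage (hypothetical train/test split):
+#   rfr_model = RandomForestRegressor(X_train, y_train, X_test, y_test, outcome='outcome name')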