Insights-into-the-I-SPY-c / Git / [1654c6] /ispy1/predictive_statistics

Models:
joseph-gordon/
Insights-into-the-I-SPY-c
Downloads: 1
[1654c6]: / ispy1 / predictive_statistics_or.py
History
Download this file
185 lines (150 with data), 7.0 kB

# quantify the effect of age on Survival
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC

def labels_to_numbers(DataFrame, Variable):
    """Encode one column of a data frame as integer class codes.

    A fresh ``LabelEncoder`` is fitted on the values of
    ``DataFrame[Variable]`` and the encoded values are returned. The encoder
    is discarded, so the code-to-label mapping is not handed back to the
    caller.
    """
    column_values = DataFrame[Variable].values
    return preprocessing.LabelEncoder().fit_transform(column_values)

def plot_roc_curve(fpr, tpr, lw=2, title=''):
    """Draw a ROC curve with its area in the legend and show the figure.

    Parameters
    ----------
    fpr, tpr : x and y coordinates of the curve, passed to ``metrics.auc``
        and to the line plot.
    lw : line width used for both the curve and the diagonal.
    title : figure title.
    """
    area = metrics.auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.2f)' % area

    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()


def TrainRFC(Xdata,ydata):
    """Grid-search a random forest and return the best refitted estimator.

    The exhaustive grid covers ``n_estimators`` in 10, 20, ..., 90 and
    ``max_features`` in 1 .. ``Xdata.shape[1]`` inclusive. Candidates are
    scored with Cohen's kappa using GridSearchCV's default cross-validation,
    and all available cores are used (``n_jobs=-1``).

    Parameters
    ----------
    Xdata : 2-D feature matrix; only ``Xdata.shape[1]`` is read here.
    ydata : target labels passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    n_features = Xdata.shape[1]
    param_grid = {
        "n_estimators": np.arange(10, 100, 10),
        # The stop value is n_features + 1 so that "use every feature" is a
        # candidate and a single-column Xdata still yields a non-empty grid
        # (an empty grid makes GridSearchCV raise).
        "max_features": np.arange(1, n_features + 1, 1),
    }
    search = GridSearchCV(
        RFC(),
        param_grid=param_grid,
        scoring=make_scorer(cohen_kappa_score),
        verbose=1,
        n_jobs=-1,
    )
    search.fit(Xdata, ydata)
    return search.best_estimator_

def TrainLogRegModel_Kappa(Xdata, ydata):
    """Grid-search a logistic regression scored by Cohen's kappa.

    The grid covers ``C`` in 1 .. 10 and ``fit_intercept`` in
    ``{True, False}``, using GridSearchCV's default cross-validation.

    Parameters
    ----------
    Xdata : feature matrix passed to ``GridSearchCV.fit``.
    ydata : target labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` object itself (not ``best_estimator_``).
    """
    param_grid = {
        "C": np.arange(1, 11, 1),
        # Real booleans: the strings "True"/"False" are both truthy, so the
        # no-intercept model was never tried, and recent scikit-learn
        # versions reject non-bool values for this parameter.
        "fit_intercept": [True, False],
    }
    LogRegModel = GridSearchCV(
        LogisticRegression(),
        param_grid=param_grid,
        scoring=make_scorer(cohen_kappa_score),
        verbose=0,
    )
    LogRegModel.fit(Xdata, ydata)
    return LogRegModel


def _LogisticRegression(X, y, title=''):
    """Fit a tuned logistic regression on half the data and score the rest.

    The data are split 50/50 with stratification on ``y``. A model is tuned
    on the training half via ``TrainLogRegModel_Kappa``, and its ROC curve
    on the held-out half is plotted with ``plot_roc_curve``.

    Returns
    -------
    (kappa, auc) : Cohen's kappa between the hard predictions and ``y_test``,
        and the area under the ROC curve built from column 1 of
        ``predict_proba`` (presumably the positive class -- confirm against
        the label encoding used by callers).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.50, stratify=y)

    model = TrainLogRegModel_Kappa(X_train, y_train)

    # ROC curve from the probabilities in the second predict_proba column.
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = metrics.roc_curve(y_test, scores)

    kappa = metrics.cohen_kappa_score(model.predict(X_test), y_test)
    auc = metrics.auc(fpr, tpr)

    plot_roc_curve(fpr, tpr, title=title)
    return kappa, auc

def _RFClassifier(X,y, size_train = 0.50):
    """Train a tuned random forest on a stratified split and report on the rest.

    Parameters
    ----------
    X, y : features and labels, split with ``train_test_split`` stratified
        on ``y``.
    size_train : fraction (or count) of samples used for training, passed
        through as ``train_size``.

    Returns
    -------
    (clf, X_test, y_test) : the estimator returned by ``TrainRFC`` and the
        held-out features and labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=size_train, stratify=y)

    clf = TrainRFC(X_train, y_train)

    # Use the public metrics.classification_report (the private
    # metrics.classification module no longer exists in current
    # scikit-learn) and pass (y_true, y_pred) in that order; swapping them
    # exchanges precision with recall in the printed table.
    print(metrics.classification_report(y_test, clf.predict(X_test)))
    return clf, X_test, y_test

# Plot the feature importances of the forest
def Tree_feature_importances(Forest):
    """Print and plot the feature importances of a fitted tree ensemble.

    Parameters
    ----------
    Forest : fitted ensemble exposing ``feature_importances_`` and
        ``estimators_`` (each member exposing ``feature_importances_``).

    The features are ranked by decreasing importance, the ranking is printed,
    and a bar chart is shown whose error bars are the standard deviation of
    the per-tree importances.
    """
    # Read everything from the argument. The previous body referenced the
    # undefined names `forest` and `X`, so it only worked when those happened
    # to exist as globals.
    importances = Forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in Forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    n_features = importances.shape[0]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, feature in enumerate(indices, start=1):
        print("%d. feature %d (%f): " % (rank, feature, importances[feature]))

    # Plot the feature importances of the forest
    positions = range(n_features)
    plt.figure()
    plt.title("Feature importances")
    plt.bar(positions, importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, n_features])
    plt.show()

import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection as ms
from imblearn import pipeline as pl
from sklearn.model_selection import train_test_split


def validation_curve(Classifier, X, y,parameter_to_optimize, scorer, parameter_range = np.arange(1,5,1), c_v = 3):
    """Run sklearn's validation curve and pick the best parameter value.

    Parameters
    ----------
    Classifier : estimator handed to ``ms.validation_curve``.
    X, y : data passed through unchanged.
    parameter_to_optimize : name of the estimator parameter to vary.
    scorer : value for the ``scoring`` argument.
    parameter_range : candidate values for the parameter.
    c_v : value for the ``cv`` argument.

    Returns
    -------
    (train_scores, test_scores, best_value) where ``best_value`` is the entry
    of ``parameter_range`` whose row of test scores has the highest median.
    """
    curve = ms.validation_curve(
        Classifier, X, y,
        param_name=parameter_to_optimize,
        param_range=parameter_range,
        cv=c_v,
        scoring=scorer,
        n_jobs=1,
    )
    train_scores, test_scores = curve

    # One row per candidate value; rank candidates by their median test score.
    median_per_value = np.median(test_scores, axis=1)
    best_position = np.argmax(median_per_value)

    return train_scores, test_scores, parameter_range[best_position]

def plot_with_errors(ydata, param_range=None):
    """Plot the row-wise mean of ``ydata`` with a +/- one-std shaded band.

    The previous body ignored ``ydata`` and read ``param_range``,
    ``test_scores_mean`` and ``test_scores_std``, none of which are defined
    in this function, so it raised ``NameError`` unless matching globals
    existed. The statistics are now derived from the argument.

    Parameters
    ----------
    ydata : 2-D array-like of scores, reduced along axis 1.
        NOTE(review): assumed to be laid out like the ``test_scores`` matrix
        used by ``plot_validation_curve`` (one row per x position) --
        confirm against callers.
    param_range : optional x coordinates, one per row of ``ydata``. Defaults
        to ``0 .. n_rows - 1``.
    """
    scores = np.asarray(ydata)
    test_scores_mean = np.mean(scores, axis=1)
    test_scores_std = np.std(scores, axis=1)
    if param_range is None:
        param_range = np.arange(test_scores_mean.shape[0])

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    plt.plot(param_range, test_scores_mean, label='mean of metric')
    ax.fill_between(param_range, test_scores_mean + test_scores_std,
                    test_scores_mean - test_scores_std, alpha=0.2)
    plt.show()

def plot_validation_curve(train_scores, test_scores, param_range, xlabel='x', ylabel='y', title =''):
    """Plot mean test score against a parameter with a +/- one-std band.

    The point with the highest mean test score is marked and labelled with
    its mean and standard deviation. Statistics are taken along axis 1 of
    the score arrays.

    Parameters
    ----------
    train_scores : reduced to mean/std here, but those values are not drawn.
    test_scores : scores whose row-wise mean and std are plotted.
    param_range : x coordinates, indexable by integer position.
    xlabel, ylabel, title : axis labels and figure title.
    """
    # Train statistics are computed but never plotted.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)

    mean_test = np.mean(test_scores, axis=1)
    std_test = np.std(test_scores, axis=1)
    upper_band = mean_test + std_test
    lower_band = mean_test - std_test

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    plt.plot(param_range, mean_test, label='mean')
    ax.fill_between(param_range, upper_band, lower_band, alpha=0.2)

    # Mark the parameter value with the best mean test score.
    best = np.argmax(mean_test)
    best_label = r'Cohen Kappa: ${0:.2f}\pm{1:.2f}$'.format(
        mean_test[best], std_test[best])
    plt.scatter(param_range[best], mean_test[best], label=best_label)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Cosmetics: hide the top/right spines and offset the remaining ones.
    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for shifted_side in ('left', 'bottom'):
        ax.spines[shifted_side].set_position(('outward', 10))

    plt.legend(loc="best")
    plt.show()

def classification_report(y_expected, yhat):
    """Print the observed classification report next to a naive baseline.

    First prints ``metrics.classification_report(y_expected, yhat)``. Then
    builds a baseline prediction vector shaped like ``yhat`` in which every
    entry is the most frequent class of ``y_expected`` except the first,
    which is set to the least frequent class, and prints the report for that
    baseline as well.
    """
    rule = 20 * '---'

    #  test performance
    print(rule)
    print('Observed Performance')
    print(rule)
    print(metrics.classification_report(y_expected, yhat))

    # Class frequencies of the expected labels, computed once.
    class_counts = pd.Series(y_expected).value_counts()
    largest_class = class_counts.index[np.argmax(class_counts.values)]
    small_class = class_counts.index[np.argmin(class_counts.values)]

    # Baseline: everything in the majority class, except one observation
    # assigned to the minority class.
    y_hat_crazy = np.zeros_like(yhat)
    y_hat_crazy[:] = largest_class
    y_hat_crazy[0] = small_class
    size = y_hat_crazy.shape[0] - 1

    # How would this look if I predict everything belong to the largest class?
    print(rule)
    print('Performance assuming '+' '+str(size)+' observations belong to the largest class')
    print(rule)
    print(metrics.classification_report(y_expected, y_hat_crazy))