MRI_Segmentation_Radiomic / Git / [7c5f70] /Radiomics/radiomic

Models:
SCallahan/
MRI_Segmentation_Radiomic
Downloads: 1
[7c5f70]: / Radiomics / radiomic_functions.py
History
Download this file
963 lines (715 with data), 28.2 kB

import numpy as np
import pandas as pd
from scipy import interp
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.metrics import euclidean_distances, roc_curve, roc_auc_score, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.externals.six import StringIO
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, ShuffleSplit, GridSearchCV
from sklearn import preprocessing
import seaborn as sns

# Set up plotting properties
sns.set_style('whitegrid')
sns.set_context("paper")

RAND_STATE = 1
RAND_STATE_CL = 42
NSPLITS = 5


def scale_features(data):
    """
    Scale features by their mean and std
    Args:
        data (pandas dataframe): dataframe containing only the numeric
            radiomic values

    Returns:

    """

    df = data.copy()
    for key in data.keys():

        d = data[key]

        d = (d - d.mean()) / d.std()

        df[key] = d

    return df


def bar_plot(data):

    plt.figure(figsize=(25, 10)), data.iloc[0].plot.bar(), plt.show()


def svm_cross(x, y):

    # Attempt an SVM classifier
    plt.close('all')
    fig = plt.figure()

    cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE)

    clfsvm = SVC(kernel='rbf', gamma=0.01, C=5, probability=True, random_state=RAND_STATE_CL, verbose=0)

    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('svm_clf', clfsvm)
        ])

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Grid search
    param = [
        {
            "svm_clf__kernel": ["sigmoid", "rbf"],
            "svm_clf__C": [0.05, 0.1, 0.5, 0.75, 1, 10, 100, 1000],
            "svm_clf__gamma": [5, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
        }
    ]

    grid = GridSearchCV(clf, param_grid=param, n_jobs=4, cv=cv, verbose=0, iid=False)
    grid.fit(x, y)

    print("\nBest parameters set: ")
    print(grid.best_params_)

    # Assign best estimator
    clf = grid.best_estimator_

    i = 0
    train_score = []
    test_score = []
    for train, test in cv.split(x, y):

        print('Size of train set: %d\t\tSize of test set: %d\n' %(len(train), len(test)))

        # Compute probabilities
        probas_ = clf.fit(x[train], y[train]).predict_proba(x[test])

        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        train_score.append(clf.score(x[train], y[train]))
        test_score.append(clf.score(x[test], y[test]))
        print("Training set score: %f" % train_score[i])
        print("Test set score: %f\n\n" % test_score[i])

        i += 1

    print('Mean training score %f' % np.mean(train_score))
    print('Mean testing  score %f' % np.mean(test_score))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
             label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    return fig  #plt.savefig(sname, dpi=300)


def grid_search(clf, x, y, cv, params):

    net = []
    for layers in range(2, 4):

        for nodes in range(20, 110, 20):
            cur = []
            for n in range(layers - 1):

                cur.append(nodes)

            cur.append(1)

            net.append(cur)

    # net = [(50, 25, 1),
    #        (75, 50, 25, 1),
    #        (10, 25, 10, 1),
    #        (100, 100, 1),
    #        (100, 150, 1),
    #        (100, 50, 1),
    #        (50, 100, 50, 1),
    #        (25, 50, 25, 10, 1),
    #        (10, 20, 20, 10, 1)]
    # 
    # params = {'nn_clf__hidden_layer_sizes': net, #[(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)],
    #           # 'nn_clf__learning_rate': ['adaptive', 'invscaling'],
    #           'nn_clf__learning_rate_init': [1e-4, 1e-3, 1e-2]}

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import f1_score, make_scorer, roc_auc_score
    scorer = make_scorer(roc_auc_score)
    # Define grid search
    search = GridSearchCV(clf, params, cv=cv, scoring=scorer, n_jobs=-1)

    # Fit
    search.fit(x, y)

    cvres = search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres['params']):
        print(np.sqrt(mean_score), params)

    print('Best params: ', search.best_params_)
    print('Best score: ', search.best_score_)


def neural_network_cross(x, y):

    plt.close('all')
    fig = plt.figure() #figsize = (20, 15))

    sz = x.shape

    cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE)

    hidden_layer_sizes = (50, 25, )
    clfnn = MLPClassifier(solver='sgd', max_iter=5000, verbose=0, tol=1e-4, alpha=1e-4,
                        hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=0.001, random_state=RAND_STATE_CL)

    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('nn_clf', clfnn)
                    ])

    # Grid search
    net = [(100, ),
           (50, 25, )]
    param = [
        {
            "nn_clf__solver": ["sgd", "adam"],
            "nn_clf__alpha": [1e-5, 1e-4, 1e-3],
            "nn_clf__learning_rate_init": [1e-3, 1e-4],
            "nn_clf__hidden_layer_sizes": net,
        }
    ]

    grid = GridSearchCV(clf, param_grid=param, n_jobs=4, cv=cv, verbose=0, iid=False)
    grid.fit(x, y)

    print("\nBest parameters set: ")
    print(grid.best_params_)

    # Assign best estimator
    clf = grid.best_estimator_

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    train_score = []
    test_score = []
    i = 0
    for train, test in cv.split(x, y):

        print('Size of train set: %d\t\tSize of test set: %d\n' %(len(train), len(test)))

        # Compute probablilities
        probas_ = clf.fit(x[train], y[train]).predict_proba(x[test])

        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        # Get training scores
        train_score.append(clf.score(x[train], y[train]))
        test_score.append(clf.score(x[test], y[test]))
        print("Training set score: %f" % train_score[i])
        print("Test set score: %f\n\n" % test_score[i])

        i += 1

    print('Mean training score %f' % np.mean(train_score))
    print('Mean testing  score %f' % np.mean(test_score))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
             label='Chance', alpha=.8)

    mean_tpr = np.nanmean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.nanstd(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    # Perform a grid search
    # net = [(50, 25, 1),
    #        (75, 50, 25, 1),
    #        (10, 25, 10, 1),
    #        (100, 100, 1),
    #        (100, 150, 1),
    #        (100, 50, 1),
    #        (50, 100, 50, 1),
    #        (25, 50, 25, 10, 1),
    #        (10, 20, 20, 10, 1)]
    #
    # params = {'nn_clf__hidden_layer_sizes': net, #[(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)],
    #           # 'nn_clf__learning_rate': ['adaptive', 'invscaling'],
    #           'nn_clf__learning_rate_init': [1e-4, 1e-3, 1e-2]}
    # grid_search(clf, x, y, cv, params)

    return fig


def svm_classifier(data, inds_1, inds_2):

    # Convert pandas df to array
    np.random.seed(42)
    array = data.copy()
    num_samps = array.shape[0]
    num_test = int(len(inds_1) * 0.25)
    test_inds = np.arange(0, num_samps)
    np.random.shuffle(test_inds)
    train_inds = test_inds[num_test:]
    test_inds = test_inds[:num_test]
    train_inds_array = np.zeros(num_samps, dtype=np.int)
    train_inds_array[train_inds] = 1

    # Create labels
    y = np.zeros(shape=(array.shape[0]))
    y[inds_2] = 1

    print('Test distribution: True: %d and False: %d' %(np.sum(y[test_inds]), len(test_inds) - np.sum(y[test_inds])))

    # Attempt an SVM classifier
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    # poly_kernel_svm_clf = SVC()
    poly_kernel_svm_clf = Pipeline([
        ('scaler', StandardScaler()),
        ('svm_clf', SVC(kernel='rbf', probability=True, random_state=RAND_STATE_CL))
        ])
    poly_kernel_svm_clf.fit(array[train_inds_array == 1], y[train_inds_array==1])

    correct = 0
    for z in train_inds:
        ans = poly_kernel_svm_clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            correct += 1

    pcorrect = correct / len(train_inds)
    print('\tTraining data fit: %f' % pcorrect)

    correct = 0
    for z in test_inds:
        ans = poly_kernel_svm_clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            correct += 1

    pcorrect = correct / len(test_inds)
    print('\tTest data fit: %f' % pcorrect)

    # Compute probabilities
    # Run classifier with cross-validation and plot ROC curves
    plt.close('all')
    fig = plt.figure(figsize = (20, 15))

    cv = StratifiedKFold(n_splits=6)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train, test in cv.split(array, y):
        probas_ = poly_kernel_svm_clf.fit(array[train], y[train]).predict_proba(array[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

        i += 1
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    # plt.savefig('roc.png', dpi=300)
    # plt.show()
    # fig.close()

    # Plot SVM
    if data.shape[1] == 2:
        # from sklearn.decomposition import PCA
        # pca = PCA(n_components=2).fit(array)
        # X = pca.transform(array)
        X = data.copy()

        # CT
        fig = plt.figure(21)
        plt.clf()
        plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
                    edgecolor='k', s=20)

        # Circle out the test data
        # plt.scatter(lle_CT[:, 0], lle_CT[:, 1], s=80, facecolors='none',
        #             zorder=10, edgecolor='k')

        plt.axis('tight')
        x_min = X[:, 0].min()
        x_max = X[:, 0].max()
        y_min = X[:, 1].min()
        y_max = X[:, 1].max()
        XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
        Z = poly_kernel_svm_clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(XX.shape)
        plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
        plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
                    linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
        plt.title('CT: SVM on 2 Principal Components')
        # plt.savefig('Figures/LLE_CT_SVM_classifier.png')
        # plt.show()

    else:
        fig = []

    return fig, poly_kernel_svm_clf


def decision_tree_classifier(data, inds_1, inds_2, num_test, max_depth, export):

    # Convert pandas df to array
    array = data.copy()
    num_samps = array.shape[0]
    # num_test =
    test_inds = np.arange(0, num_samps)
    np.random.shuffle(test_inds)
    train_inds = test_inds[num_test:]
    test_inds = test_inds[:num_test]
    train_inds_array = np.zeros(num_samps, dtype=np.int)
    train_inds_array[train_inds] = 1

    # Create labels
    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
    y[inds_2] = 1

    # Set up tree classifier
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])

    # Evaluate the fit of the training data
    ans = clf.predict_proba(array[train_inds_array == 1])
    cor = y[train_inds_array == 1]

    train_correct = 0
    for z in range(len(cor)):
        a = np.argmax(ans[z])

        if a == cor[z]:
            train_correct += 1

    print('\tTraining data fit: %f' % (train_correct / sum(train_inds_array ==
                                                          1)))

    # Evaluate the fit of the test data
    test_correct = 0
    for z in test_inds:
        ans = clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            test_correct += 1

    pcorrect = test_correct / len(test_inds)
    print('\tTest data fit: %f' % pcorrect)

    # # Export decision tree graph
    # if export:
    #     dot_data = StringIO()
    #     export_graphviz(clf, out_file=dot_data)
    #     graph = pydot.graph_from_dot_data(dot_data.getvalue())
    #     graph.write_pdf("Figures/decisiontree.pdf")

    return clf


def random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth):

    array = data.copy()
    num_samps = array.shape[0]
    # num_test =
    test_inds = np.arange(0, num_samps)
    np.random.shuffle(test_inds)
    train_inds = test_inds[num_test:]
    test_inds = test_inds[:num_test]
    train_inds_array = np.zeros(num_samps, dtype=np.int)
    train_inds_array[train_inds] = 1

    # Create labels
    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
    y[inds_2] = 1

    # Set up random forest classifier
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 n_jobs=-1)
    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])

    # Evaluate the fit of the training data
    ans = clf.predict_proba(array[train_inds_array == 1])
    cor = y[train_inds_array == 1]

    train_correct = 0
    for z in range(len(cor)):
        a = np.argmax(ans[z])

        if a == cor[z]:
            train_correct += 1

    print('\tTraining data fit: %f' %
          (train_correct / sum(train_inds_array == 1)))

    # Evaluate the fit of the test data
    test_correct = 0
    for z in test_inds:
        ans = clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            test_correct += 1

    pcorrect = test_correct / len(test_inds)
    print('\tTest data fit: %f' % pcorrect)

    return clf


def gb_random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth):

    array = data.copy()
    num_samps = array.shape[0]
    # num_test =
    test_inds = np.arange(0, num_samps)
    np.random.shuffle(test_inds)
    train_inds = test_inds[num_test:]
    test_inds = test_inds[:num_test]
    train_inds_array = np.zeros(num_samps, dtype=np.int)
    train_inds_array[train_inds] = 1

    # Create labels
    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
    y[inds_2] = 1

    # Set up random forest classifier
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     learning_rate=1.0)
    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])

    # Evaluate the fit of the training data
    ans = clf.predict_proba(array[train_inds_array == 1])
    cor = y[train_inds_array == 1]

    train_correct = 0
    for z in range(len(cor)):
        a = np.argmax(ans[z])

        if a == cor[z]:
            train_correct += 1

    print('\tTraining data fit: %f' %
          (train_correct / sum(train_inds_array == 1)))

    # Evaluate the fit of the test data
    test_correct = 0
    for z in test_inds:
        ans = clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            test_correct += 1

    pcorrect = test_correct / len(test_inds)
    print('\tTest data fit: %f' % pcorrect)

    return clf


def adb_random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth):

    array = data.copy()
    num_samps = array.shape[0]
    # num_test =
    test_inds = np.arange(0, num_samps)
    np.random.shuffle(test_inds)
    train_inds = test_inds[num_test:]
    test_inds = test_inds[:num_test]
    train_inds_array = np.zeros(num_samps, dtype=np.int)
    train_inds_array[train_inds] = 1

    # Create labels
    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
    y[inds_2] = 1

    # Set up random forest classifier
    clf = AdaBoostClassifier(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     learning_rate=1.0)
    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])

    # Evaluate the fit of the training data
    ans = clf.predict_proba(array[train_inds_array == 1])
    cor = y[train_inds_array == 1]

    train_correct = 0
    for z in range(len(cor)):
        a = np.argmax(ans[z])

        if a == cor[z]:
            train_correct += 1

    print('\tTraining data fit: %f' %
          (train_correct / sum(train_inds_array == 1)))

    # Evaluate the fit of the test data
    test_correct = 0
    for z in test_inds:
        ans = clf.predict([array[z, :]])
        # print('Predicted: %d' % ans)
        # print('Actual: %d' % y[z])

        if ans == y[z]:
            test_correct += 1

    pcorrect = test_correct / len(test_inds)
    print('\tTest data fit: %f' % pcorrect)

    return clf


def local_linear_embedding(data, n_components=2, n_neighbors=5):

    from sklearn.manifold import LocallyLinearEmbedding, locally_linear_embedding
    # lle = LocallyLinearEmbedding(n_components=n_components,
    #                              n_neighbors=n_neighbors)

    [lle, er] = locally_linear_embedding(data, n_neighbors=n_neighbors, n_components=n_components, method='modified')

    return lle


def plot_euclidean_distances(rad_data, inds_1, inds_2):
    # Convert to numpy array
    if type(rad_data) != type(np.array([])):
        array = rad_data.get_values()
    else:
        array = rad_data.copy()

    similarities = euclidean_distances(array)

    similarities = np.nan_to_num(similarities)

    seed = np.random.RandomState(seed=3)

    mds = manifold.MDS(n_components=2, max_iter=5000, eps=1e-12,
                       random_state=seed,
                       n_init=2,
                       dissimilarity="precomputed", n_jobs=1, metric=False)
    pos = mds.fit_transform(similarities)

    from matplotlib.collections import LineCollection
    import matplotlib.cm as cm

    fig = plt.figure(1)
    ax = plt.axes([0., 0., 1., 1.])

    s = 100

    plt.scatter(pos[inds_1, 0], pos[inds_1, 1], color='navy', alpha=1.0, s=s,
                lw=5,
                label='0Gy')
    plt.scatter(pos[inds_2, 0], pos[inds_2, 1], color='turquoise', alpha=1.0,
                s=s,
                lw=2, label='20Gy')

    plt.legend(scatterpoints=1, loc=1, shadow=False)

    # plt.show()

    return fig


def pca_analyis(data, threshold=0.95):

    n_pcomps = min(data.shape)

    pca = PCA(n_components=n_pcomps)
    pca_dat = pca.fit(data.T)

    # Get components
    # comps = pca_dat.components_
    # print(len(comps))

    # Variance of the data along eac principal dimension
    cumsum = np.cumsum(pca_dat.explained_variance_ratio_)

    # Find the number of components to keep
    d = np.argmax(cumsum >= threshold)
    print('Use %d principal components to preserve %0.2f of the data '
          'variance' % (d, threshold))

    fig = plt.figure()

    line1, = plt.plot(list(range(0, n_pcomps + 1)), [0] + list(cumsum), '-*',
                      label='CT - 0Gy')
    line2, = plt.plot([0, n_pcomps+1], [threshold, threshold], '--k')
    line3, = plt.plot([d, d], [0, 1], '--k')
    line4, = plt.plot(d, threshold, '.k')

    plt.xlabel('Number of principal components')
    plt.title('Cumulative sum of variance')
    plt.ylim([0, 1])
    plt.xlim([0, n_pcomps])
    # plt.legend(loc=4)
    # plt.savefig('Figures/Cumulative Variance')
    # plt.show()

    return d, fig


def pca_reduction(data, npcomps):

    pca = PCA(n_components=npcomps)
    # pca = KernelPCA(n_components=npcomps)
    return pca.fit_transform(data)


def heatmap(data, inds_1, inds_2):

    train_inds_array = np.zeros(data.shape[1], dtype=np.int)
    train_inds_array[inds_1] = 1
    set1 = data.iloc[inds_1].corr()

    f, ax = plt.subplots(figsize=(15, 10))
    # Draw the heatmap using seaborn
    sns.heatmap(set1, vmax=.8, square=True)
    plt.title('Set1')
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)

    plt.show()

    set2 = data.iloc[inds_2].corr()

    f, ax = plt.subplots(figsize=(15, 10))
    # Draw the heatmap using seaborn
    sns.heatmap(set2, vmax=.8, square=True)
    plt.title('Set2')
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)

    plt.show()

    set3 = set1 - set2

    f, ax = plt.subplots(figsize=(15, 10))
    # Draw the heatmap using seaborn
    sns.heatmap(set3, vmax=.8, square=True)
    plt.title('Diff')
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)

    plt.show()


def concate_contrasts(data):
    """
    Takes a dataframe containing radiomcis data for all animals and contrasta.
    Returns a dataframe with contrasts of the same animal concatenated together.
    Args:
        data:

    Returns:

    """

    # Get file names and before/after classification
    filenames = data['Image']
    inds_1 = [i for (i, name) in enumerate(filenames) if '_1.nii' in name]
    inds_2 = [i for (i, name) in enumerate(filenames) if '_2.nii' in name]

    # Turn multiple contrast images into a single vector of data
    unique_ids = np.unique(data['ID'])
    keys = data.keys()
    cat_cols = [i + '_T1' for i in keys] + \
               [i + '_T1C' for i in keys] + \
               [i + '_T2' for i in keys]
    df_cat = pd.DataFrame(columns=cat_cols, index=range(2*len(unique_ids)))

    ind = 0
    for (i, id) in enumerate(unique_ids):

        tmp = data[data['ID'] == id]

        # Make pre RT data
        tmp_pre = tmp[tmp.index < min(inds_2)]
        tmp_post = tmp[tmp.index >= min(inds_2)]

        # Append data into larger array
        df_cat.iloc[ind] = [n for n in tmp_pre.get_values().ravel()]
        ind += 1

        df_cat.iloc[ind] = [n for n in tmp_post.get_values().ravel()]
        ind += 1

    return df_cat


def nn_classifier():
    hidden_layer_sizes = (50, 25, )
    clf = MLPClassifier(solver='sgd', max_iter=5000, verbose=0, tol=1e-4, alpha=1e-4,
                        hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=0.001, random_state=RAND_STATE_CL)
    return clf


def svm_classifier():
    return SVC(kernel='rbf', gamma=0.01, C=5, probability=True, random_state=RAND_STATE_CL, verbose=0)


def rnd_forest_classifier():

    clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=4, verbose=0)

    params = {'clf__max_depth': [2, 3, 4],
              'clf__n_estmiators': [6, 8, 10, 12]}

    return clf


def lasso_classifier():

    from sklearn.linear_model import Lasso

    clf = Lasso()

    return clf


def RBM_classifier():

    from sklearn.neural_network import BernoulliRBM

    clf = BernoulliRBM()

    return clf


def classifier(x, y, clfer):

    plt.close('all')
    fig = plt.figure() #figsize = (20, 15))

    sz = x.shape

    cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE)

    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', clfer)])

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    for train, test in cv.split(x, y):

        # print('Size of train set: %d\t\tSize of test set: %d\n' %(len(train), len(test)))

        probas_ = clf.fit(x[train], y[train]).predict_proba(x[test])

        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        # print("Training set score: %f" % clf.score(x[train], y[train]))
        # print("Test set score: %f\n\n" % clf.score(x[test], y[test]))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
             label='Chance', alpha=.8)

    mean_tpr = np.nanmean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.nanstd(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    # Perform a grid search
    # net = [(50, 25, 1),
    #        (75, 50, 25, 1),
    #        (10, 25, 10, 1),
    #        (100, 100, 1),
    #        (100, 150, 1),
    #        (100, 50, 1),
    #        (50, 100, 50, 1),
    #        (25, 50, 25, 10, 1),
    #        (10, 20, 20, 10, 1)]
    #
    # params = {'clf__hidden_layer_sizes': net, #[(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)],
    #           # 'nn_clf__learning_rate': ['adaptive', 'invscaling'],
    #           'clf__learning_rate_init': [1e-4, 1e-3, 1e-2]}
    # grid_search(clf, x, y, cv, params)

    print('Mean AUC: {:0.2f}'.format(mean_auc))
    print('Std  AUC: {:0.2f}'.format(std_auc))

    return fig, mean_auc, std_auc