--- /dev/null
+++ b/Radiomics/radiomic_functions.py
@@ -0,0 +1,963 @@
+import numpy as np
+import pandas as pd
+from numpy import interp  # scipy.interp was an alias for numpy.interp and has been removed from SciPy
+import matplotlib.pyplot as plt
+from sklearn import manifold
+from sklearn.metrics import euclidean_distances, roc_curve, roc_auc_score, auc
+from sklearn.model_selection import StratifiedKFold
+from sklearn.decomposition import PCA, KernelPCA
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.preprocessing import PolynomialFeatures, StandardScaler
+from sklearn.svm import SVC
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from io import StringIO  # sklearn.externals.six has been removed from scikit-learn
+from sklearn.neural_network import MLPClassifier
+from sklearn.model_selection import KFold, ShuffleSplit, GridSearchCV
+from sklearn import preprocessing
+import seaborn as sns
+
+# Set up plotting properties
+sns.set_style('whitegrid')
+sns.set_context("paper")
+
+RAND_STATE = 1
+RAND_STATE_CL = 42
+NSPLITS = 5
+
+
+def scale_features(data):
+    """
+    Scale features by their mean and std
+    Args:
+        data (pandas dataframe): dataframe containing only the numeric
+            radiomic values
+
+    Returns:
+        df (pandas dataframe): copy of the input with each column
+            standardized to zero mean and unit variance
+    """
+
+    df = data.copy()
+    for key in data.keys():
+
+        d = data[key]
+
+        d = (d - d.mean()) / d.std()
+
+        df[key] = d
+
+    return df
+
+
+def bar_plot(data):
+
+    plt.figure(figsize=(25, 10))
+    data.iloc[0].plot.bar()
+    plt.show()
+
+
+def svm_cross(x, y):
+
+    # Attempt an SVM classifier
+    plt.close('all')
+    fig = plt.figure()
+
+    cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE)
+
+    clfsvm = SVC(kernel='rbf', gamma=0.01, C=5, probability=True, random_state=RAND_STATE_CL, verbose=0)
+
+    clf = Pipeline([
+        ('scaler', StandardScaler()),
+        ('svm_clf', clfsvm)
+    ])
+
+    tprs = []
+    aucs = []
+    mean_fpr = np.linspace(0, 1, 100)
+
+    # Grid search
+    param = [
+        {
+            "svm_clf__kernel": ["sigmoid", "rbf"],
+            "svm_clf__C": [0.05, 0.1, 0.5, 0.75, 1, 10, 100, 1000],
+            "svm_clf__gamma": [5, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
+        }
+    ]
+
+    grid = GridSearchCV(clf, param_grid=param, n_jobs=4, cv=cv, verbose=0, iid=False)
+    grid.fit(x, y)
+
+    print("\nBest parameters set: ")
+    print(grid.best_params_)
+
+    # Assign best estimator
+    clf = grid.best_estimator_
+
+    i = 0
+    train_score = []
+    test_score = []
+    for train, test in cv.split(x, y):
+
+        print('Size of train set: %d\t\tSize of test set: %d\n' % (len(train), len(test)))
+
+        # Compute probabilities
+        probas_ = clf.fit(x[train], y[train]).predict_proba(x[test])
+
+        # Compute ROC curve and area under the curve
+        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
+        tprs.append(interp(mean_fpr, fpr, tpr))
+        tprs[-1][0] = 0.0
+        roc_auc = auc(fpr, tpr)
+        aucs.append(roc_auc)
+
+        train_score.append(clf.score(x[train], y[train]))
+        test_score.append(clf.score(x[test], y[test]))
+        print("Training set score: %f" % train_score[i])
+        print("Test set score: %f\n\n" % test_score[i])
+
+        i += 1
+
+    print('Mean training score %f' % np.mean(train_score))
+    print('Mean testing score %f' % np.mean(test_score))
+
+    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
+             label='Chance', alpha=.8)
+
+    mean_tpr = np.mean(tprs, axis=0)
+    mean_tpr[-1] = 1.0
+    mean_auc = auc(mean_fpr, mean_tpr)
+    std_auc = np.std(aucs)
+    plt.plot(mean_fpr, mean_tpr, color='b',
+             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
+             lw=2, alpha=.8)
+
+    std_tpr = np.std(tprs, axis=0)
+    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
+    tprs_lower = 
np.maximum(mean_tpr - std_tpr, 0) + plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, + label=r'$\pm$ 1 std. dev.') + + plt.xlim([-0.05, 1.05]) + plt.ylim([-0.05, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.legend(loc="lower right") + + return fig #plt.savefig(sname, dpi=300) + + +def grid_search(clf, x, y, cv, params): + + net = [] + for layers in range(2, 4): + + for nodes in range(20, 110, 20): + cur = [] + for n in range(layers - 1): + + cur.append(nodes) + + cur.append(1) + + net.append(cur) + + # net = [(50, 25, 1), + # (75, 50, 25, 1), + # (10, 25, 10, 1), + # (100, 100, 1), + # (100, 150, 1), + # (100, 50, 1), + # (50, 100, 50, 1), + # (25, 50, 25, 10, 1), + # (10, 20, 20, 10, 1)] + # + # params = {'nn_clf__hidden_layer_sizes': net, #[(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)], + # # 'nn_clf__learning_rate': ['adaptive', 'invscaling'], + # 'nn_clf__learning_rate_init': [1e-4, 1e-3, 1e-2]} + + from sklearn.model_selection import GridSearchCV + from sklearn.metrics import f1_score, make_scorer, roc_auc_score + scorer = make_scorer(roc_auc_score) + # Define grid search + search = GridSearchCV(clf, params, cv=cv, scoring=scorer, n_jobs=-1) + + # Fit + search.fit(x, y) + + cvres = search.cv_results_ + for mean_score, params in zip(cvres["mean_test_score"], cvres['params']): + print(np.sqrt(mean_score), params) + + print('Best params: ', search.best_params_) + print('Best score: ', search.best_score_) + + +def neural_network_cross(x, y): + + plt.close('all') + fig = plt.figure() #figsize = (20, 15)) + + sz = x.shape + + cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE) + + hidden_layer_sizes = (50, 25, ) + clfnn = MLPClassifier(solver='sgd', max_iter=5000, verbose=0, tol=1e-4, alpha=1e-4, + hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=0.001, random_state=RAND_STATE_CL) + + clf = Pipeline([ + ('scaler', StandardScaler()), + ('nn_clf', clfnn) + ]) + + # Grid search + net = [(100, ), + (50, 25, )] + param = [ + { + "nn_clf__solver": ["sgd", "adam"], + "nn_clf__alpha": [1e-5, 1e-4, 1e-3], + "nn_clf__learning_rate_init": [1e-3, 1e-4], + "nn_clf__hidden_layer_sizes": net, + } + ] + + grid = GridSearchCV(clf, param_grid=param, n_jobs=4, cv=cv, verbose=0, iid=False) + grid.fit(x, y) + + print("\nBest parameters set: ") + print(grid.best_params_) + + # Assign best estimator + clf = grid.best_estimator_ + + tprs = [] + aucs = [] + mean_fpr = np.linspace(0, 1, 100) + + train_score = [] + test_score = [] + i = 0 + for train, test in cv.split(x, y): + + print('Size of train set: %d\t\tSize of test set: %d\n' %(len(train), len(test))) + + # Compute probablilities + probas_ = clf.fit(x[train], y[train]).predict_proba(x[test]) + + # Compute ROC curve and area the curve + fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) + tprs.append(interp(mean_fpr, fpr, tpr)) + tprs[-1][0] = 0.0 + roc_auc = auc(fpr, tpr) + aucs.append(roc_auc) + + # Get training scores + train_score.append(clf.score(x[train], y[train])) + test_score.append(clf.score(x[test], y[test])) + print("Training set score: %f" % train_score[i]) + print("Test set score: %f\n\n" % test_score[i]) + + i += 1 + + print('Mean training score %f' % np.mean(train_score)) + print('Mean testing score %f' % np.mean(test_score)) + + plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', + label='Chance', alpha=.8) + + mean_tpr = np.nanmean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.nanstd(aucs) + 
plt.plot(mean_fpr, mean_tpr, color='b', + label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), + lw=2, alpha=.8) + + std_tpr = np.std(tprs, axis=0) + tprs_upper = np.minimum(mean_tpr + std_tpr, 1) + tprs_lower = np.maximum(mean_tpr - std_tpr, 0) + plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, + label=r'$\pm$ 1 std. dev.') + + plt.xlim([-0.05, 1.05]) + plt.ylim([-0.05, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.legend(loc="lower right") + + # Perform a grid search + # net = [(50, 25, 1), + # (75, 50, 25, 1), + # (10, 25, 10, 1), + # (100, 100, 1), + # (100, 150, 1), + # (100, 50, 1), + # (50, 100, 50, 1), + # (25, 50, 25, 10, 1), + # (10, 20, 20, 10, 1)] + # + # params = {'nn_clf__hidden_layer_sizes': net, #[(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)], + # # 'nn_clf__learning_rate': ['adaptive', 'invscaling'], + # 'nn_clf__learning_rate_init': [1e-4, 1e-3, 1e-2]} + # grid_search(clf, x, y, cv, params) + + return fig + + +def svm_classifier(data, inds_1, inds_2): + + # Convert pandas df to array + np.random.seed(42) + array = data.copy() + num_samps = array.shape[0] + num_test = int(len(inds_1) * 0.25) + test_inds = np.arange(0, num_samps) + np.random.shuffle(test_inds) + train_inds = test_inds[num_test:] + test_inds = test_inds[:num_test] + train_inds_array = np.zeros(num_samps, dtype=np.int) + train_inds_array[train_inds] = 1 + + # Create labels + y = np.zeros(shape=(array.shape[0])) + y[inds_2] = 1 + + print('Test distribution: True: %d and False: %d' %(np.sum(y[test_inds]), len(test_inds) - np.sum(y[test_inds]))) + + # Attempt an SVM classifier + from sklearn.svm import SVC + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import PolynomialFeatures, StandardScaler + + # poly_kernel_svm_clf = SVC() + poly_kernel_svm_clf = Pipeline([ + ('scaler', StandardScaler()), + ('svm_clf', SVC(kernel='rbf', probability=True, random_state=RAND_STATE_CL)) + ]) + poly_kernel_svm_clf.fit(array[train_inds_array == 1], y[train_inds_array==1]) + + correct = 0 + for z in train_inds: + ans = poly_kernel_svm_clf.predict([array[z, :]]) + # print('Predicted: %d' % ans) + # print('Actual: %d' % y[z]) + + if ans == y[z]: + correct += 1 + + pcorrect = correct / len(train_inds) + print('\tTraining data fit: %f' % pcorrect) + + correct = 0 + for z in test_inds: + ans = poly_kernel_svm_clf.predict([array[z, :]]) + # print('Predicted: %d' % ans) + # print('Actual: %d' % y[z]) + + if ans == y[z]: + correct += 1 + + pcorrect = correct / len(test_inds) + print('\tTest data fit: %f' % pcorrect) + + # Compute probabilities + # Run classifier with cross-validation and plot ROC curves + plt.close('all') + fig = plt.figure(figsize = (20, 15)) + + cv = StratifiedKFold(n_splits=6) + + tprs = [] + aucs = [] + mean_fpr = np.linspace(0, 1, 100) + + i = 0 + for train, test in cv.split(array, y): + probas_ = poly_kernel_svm_clf.fit(array[train], y[train]).predict_proba(array[test]) + # Compute ROC curve and area the curve + fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) + tprs.append(interp(mean_fpr, fpr, tpr)) + tprs[-1][0] = 0.0 + roc_auc = auc(fpr, tpr) + aucs.append(roc_auc) + plt.plot(fpr, tpr, lw=1, alpha=0.3, + label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) + + i += 1 + plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', + label='Chance', alpha=.8) + + mean_tpr = np.mean(tprs, axis=0) + mean_tpr[-1] = 1.0 + mean_auc = auc(mean_fpr, mean_tpr) + std_auc = np.std(aucs) + plt.plot(mean_fpr, mean_tpr, 
color='b', + label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), + lw=2, alpha=.8) + + std_tpr = np.std(tprs, axis=0) + tprs_upper = np.minimum(mean_tpr + std_tpr, 1) + tprs_lower = np.maximum(mean_tpr - std_tpr, 0) + plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, + label=r'$\pm$ 1 std. dev.') + + plt.xlim([-0.05, 1.05]) + plt.ylim([-0.05, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic example') + plt.legend(loc="lower right") + # plt.savefig('roc.png', dpi=300) + # plt.show() + # fig.close() + + # Plot SVM + if data.shape[1] == 2: + # from sklearn.decomposition import PCA + # pca = PCA(n_components=2).fit(array) + # X = pca.transform(array) + X = data.copy() + + # CT + fig = plt.figure(21) + plt.clf() + plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired, + edgecolor='k', s=20) + + # Circle out the test data + # plt.scatter(lle_CT[:, 0], lle_CT[:, 1], s=80, facecolors='none', + # zorder=10, edgecolor='k') + + plt.axis('tight') + x_min = X[:, 0].min() + x_max = X[:, 0].max() + y_min = X[:, 1].min() + y_max = X[:, 1].max() + XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j] + Z = poly_kernel_svm_clf.decision_function(np.c_[XX.ravel(), YY.ravel()]) + + # Put the result into a color plot + Z = Z.reshape(XX.shape) + plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) + plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], + linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) + plt.title('CT: SVM on 2 Principal Components') + # plt.savefig('Figures/LLE_CT_SVM_classifier.png') + # plt.show() + + else: + fig = [] + + return fig, poly_kernel_svm_clf + + +def decision_tree_classifier(data, inds_1, inds_2, num_test, max_depth, export): + + # Convert pandas df to array + array = data.copy() + num_samps = array.shape[0] + # num_test = + test_inds = np.arange(0, num_samps) + np.random.shuffle(test_inds) + train_inds = test_inds[num_test:] + test_inds = test_inds[:num_test] + train_inds_array = np.zeros(num_samps, dtype=np.int) + train_inds_array[train_inds] = 1 + + # Create labels + y = np.zeros(shape=(array.shape[0]), dtype=np.int) + y[inds_2] = 1 + + # Set up tree classifier + clf = DecisionTreeClassifier(max_depth=max_depth) + clf.fit(array[train_inds_array == 1], y[train_inds_array == 1]) + + # Evaluate the fit of the training data + ans = clf.predict_proba(array[train_inds_array == 1]) + cor = y[train_inds_array == 1] + + train_correct = 0 + for z in range(len(cor)): + a = np.argmax(ans[z]) + + if a == cor[z]: + train_correct += 1 + + print('\tTraining data fit: %f' % (train_correct / sum(train_inds_array == + 1))) + + # Evaluate the fit of the test data + test_correct = 0 + for z in test_inds: + ans = clf.predict([array[z, :]]) + # print('Predicted: %d' % ans) + # print('Actual: %d' % y[z]) + + if ans == y[z]: + test_correct += 1 + + pcorrect = test_correct / len(test_inds) + print('\tTest data fit: %f' % pcorrect) + + # # Export decision tree graph + # if export: + # dot_data = StringIO() + # export_graphviz(clf, out_file=dot_data) + # graph = pydot.graph_from_dot_data(dot_data.getvalue()) + # graph.write_pdf("Figures/decisiontree.pdf") + + return clf + + +def random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth): + + array = data.copy() + num_samps = array.shape[0] + # num_test = + test_inds = np.arange(0, num_samps) + np.random.shuffle(test_inds) + train_inds = test_inds[num_test:] + test_inds = test_inds[:num_test] + train_inds_array = 
np.zeros(num_samps, dtype=np.int)
+    train_inds_array[train_inds] = 1
+
+    # Create labels
+    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
+    y[inds_2] = 1
+
+    # Set up random forest classifier
+    clf = RandomForestClassifier(n_estimators=n_estimators,
+                                 max_depth=max_depth,
+                                 n_jobs=-1)
+    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])
+
+    # Evaluate the fit of the training data
+    ans = clf.predict_proba(array[train_inds_array == 1])
+    cor = y[train_inds_array == 1]
+
+    train_correct = 0
+    for z in range(len(cor)):
+        a = np.argmax(ans[z])
+
+        if a == cor[z]:
+            train_correct += 1
+
+    print('\tTraining data fit: %f' %
+          (train_correct / sum(train_inds_array == 1)))
+
+    # Evaluate the fit of the test data
+    test_correct = 0
+    for z in test_inds:
+        ans = clf.predict([array[z, :]])
+        # print('Predicted: %d' % ans)
+        # print('Actual: %d' % y[z])
+
+        if ans == y[z]:
+            test_correct += 1
+
+    pcorrect = test_correct / len(test_inds)
+    print('\tTest data fit: %f' % pcorrect)
+
+    return clf
+
+
+def gb_random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth):
+
+    array = data.copy()
+    num_samps = array.shape[0]
+    # num_test =
+    test_inds = np.arange(0, num_samps)
+    np.random.shuffle(test_inds)
+    train_inds = test_inds[num_test:]
+    test_inds = test_inds[:num_test]
+    train_inds_array = np.zeros(num_samps, dtype=np.int)
+    train_inds_array[train_inds] = 1
+
+    # Create labels
+    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
+    y[inds_2] = 1
+
+    # Set up gradient boosting classifier
+    clf = GradientBoostingClassifier(n_estimators=n_estimators,
+                                     max_depth=max_depth,
+                                     learning_rate=1.0)
+    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])
+
+    # Evaluate the fit of the training data
+    ans = clf.predict_proba(array[train_inds_array == 1])
+    cor = y[train_inds_array == 1]
+
+    train_correct = 0
+    for z in range(len(cor)):
+        a = np.argmax(ans[z])
+
+        if a == cor[z]:
+            train_correct += 1
+
+    print('\tTraining data fit: %f' %
+          (train_correct / sum(train_inds_array == 1)))
+
+    # Evaluate the fit of the test data
+    test_correct = 0
+    for z in test_inds:
+        ans = clf.predict([array[z, :]])
+        # print('Predicted: %d' % ans)
+        # print('Actual: %d' % y[z])
+
+        if ans == y[z]:
+            test_correct += 1
+
+    pcorrect = test_correct / len(test_inds)
+    print('\tTest data fit: %f' % pcorrect)
+
+    return clf
+
+
+def adb_random_forest(data, inds_1, inds_2, num_test, n_estimators, max_depth):
+
+    array = data.copy()
+    num_samps = array.shape[0]
+    # num_test =
+    test_inds = np.arange(0, num_samps)
+    np.random.shuffle(test_inds)
+    train_inds = test_inds[num_test:]
+    test_inds = test_inds[:num_test]
+    train_inds_array = np.zeros(num_samps, dtype=np.int)
+    train_inds_array[train_inds] = 1
+
+    # Create labels
+    y = np.zeros(shape=(array.shape[0]), dtype=np.int)
+    y[inds_2] = 1
+
+    # Set up AdaBoost classifier; AdaBoostClassifier has no max_depth argument,
+    # so the tree depth is controlled through the base estimator
+    clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth),
+                             n_estimators=n_estimators,
+                             learning_rate=1.0)
+    clf.fit(array[train_inds_array == 1], y[train_inds_array == 1])
+
+    # Evaluate the fit of the training data
+    ans = clf.predict_proba(array[train_inds_array == 1])
+    cor = y[train_inds_array == 1]
+
+    train_correct = 0
+    for z in range(len(cor)):
+        a = np.argmax(ans[z])
+
+        if a == cor[z]:
+            train_correct += 1
+
+    print('\tTraining data fit: %f' %
+          (train_correct / sum(train_inds_array == 1)))
+
+    # Evaluate the fit of the test data
+    test_correct = 0
+    for z in test_inds:
+        ans = clf.predict([array[z, :]])
+        # print('Predicted: %d' % ans)
+        # print('Actual: %d' % y[z])
+
+        if ans == y[z]:
+            test_correct += 1
+
+    pcorrect = test_correct / len(test_inds)
+    print('\tTest data fit: %f' % pcorrect)
+
+    return clf
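+
+
+# Illustrative sketch (not called anywhere in this module): the three ensemble
+# helpers above share the same signature, so a quick side-by-side run might
+# look like the following, where `features` is a numeric array of radiomic
+# features and `inds_0` / `inds_1` are assumed index lists for the two groups.
+#
+#     forest = random_forest(features, inds_0, inds_1, num_test=8,
+#                            n_estimators=50, max_depth=4)
+#     boosted = gb_random_forest(features, inds_0, inds_1, num_test=8,
+#                                n_estimators=50, max_depth=4)
+#     adaboost = adb_random_forest(features, inds_0, inds_1, num_test=8,
+#                                  n_estimators=50, max_depth=2)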
+
+
+def local_linear_embedding(data, n_components=2, n_neighbors=5):
+
+    from sklearn.manifold import LocallyLinearEmbedding, locally_linear_embedding
+    # lle = LocallyLinearEmbedding(n_components=n_components,
+    #                              n_neighbors=n_neighbors)
+
+    [lle, er] = locally_linear_embedding(data, n_neighbors=n_neighbors, n_components=n_components, method='modified')
+
+    return lle
+
+
+def plot_euclidean_distances(rad_data, inds_1, inds_2):
+    # Convert to numpy array
+    if not isinstance(rad_data, np.ndarray):
+        array = rad_data.values
+    else:
+        array = rad_data.copy()
+
+    similarities = euclidean_distances(array)
+
+    similarities = np.nan_to_num(similarities)
+
+    seed = np.random.RandomState(seed=3)
+
+    mds = manifold.MDS(n_components=2, max_iter=5000, eps=1e-12,
+                       random_state=seed,
+                       n_init=2,
+                       dissimilarity="precomputed", n_jobs=1, metric=False)
+    pos = mds.fit_transform(similarities)
+
+    from matplotlib.collections import LineCollection
+    import matplotlib.cm as cm
+
+    fig = plt.figure(1)
+    ax = plt.axes([0., 0., 1., 1.])
+
+    s = 100
+
+    plt.scatter(pos[inds_1, 0], pos[inds_1, 1], color='navy', alpha=1.0, s=s,
+                lw=5,
+                label='0Gy')
+    plt.scatter(pos[inds_2, 0], pos[inds_2, 1], color='turquoise', alpha=1.0,
+                s=s,
+                lw=2, label='20Gy')
+
+    plt.legend(scatterpoints=1, loc=1, shadow=False)
+
+    # plt.show()
+
+    return fig
+
+
+def pca_analysis(data, threshold=0.95):
+
+    n_pcomps = min(data.shape)
+
+    pca = PCA(n_components=n_pcomps)
+    pca_dat = pca.fit(data.T)
+
+    # Get components
+    # comps = pca_dat.components_
+    # print(len(comps))
+
+    # Variance of the data along each principal dimension
+    cumsum = np.cumsum(pca_dat.explained_variance_ratio_)
+
+    # Find the number of components to keep
+    d = np.argmax(cumsum >= threshold)
+    print('Use %d principal components to preserve %0.2f of the data '
+          'variance' % (d, threshold))
+
+    fig = plt.figure()
+
+    line1, = plt.plot(list(range(0, n_pcomps + 1)), [0] + list(cumsum), '-*',
+                      label='CT - 0Gy')
+    line2, = plt.plot([0, n_pcomps+1], [threshold, threshold], '--k')
+    line3, = plt.plot([d, d], [0, 1], '--k')
+    line4, = plt.plot(d, threshold, '.k')
+
+    plt.xlabel('Number of principal components')
+    plt.title('Cumulative sum of variance')
+    plt.ylim([0, 1])
+    plt.xlim([0, n_pcomps])
+    # plt.legend(loc=4)
+    # plt.savefig('Figures/Cumulative Variance')
+    # plt.show()
+
+    return d, fig
+
+
+def pca_reduction(data, npcomps):
+
+    pca = PCA(n_components=npcomps)
+    # pca = KernelPCA(n_components=npcomps)
+    return pca.fit_transform(data)
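+
+
+# Illustrative sketch (not part of the pipeline itself): pca_analysis and
+# pca_reduction can be chained -- pick the number of components that preserves
+# the requested variance, then project the data onto them. `scaled` is an
+# assumed feature matrix, e.g. the output of scale_features().
+#
+#     n_comps, _ = pca_analysis(scaled, threshold=0.95)
+#     reduced = pca_reduction(scaled, n_comps)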
+
+
+def heatmap(data, inds_1, inds_2):
+
+    set1 = data.iloc[inds_1].corr()
+
+    f, ax = plt.subplots(figsize=(15, 10))
+    # Draw the heatmap using seaborn
+    sns.heatmap(set1, vmax=.8, square=True)
+    plt.title('Set1')
+    plt.yticks(rotation=0)
+    ax.get_xaxis().set_ticks([])
+    plt.xticks(rotation=90)
+
+    plt.show()
+
+    set2 = data.iloc[inds_2].corr()
+
+    f, ax = plt.subplots(figsize=(15, 10))
+    # Draw the heatmap using seaborn
+    sns.heatmap(set2, vmax=.8, square=True)
+    plt.title('Set2')
+    plt.yticks(rotation=0)
+    ax.get_xaxis().set_ticks([])
+    plt.xticks(rotation=90)
+
+    plt.show()
+
+    set3 = set1 - set2
+
+    f, ax = plt.subplots(figsize=(15, 10))
+    # Draw the heatmap using seaborn
+    sns.heatmap(set3, vmax=.8, square=True)
+    plt.title('Diff')
+    plt.yticks(rotation=0)
+    ax.get_xaxis().set_ticks([])
+    plt.xticks(rotation=90)
+
+    plt.show()
+
+
+def concate_contrasts(data):
+    """
+    Takes a dataframe containing radiomics data for all animals and contrasts.
+    Returns a dataframe with contrasts of the same animal concatenated together.
+    Args:
+        data (pandas dataframe): radiomics features with 'Image' and 'ID' columns
+
+    Returns:
+        df_cat (pandas dataframe): one row per animal and timepoint with the
+            T1, T1C and T2 features concatenated
+    """
+
+    # Get file names and before/after classification
+    filenames = data['Image']
+    inds_1 = [i for (i, name) in enumerate(filenames) if '_1.nii' in name]
+    inds_2 = [i for (i, name) in enumerate(filenames) if '_2.nii' in name]
+
+    # Turn multiple contrast images into a single vector of data
+    unique_ids = np.unique(data['ID'])
+    keys = data.keys()
+    cat_cols = [i + '_T1' for i in keys] + \
+               [i + '_T1C' for i in keys] + \
+               [i + '_T2' for i in keys]
+    df_cat = pd.DataFrame(columns=cat_cols, index=range(2*len(unique_ids)))
+
+    ind = 0
+    for (i, id) in enumerate(unique_ids):
+
+        tmp = data[data['ID'] == id]
+
+        # Make pre RT data
+        tmp_pre = tmp[tmp.index < min(inds_2)]
+        tmp_post = tmp[tmp.index >= min(inds_2)]
+
+        # Append data into larger array
+        df_cat.iloc[ind] = [n for n in tmp_pre.values.ravel()]
+        ind += 1
+
+        df_cat.iloc[ind] = [n for n in tmp_post.values.ravel()]
+        ind += 1
+
+    return df_cat
+
+
+def nn_classifier():
+    hidden_layer_sizes = (50, 25, )
+    clf = MLPClassifier(solver='sgd', max_iter=5000, verbose=0, tol=1e-4, alpha=1e-4,
+                        hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=0.001, random_state=RAND_STATE_CL)
+    return clf
+
+
+def svm_classifier_simple():
+    # Plain RBF-kernel SVC factory (distinct from the cross-validated
+    # svm_classifier(data, inds_1, inds_2) defined above)
+    return SVC(kernel='rbf', gamma=0.01, C=5, probability=True, random_state=RAND_STATE_CL, verbose=0)
+
+
+def rnd_forest_classifier():
+
+    clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=4, verbose=0)
+
+    # Candidate grid for grid_search(); not returned here
+    params = {'clf__max_depth': [2, 3, 4],
+              'clf__n_estimators': [6, 8, 10, 12]}
+
+    return clf
+
+
+def lasso_classifier():
+
+    from sklearn.linear_model import Lasso
+
+    # Note: Lasso is a regressor without predict_proba, so it cannot be passed
+    # to classifier() without an additional wrapper
+    clf = Lasso()
+
+    return clf
+
+
+def RBM_classifier():
+
+    from sklearn.neural_network import BernoulliRBM
+
+    # Note: BernoulliRBM is an unsupervised feature extractor, not a classifier
+    clf = BernoulliRBM()
+
+    return clf
+
+
+def classifier(x, y, clfer):
+
+    plt.close('all')
+    fig = plt.figure()  # figsize=(20, 15))
+
+    sz = x.shape
+
+    cv = StratifiedKFold(n_splits=NSPLITS, random_state=RAND_STATE)
+
+    clf = Pipeline([
+        ('scaler', StandardScaler()),
+        ('clf', clfer)])
+
+    tprs = []
+    aucs = []
+    mean_fpr = np.linspace(0, 1, 100)
+
+    for train, test in cv.split(x, y):
+
+        # print('Size of train set: %d\t\tSize of test set: %d\n' %(len(train), len(test)))
+
+        probas_ = clf.fit(x[train], y[train]).predict_proba(x[test])
+
+        # Compute ROC curve and area under the curve
+        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
+        tprs.append(interp(mean_fpr, fpr, tpr))
+        tprs[-1][0] = 0.0
+        roc_auc = auc(fpr, tpr)
+        aucs.append(roc_auc)
+
+        # print("Training set score: %f" % clf.score(x[train], y[train]))
+        # print("Test set score: %f\n\n" % clf.score(x[test], y[test]))
+
+    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
+             label='Chance', alpha=.8)
+
+    mean_tpr = np.nanmean(tprs, axis=0)
+    mean_tpr[-1] = 1.0
+    mean_auc = auc(mean_fpr, mean_tpr)
+    std_auc = np.nanstd(aucs)
+    plt.plot(mean_fpr, mean_tpr, color='b',
+             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
+             lw=2, alpha=.8)
+
+    std_tpr = np.std(tprs, axis=0)
+    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
+    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
+    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
+                     label=r'$\pm$ 1 std. dev.')
+
+    plt.xlim([-0.05, 1.05])
+    plt.ylim([-0.05, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.legend(loc="lower right")
+
+    # Perform a grid search
+    # net = [(50, 25, 1),
+    #        (75, 50, 25, 1),
+    #        (10, 25, 10, 1),
+    #        (100, 100, 1),
+    #        (100, 150, 1),
+    #        (100, 50, 1),
+    #        (50, 100, 50, 1),
+    #        (25, 50, 25, 10, 1),
+    #        (10, 20, 20, 10, 1)]
+    #
+    # params = {'clf__hidden_layer_sizes': net,  # [(100, 100, 1), (50, 50, 1), (25, 25, 1), (10, 10, 1)],
+    #           # 'nn_clf__learning_rate': ['adaptive', 'invscaling'],
+    #           'clf__learning_rate_init': [1e-4, 1e-3, 1e-2]}
+    # grid_search(clf, x, y, cv, params)
+
+    print('Mean AUC: {:0.2f}'.format(mean_auc))
+    print('Std AUC: {:0.2f}'.format(std_auc))
+
+    return fig, mean_auc, std_auc
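+
+
+if __name__ == "__main__":
+    # Minimal usage sketch, assuming a CSV of scaled-ready radiomic features
+    # with a binary (0/1) 'Label' column; the file name and column name are
+    # placeholders for illustration, not part of the analysis pipeline above.
+    example = pd.read_csv('radiomic_features.csv')
+    labels = example.pop('Label').values
+    features = scale_features(example).values
+
+    # Cross-validated ROC curves for a default random forest factory
+    fig, mean_auc, std_auc = classifier(features, labels, rnd_forest_classifier())
+    fig.savefig('rf_roc.png', dpi=300)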