Diff of /prostate_cancer.py [000000] .. [742cad]

--- a
+++ b/prostate_cancer.py
@@ -0,0 +1,292 @@
+import pandas as pd
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import MinMaxScaler
+from collections import Counter
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
+from sklearn import svm                                  
+
+from sklearn.decomposition import PCA, KernelPCA
+from sklearn.ensemble import IsolationForest, RandomForestClassifier
+
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import KFold
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
+from mlxtend.feature_selection import SequentialFeatureSelector as SFS
+
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import cross_val_predict
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+from sklearn.naive_bayes import GaussianNB
+from statistics import mean
+from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
+
+from imblearn.over_sampling import SMOTE
+from imblearn.combine import SMOTEENN
+
+# Base model used by the sequential feature selectors below
+MODEL = OneVsRestClassifier(svm.SVC(kernel='linear'))
+MODEL_LDA = LinearDiscriminantAnalysis()
+MODEL_NB = GaussianNB()
+MODEL_RF = RandomForestClassifier()
+
+def load_genomic_data(filename):
+    # Load the genomic data
+    genomic_df = pd.read_csv(filename)
+
+    # Set the index to the ID column; otherwise pandas will add its own index during the transpose
+    genomic_df.set_index('ID', inplace=True)
+    
+    # Transpose so that genes/features become columns and each row represents one patient
+    genomic_df = genomic_df.T
+    
+    # Remove features that are zero for all patients
+    return genomic_df.loc[:, (genomic_df != 0).any(axis=0)]
+    
+def read_data(gfilename, cfilename):
+    # Feature set: load the genomic data
+    X = load_genomic_data(gfilename)
+    
+    # Load the clinical data
+    clinical_df = pd.read_csv(cfilename)
+    print("Shape of genomic data:", X.shape, "and shape of clinical data:", clinical_df.shape,
+          "- it looks like we do not have genomic data for 5 patients, so they are removed")
+    clinical_df = clinical_df.drop(labels=[213, 227, 297, 371, 469], axis=0)
+    print("After dropping the 5 patients with missing genomic data:\nShape of genomic data:", X.shape, "and shape of clinical data:", clinical_df.shape, "\n")
+    
+    print("-- Checking if all patient ID's in genetic data set and clinical dataset matches\n")
+    if(X.index.all() == clinical_df['PATIENT_ID'].all()):
+        print("-- Yes, patient ID's in genetic data set and clinical dataset matches\n")
+    else:
+        print("Nope, patient ID's in genetic data set and clinical dataset do not match")
+    
+    y = clinical_df['GLEASON_SCORE']
+    
+    return X, y
+
+
+def visualize_data(X, y, title):
+    # Visualize the dataset for outliers; PCA is applied prior to LDA to prevent overfitting (https://stats.stackexchange.com/q/109810)
+    pca = PCA(n_components=10)
+    pca_reduced_data = pca.fit_transform(X,y)
+    
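+    # LDA yields at most n_classes - 1 = 4 discriminant axes here; keep 2 for a 2-D scatter plot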
+    lda = LinearDiscriminantAnalysis(n_components = 2)
+    pca_lda_reduced_data = lda.fit_transform(pca_reduced_data, y)
+    
+    # NOTE: Gleason score ranges from 6-10
+    label = [6, 7, 8, 9, 10]
+    colors = ['red','green','blue','purple','pink']
+    
+    fig = plt.figure(figsize=(6,6))
+    plt.scatter(pca_lda_reduced_data[:,0], pca_lda_reduced_data[:,1], c=y, cmap=matplotlib.colors.ListedColormap(colors), alpha=0.7)
+    plt.title(title)
+    
+    
+def prepare_inputs(X_train, X_test):
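+    # Scale features to [0, 1]; fit on the training split only so that no test-set statistics leak into the transform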
+    scaler = MinMaxScaler()
+    X_train_norm = scaler.fit_transform(X_train)
+    X_test_norm = scaler.transform(X_test)             
+    return X_train_norm, X_test_norm
+
+def remove_low_variance_feature(X):
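+    # With the default threshold=0.0, VarianceThreshold drops only features that are constant across all samples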
+    sel = VarianceThreshold()
+    return sel.fit_transform(X)
+    
+    
+def filter_feature_selection(X_train_norm, y_train, X_test_norm, score_function):
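+    # Filter-style selection: keep the k=300 highest-scoring features; note that chi2 requires
+    # non-negative inputs, which the MinMaxScaler upstream guarantees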
+    best_k = SelectKBest(score_func=score_function, k=300)
+    fit = best_k.fit(X_train_norm, y_train)
+    X_train_fs = fit.transform(X_train_norm)
+    X_test_fs = fit.transform(X_test_norm)
+    
+    # DO NOT REMOVE
+    # dfscores = pd.DataFrame(fit.scores_)
+    # dfcolumns = pd.DataFrame(X_train.columns)
+    # featureScores = pd.concat([dfcolumns, dfscores], axis=1)
+    # featureScores.columns = ['Features','Score']
+    # best = featureScores.nlargest(40,'Score')
+    # for f in best['Features']:
+    #     featureVoting[f] = featureVoting.get(f, 0) + 1 
+    #best.plot(x='Features', y="Score", kind="bar")
+    return X_train_fs, X_test_fs
+
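+# Wrapper-style selection: SFS greedily adds one feature at a time, scoring each candidate subset with 5-fold CV accuracy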
+def forward_feature_selection(X_train_fs, y_train, X_test_fs):
+    sfs = SFS(MODEL,
+              k_features=20,
+              forward=True,
+              floating=False,
+              verbose=2,
+              scoring='accuracy',
+              cv=5)
+    fit = sfs.fit(X_train_fs, y_train)
+    print(fit)
+    X_train_wfs = fit.transform(X_train_fs)
+    X_test_wfs = fit.transform(X_test_fs)
+    
+    best = sfs.k_feature_names_     # to get the final set of features
+    #print(best)
+
+    return X_train_wfs, X_test_wfs
+
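+# Same wrapper idea in reverse: start from all features and greedily remove the least useful one at a time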
+def backward_feature_selection(X_train_fs, y_train, X_test_fs):
+    sbs = SFS(MODEL,
+              k_features=20,
+              forward=False,
+              floating=False,
+              verbose=2,
+              scoring='accuracy',
+              cv=5)
+    fit = sbs.fit(X_train_fs, y_train)
+    #print(fit)
+    X_train_wfs = fit.transform(X_train_fs)
+    X_test_wfs = fit.transform(X_test_fs)
+    
+    best = sbs.k_feature_names_     # to get the final set of features
+    #print(best)
+
+    return X_train_wfs, X_test_wfs
+
+
+def get_performance_measures(model, X_train, X_test, y_train, y_test, Accuracy_list):
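+    # Wrap the base model in a one-vs-rest scheme and report per-class accuracy; Accuracy_list accumulates scores across calls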
+    model = OneVsRestClassifier(model).fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+
+    # Accuracy_list.append(accuracy_score(y_test, y_pred))
+
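+    # OneVsRestClassifier stores its fitted LabelBinarizer as label_binarizer_; transforming y_test yields one
+    # binary (sparse) column per class, transposed so that row i pairs with estimators_[i] (classes start at Gleason 6)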
+    yT = model.label_binarizer_.transform(y_test).toarray().T
+    # Iterate through all L classifiers
+    print("------------------------------")
+    for i, (classifier, is_ith_class) in enumerate(zip(model.estimators_, yT)):
+        print("Accuracy of", 6+i, "vs Rest: ", round(classifier.score(X_test, is_ith_class), 4))
+        Accuracy_list[i] += classifier.score(X_test, is_ith_class)
+    print('\n\n')    
+
+
+# Currently not in use
+def dimensionality_reduction(X, y):
+    pca = KernelPCA(kernel='rbf')
+    #pca_reduced_data = pca.fit_transform(X,y)
+    X_transformed = pca.fit_transform(X)
+    return X_transformed
+    #lda = LinearDiscriminantAnalysis()
+    #pca_lda_data = lda.fit_transform(pca_reduced_data,y)
+
+
+X, y = read_data('prad_tcga_genes.csv', 'prad_tcga_clinical_data.csv')
+
+# Resample the dataset, since the class distribution is imbalanced
+sme = SMOTEENN(random_state=42,smote=SMOTE(random_state=42, k_neighbors=1))
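+# k_neighbors=1 presumably because the rarest Gleason classes have very few samples; SMOTE oversamples the
+# minority classes, then Edited Nearest Neighbours cleans up noisy points. NOTE: resampling before the
+# train/test split means synthetic test samples can share information with training samples.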
+X, y = sme.fit_resample(X, y)
+print('Class counts after SMOTEENN resampling: %s' % Counter(y), '\n')
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
+
+X_train_norm, X_test_norm = prepare_inputs(X_train, X_test)
+
+#--------------------------------------------------------------#
+            #Comparative Analysis 
+#--------------------------------------------------------------#
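+# Twelve pipeline variants are compared below: {chi2, mutual information} filter x {forward, backward} wrapper
+# x {LDA, Naive Bayes, Random Forest}; only variant #2 is currently active, the others are left commented out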
+
+print('\nLDA Classification\n')
+
+#1
+#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#2
+print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
+X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+Accuracy_list = [0, 0, 0, 0, 0]
+get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#3
+#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#4
+#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#print('\nNaive Bayes Classification\n')
+
+#5
+#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#6
+#print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#7
+#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#8
+#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#print('\nRandom Forest Classification\n')
+
+#9
+#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#10
+#print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#11
+#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
+
+#12
+#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
+#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
+#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
+
+#Accuracy_list = [0, 0, 0, 0, 0]
+#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)