import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from statistics import mean
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
# Base estimator used inside the sequential feature selection wrappers below
MODEL = OneVsRestClassifier(svm.SVC(kernel='linear'))
MODEL_LDA = LinearDiscriminantAnalysis()
MODEL_NB = GaussianNB()
MODEL_RF = RandomForestClassifier()
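# Candidate classifiers for the comparative analysis below; each is wrapped in a
# OneVsRestClassifier inside get_performance_measures so that per-class
# (each Gleason score vs the rest) accuracies can be reported.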
def load_genomic_data(filename):
# Load the genomic data
genomic_df = pd.read_csv(filename)
# Setting index to first column, else it will add its own indexing while doing transpose
genomic_df.set_index('ID', inplace = True)
# Transpose so that genes/features become columns and each row represents one patient
genomic_df = genomic_df.T
# Remove features that are zero for all patients
return genomic_df.loc[:, (genomic_df != 0).any(axis = 0)]
def read_data(gfilename, cfilename):
# Feature set: load the genomic data
X = load_genomic_data(gfilename)
# load the clinical data
clinical_df = pd.read_csv(cfilename)
print("Shape of genomic data: ", X.shape, " and Shape of clinical data: ", clinical_df.shape, "thus looks like we donot have genetic data for 5 patients, hence removing them")
clinical_df = clinical_df.drop(labels=[213,227,297,371,469], axis=0)
print("After droping 5 patients whose data were missing:\nShape of genomic data: ", X.shape, " and Shape of clinical data: ", clinical_df.shape, "\n")
print("-- Checking if all patient ID's in genetic data set and clinical dataset matches\n")
if(X.index.all() == clinical_df['PATIENT_ID'].all()):
print("-- Yes, patient ID's in genetic data set and clinical dataset matches\n")
else:
print("Nope, patient ID's in genetic data set and clinical dataset do not match")
y = clinical_df['GLEASON_SCORE']
return X, y
def visualize_data(X, y, title):
# Visualize the dataset (e.g. to spot outliers); PCA is applied prior to LDA to prevent overfitting (https://stats.stackexchange.com/q/109810)
pca = PCA(n_components=10)
pca_reduced_data = pca.fit_transform(X)
lda = LinearDiscriminantAnalysis(n_components = 2)
pca_lda_reduced_data = lda.fit_transform(pca_reduced_data, y)
# NOTE: Gleason score ranges from 6-10
label = [6, 7, 8, 9, 10]
colors = ['red','green','blue','purple','pink']
fig = plt.figure(figsize=(6,6))
plt.scatter(pca_lda_reduced_data[:,0], pca_lda_reduced_data[:,1], c=y, cmap=matplotlib.colors.ListedColormap(colors), alpha=0.7)
plt.title(title)
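# Example usage (hypothetical call; the function is not invoked in the pipeline below):
# visualize_data(X, y, 'PCA + LDA projection of patients coloured by Gleason score')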
def prepare_inputs(X_train, X_test):
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
return X_train_norm, X_test_norm
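# The scaler is fitted on the training split only and the test split is transformed with the
# training statistics, which avoids data leakage; MinMaxScaler also keeps all values in [0, 1],
# i.e. non-negative, which the chi2 score function (one of the filter options below) requires.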
def remove_low_variance_feature(X):
sel = VarianceThreshold()
return sel.fit_transform(X)
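# With the default threshold (0.0), VarianceThreshold only drops features that are constant
# across all samples; the function is defined here but not called in the pipeline below.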
def filter_feature_selection(X_train_norm, y_train, X_test_norm, score_function):
best_k = SelectKBest(score_func=score_function, k=300)
fit = best_k.fit(X_train_norm, y_train)
X_train_fs = fit.transform(X_train_norm)
X_test_fs = fit.transform(X_test_norm)
# DO NOT REMOVE
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X_train.columns)
# featureScores = pd.concat([dfcolumns, dfscores], axis=1)
# featureScores.columns = ['Features','Score']
# best = featureScores.nlargest(40,'Score')
# for f in best['Features']:
# featureVoting[f] = featureVoting.get(f, 0) + 1
#best.plot(x='Features', y="Score", kind="bar")
return X_train_fs, X_test_fs
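# Note: SelectKBest acts as a cheap univariate filter that cuts the gene set down to 300 features,
# keeping the far more expensive wrapper-based (sequential) selection that follows tractable.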
def forward_feature_selection(X_train_fs, y_train, X_test_fs):
sfs = SFS(MODEL,
k_features=20,
forward=True,
floating = False,
verbose=2,
scoring = 'accuracy',
cv = 5)
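# Wrapper-based search: at every step each remaining candidate feature is evaluated by refitting
# MODEL with 5-fold cross-validation, so the search is computationally heavy;
# verbose=2 prints the progress of each step.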
fit = sfs.fit(X_train_fs, y_train)
print(fit)
X_train_wfs = fit.transform(X_train_fs)
X_test_wfs = fit.transform(X_test_fs)
best = sfs.k_feature_names_ # to get the final set of features
#print(best)
return X_train_wfs, X_test_wfs
def backward_feature_selection(X_train_fs, y_train, X_test_fs):
sbs = SFS(MODEL,
k_features=20,
forward=False,
floating = False,
verbose=2,
scoring = 'accuracy',
cv = 5)
fit = sbs.fit(X_train_fs, y_train)
#print(fit)
X_train_wfs = fit.transform(X_train_fs)
X_test_wfs = fit.transform(X_test_fs)
best = sbs.k_feature_names_ # to get the final set of features
#print(best)
return X_train_wfs, X_test_wfs
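# Note: backward elimination starts from all input features (here the 300 filtered ones) and
# removes one per step until 20 remain, so it needs far more model fits than the forward search above.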
def get_performance_measures(model, X_train, X_test, y_train, y_test, Accuracy_list):
model = OneVsRestClassifier(model).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Accuracy_list.append(accuracy_score(y_test, y_pred))
yT = model.label_binarizer_.transform(y_test).toarray().T
# Iterate through all L classifiers
print("------------------------------")
for i, (classifier, is_ith_class) in enumerate(zip(model.estimators_, yT)):
print("Accuracy of", 6+i, "vs Rest: ", round(classifier.score(X_test, is_ith_class), 4))
Accuracy_list[i] += classifier.score(X_test, is_ith_class)
print('\n\n')
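# Each binary estimator inside the OneVsRestClassifier is scored against the binarised labels of
# its own class, so the printed values are per-class "this Gleason score vs the rest" accuracies;
# Accuracy_list accumulates these scores across calls so they can be compared or averaged later.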
# Currently not in use
def dimensionality_reduction(X,y):
pca = KernelPCA(kernel='rbf')
#pca_reduced_data = pca.fit_transform(X,y)
X_transformed = pca.fit_transform(X)
return X_transformed
#lda = LinearDiscriminantAnalysis()
#pca_lda_data = lda.fit_transform(pca_reduced_data,y)
X, y = read_data('prad_tcga_genes.csv', 'prad_tcga_clinical_data.csv')
# Resample the dataset with SMOTEENN (SMOTE oversampling followed by Edited Nearest Neighbours cleaning), since the Gleason score classes are imbalanced; k_neighbors=1 lets SMOTE handle classes with very few samples
sme = SMOTEENN(random_state=42,smote=SMOTE(random_state=42, k_neighbors=1))
X, y = sme.fit_resample(X, y)
print('Class distribution after SMOTEENN resampling: %s' % Counter(y), '\n')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state=42)
X_train_norm, X_test_norm = prepare_inputs(X_train, X_test)
#--------------------------------------------------------------#
#Comparative Analysis
#--------------------------------------------------------------#
print('\nLDA Classification\n')
#1
#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#2
print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
Accuracy_list = [0, 0, 0, 0, 0]
get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#3
#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#4
#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_LDA, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#print('\nNaive Bayes Classification\n')
#5
#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#6
#print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#7
#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#8
#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_NB, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#print('\nRandom Forest Classification\n')
#9
#print('((Chi2: 300 features))-->((Forward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#10
#print('((Mutual info: 300 features))-->((Forward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
#X_train_wfs, X_test_wfs = forward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#11
#print('((Chi2: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, chi2)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)
#12
#print('((Mutual info: 300 features))-->((Backward_Selection: 20 features))')
#X_train_fs, X_test_fs = filter_feature_selection(X_train_norm, y_train, X_test_norm, mutual_info_classif)
#X_train_wfs, X_test_wfs = backward_feature_selection(X_train_fs, y_train, X_test_fs)
#Accuracy_list = [0, 0, 0, 0, 0]
#get_performance_measures(MODEL_RF, X_train_wfs, X_test_wfs, y_train, y_test, Accuracy_list)