# -*- coding: utf-8 -*-
"""
Created on Fri May 28 13:53:46 2021
@author: Peter
"""
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 21:12:19 2020
@author: Peter
"""
# PART1: nested-cv elasticnet
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegressionCV
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import plot_roc_curve
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
import dill
import statistics
from sklearn.metrics import roc_curve
import scipy
szempont="roc_auc"
random_state=1
cv_outer = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=random_state)
cv=5
test_size=0.2
l1_ratios=[0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
features = features[features['label']!="G2"]
features=features.replace("G1",0)
features=features.replace("G3",1)
print(features["label"].value_counts())
labels = np.array(features['label'])
features= features.drop('label', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(features, labels,
test_size = test_size, random_state = random_state)
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])
best_c=[]
best_l1=[]
max_auc=[]
valid_auc=[]
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
i=1
genes=[]
for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
train_target, val_target = y_train[train_idx], y_train[val_idx]
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000,
l1_ratios=l1_ratios)
best_model.fit(train_data, train_target)
model = SelectFromModel(best_model, prefit=True, threshold=None)
mask=model.get_support()
train_data_genes = train_data.loc[:, mask]
[genes.append(x) for x in train_data_genes.columns.values]
coefs_array=+ best_model.coef_
y_pred_prob = best_model.predict_proba(val_data)[:,1]
valid_auc.append(roc_auc_score(val_target, y_pred_prob))
best_c.append(best_model.C_)
best_l1.append(best_model.l1_ratio_)
max_auc.append(best_model.scores_[1].mean(axis=0).max())
viz = plot_roc_curve(best_model, val_data, val_target,
name='ROC fold {}'.format(i), alpha=0.5, lw=1, ax=ax)
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0
tprs.append(interp_tpr)
aucs.append(viz.roc_auc)
i=i+1
dill.dump_session('globalsave_part1.pkl')
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = metrics.auc(mean_fpr, mean_tpr)
print("Mean AUC: ", mean_auc)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc.pdf")
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc="lower right")
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc_mean_only.pdf")
plt.close('all')
res=pd.DataFrame({"c":best_c, "l1":best_l1, "max auc":max_auc, "valid auc":valid_auc})
print(res)
print(statistics.mean(max_auc))
print(statistics.mean(valid_auc))
#%%
# PART2: retrain model on whole dataset, predict G2
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
import dill
dill.load_session('globalsave_part1.pkl')
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
features = features[features['label']!="G2"]
features=features.replace("G1",0)
features=features.replace("G3",1)
print(features["label"].value_counts())
labels = np.array(features['label'])
features= features.drop('label', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(features, labels,
test_size = test_size, random_state = random_state)
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000,
l1_ratios=l1_ratios)
best_model.fit(X_train,y_train)
print("Best C: ", best_model.C_)
print("Best l1_ratio: ", best_model.l1_ratio_)
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())
dill.dump_session('globalsave_part2.pkl')
y_pred = best_model.predict(X_train)
y_pred_proba=best_model.predict_proba(X_train)[:,1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train: ", accuracy_score(y_train, y_pred))
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
print("Threshold value is:", optimal_threshold)
for i in range(len(y_pred)):
if y_pred_proba[i]<optimal_threshold:
y_pred[i]=0
else: y_pred[i]=1
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))
y_pred = best_model.predict(X_test)
y_pred_proba=best_model.predict_proba(X_test)[:,1]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test: ", accuracy_score(y_test, y_pred))
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
for i in range(len(y_pred)):
if y_pred_proba[i]<optimal_threshold:
y_pred[i]=0
else: y_pred[i]=1
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))
y_pred_proba=best_model.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc = 'lower right')
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc.pdf")
plt.close('all')
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
print(features["label"].value_counts())
y_test = np.array(features['label'])
X_test= features.drop('label', axis = 1)
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
y_pred_proba.columns=["pred_proba"]
y_pred_proba["samples"]=X_test.index.values
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds.txt", sep='\t',
index=False)
#%%
# PART3: important overlapping genes during CV-s
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
import dill
dill.load_session('globalsave_part1.pkl')
big_model_AUCs=valid_auc
coefs_array=coefs_array/100
total_coefs=np.absolute(coefs_array).sum()
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)
toplot=pd.DataFrame(data=coefs_array, columns=X_train.columns.values).transpose()
toplot.columns=["coefs"]
toplot["abs"]=np.absolute(coefs_array).transpose()
toplot=toplot.sort_values("abs", axis=0, ascending=False).head(12)
toplot.to_csv("to_plot_barchart.txt", index=True, sep="\t")
ori_X_train=pd.DataFrame(X_train)
ori_X_test=pd.DataFrame(X_test)
loop_auc=[]
loop_auc_test=[]
loop_pvals=[]
loop_cum_coef=[]
for i in range(1,200):
valid_auc=[]
valid_auc_test=[]
gene_mask=coefs_pd.index.values[:i]
X_train=ori_X_train[gene_mask]
X_test=ori_X_test[gene_mask]
for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
train_target, val_target = y_train[train_idx], y_train[val_idx]
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000,
l1_ratios=l1_ratios)
best_model.fit(train_data, train_target)
y_pred_prob = best_model.predict_proba(val_data)[:,1]
auc = roc_auc_score(val_target, y_pred_prob)
valid_auc.append(auc)
mean_auc=statistics.mean(valid_auc)
best_model.fit(X_train, y_train)
y_pred_prob = best_model.predict_proba(X_test)[:,1]
auc_test = roc_auc_score(y_test, y_pred_prob)
valid_auc_test.append(auc_test)
mean_auc_test=statistics.mean(valid_auc_test)
loop_auc.append(mean_auc)
loop_auc_test.append(mean_auc_test)
loop_pvals.append(scipy.stats.wilcoxon(big_model_AUCs, valid_auc, alternative="two-sided").pvalue)
loop_cum_coef.append(coefs_pd[0][0:i].sum())
dill.dump_session('globalsave_part3.pkl')
data=pd.DataFrame(loop_auc, columns=["auc"])
data["auc_test"]=loop_auc_test
data["pval"]=loop_pvals
data["cum_coef"]=loop_cum_coef
data.to_csv("results.txt", index=False, sep="\t")
fig, ax = plt.subplots()
ax.plot(range(1,200), loop_auc, "brown", label="Mean AUCs")
ax.plot(range(1,200), loop_auc_test, "green", label="Test AUCs")
plt.ylabel('AUC scores', fontsize=14)
plt.xlabel('Number of genes', fontsize=14)
plt.legend()
plt.title("Results of iterative analysis")
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/loop_AUCs_min_num_genes.pdf")
plt.close('all')
#%%
# PART4: results with min num genes
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
import dill
dill.load_session('globalsave_part1.pkl')
big_model_AUCs=valid_auc
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)
gene_mask=coefs_pd.index.values[:12]
print(gene_mask)
X_train=X_train[gene_mask]
X_test=X_test[gene_mask]
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000,
l1_ratios=l1_ratios)
best_model.fit(X_train, y_train)
print("Best C: ", best_model.C_)
print("Best l1_ratio: ", best_model.l1_ratio_)
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())
y_pred = best_model.predict(X_train)
y_pred_proba=best_model.predict_proba(X_train)[:,1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train: ", accuracy_score(y_train, y_pred))
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
print("Threshold value is:", optimal_threshold)
for i in range(len(y_pred)):
if y_pred_proba[i]<optimal_threshold:
y_pred[i]=0
else: y_pred[i]=1
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))
train_pred_proba=pd.DataFrame(y_pred_proba)
train_pred_proba["samples"]=X_train.index.values
y_pred = best_model.predict(X_test)
y_pred_proba=best_model.predict_proba(X_test)[:,1]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test: ", accuracy_score(y_test, y_pred))
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
for i in range(len(y_pred)):
if y_pred_proba[i]<optimal_threshold:
y_pred[i]=0
else: y_pred[i]=1
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))
y_pred_proba=best_model.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc = 'lower right')
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc_with_min_num_genes.pdf")
plt.close('all')
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
print(features["label"].value_counts())
y_test = np.array(features['label'])
X_test= features.drop('label', axis = 1)
X_test=X_test[gene_mask]
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
y_pred_proba.columns=["pred_proba"]
y_pred_proba["samples"]=X_test.index.values
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds_with_mingenes.txt", sep='\t',
index=False)