ucec_ml_grade2021 / Git / [5f3b2d] /part2_Python

Models:
DanielG/
ucec_ml_grade2021
Downloads: 1
[5f3b2d]: / part2_Python_ML.py
History
Download this file
400 lines (324 with data), 15.6 kB

# -*- coding: utf-8 -*-
"""
Created on Fri May 28 13:53:46 2021

@author: Peter
"""
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 21:12:19 2020

@author: Peter
"""
#                     PART1: nested-cv elasticnet
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegressionCV
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import plot_roc_curve
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
import dill
import statistics
from sklearn.metrics import roc_curve
import scipy

szempont="roc_auc"
random_state=1
cv_outer = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=random_state)
cv=5
test_size=0.2
l1_ratios=[0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]

os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")

features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
features = features[features['label']!="G2"]
features=features.replace("G1",0)
features=features.replace("G3",1)
print(features["label"].value_counts())

labels = np.array(features['label'])

features= features.drop('label', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    test_size = test_size, random_state = random_state)
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])

best_c=[]
best_l1=[]
max_auc=[]
valid_auc=[]
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
i=1
genes=[]

for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
    train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
    train_target, val_target = y_train[train_idx], y_train[val_idx]
    
    best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", 
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
                          l1_ratios=l1_ratios)
    best_model.fit(train_data, train_target)
    
    model = SelectFromModel(best_model, prefit=True, threshold=None)
    mask=model.get_support()
    train_data_genes = train_data.loc[:, mask]
    [genes.append(x) for x in train_data_genes.columns.values]
    coefs_array=+ best_model.coef_
    
    y_pred_prob = best_model.predict_proba(val_data)[:,1]
    valid_auc.append(roc_auc_score(val_target, y_pred_prob))
    best_c.append(best_model.C_)
    best_l1.append(best_model.l1_ratio_)
    max_auc.append(best_model.scores_[1].mean(axis=0).max())
    
    viz = plot_roc_curve(best_model, val_data, val_target,
                         name='ROC fold {}'.format(i), alpha=0.5, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
    i=i+1


dill.dump_session('globalsave_part1.pkl')

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = metrics.auc(mean_fpr, mean_tpr)
print("Mean AUC: ", mean_auc)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc.pdf")

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc="lower right")
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc_mean_only.pdf")
plt.close('all')


res=pd.DataFrame({"c":best_c, "l1":best_l1, "max auc":max_auc, "valid auc":valid_auc})
print(res)
print(statistics.mean(max_auc))
print(statistics.mean(valid_auc))

#%%
#               PART2: retrain model on whole dataset, predict G2
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")

import dill                          
dill.load_session('globalsave_part1.pkl')


features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
features = features[features['label']!="G2"]
features=features.replace("G1",0)
features=features.replace("G3",1)
print(features["label"].value_counts())

labels = np.array(features['label'])
features= features.drop('label', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size = test_size, random_state = random_state)
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])

best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", 
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
                          l1_ratios=l1_ratios)
best_model.fit(X_train,y_train)
print("Best C: ", best_model.C_)
print("Best l1_ratio: ", best_model.l1_ratio_)
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())

dill.dump_session('globalsave_part2.pkl')

y_pred = best_model.predict(X_train)
y_pred_proba=best_model.predict_proba(X_train)[:,1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train: ", accuracy_score(y_train, y_pred))
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
print("Threshold value is:", optimal_threshold)
for i in range(len(y_pred)):
    if y_pred_proba[i]<optimal_threshold:
        y_pred[i]=0
    else: y_pred[i]=1

print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))


y_pred = best_model.predict(X_test)
y_pred_proba=best_model.predict_proba(X_test)[:,1]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test: ", accuracy_score(y_test, y_pred))
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
for i in range(len(y_pred)):
    if y_pred_proba[i]<optimal_threshold:
        y_pred[i]=0
    else: y_pred[i]=1

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))


y_pred_proba=best_model.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc = 'lower right')
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc.pdf")
plt.close('all')



features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
print(features["label"].value_counts())
y_test = np.array(features['label'])
X_test= features.drop('label', axis = 1)

y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
y_pred_proba.columns=["pred_proba"]
y_pred_proba["samples"]=X_test.index.values
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds.txt", sep='\t',
                    index=False)

#%%
#                 PART3: important overlapping genes during CV-s
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")

import dill                          
dill.load_session('globalsave_part1.pkl')

big_model_AUCs=valid_auc
coefs_array=coefs_array/100
total_coefs=np.absolute(coefs_array).sum()
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)

toplot=pd.DataFrame(data=coefs_array, columns=X_train.columns.values).transpose()
toplot.columns=["coefs"]
toplot["abs"]=np.absolute(coefs_array).transpose()
toplot=toplot.sort_values("abs", axis=0, ascending=False).head(12)
toplot.to_csv("to_plot_barchart.txt", index=True, sep="\t")

ori_X_train=pd.DataFrame(X_train)
ori_X_test=pd.DataFrame(X_test)
loop_auc=[]
loop_auc_test=[]
loop_pvals=[]
loop_cum_coef=[]

for i in range(1,200):
    valid_auc=[]
    valid_auc_test=[]
    gene_mask=coefs_pd.index.values[:i]
    X_train=ori_X_train[gene_mask]
    X_test=ori_X_test[gene_mask]
                                
    for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
        train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
        train_target, val_target = y_train[train_idx], y_train[val_idx]
        
        best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
                          l1_ratios=l1_ratios)
        best_model.fit(train_data, train_target)
        
        y_pred_prob = best_model.predict_proba(val_data)[:,1]
        auc = roc_auc_score(val_target, y_pred_prob)
        valid_auc.append(auc)
        mean_auc=statistics.mean(valid_auc)
        
        best_model.fit(X_train, y_train)
        y_pred_prob = best_model.predict_proba(X_test)[:,1]
        auc_test = roc_auc_score(y_test, y_pred_prob)
        valid_auc_test.append(auc_test)
        mean_auc_test=statistics.mean(valid_auc_test)
        
    loop_auc.append(mean_auc)
    loop_auc_test.append(mean_auc_test)
    loop_pvals.append(scipy.stats.wilcoxon(big_model_AUCs, valid_auc, alternative="two-sided").pvalue)
    loop_cum_coef.append(coefs_pd[0][0:i].sum())


dill.dump_session('globalsave_part3.pkl')

data=pd.DataFrame(loop_auc, columns=["auc"])
data["auc_test"]=loop_auc_test
data["pval"]=loop_pvals
data["cum_coef"]=loop_cum_coef
data.to_csv("results.txt", index=False, sep="\t")

fig, ax = plt.subplots()
ax.plot(range(1,200), loop_auc, "brown", label="Mean AUCs")
ax.plot(range(1,200), loop_auc_test, "green", label="Test AUCs")
plt.ylabel('AUC scores', fontsize=14)
plt.xlabel('Number of genes', fontsize=14)
plt.legend()
plt.title("Results of iterative analysis")
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/loop_AUCs_min_num_genes.pdf")
plt.close('all')



#%%
#                   PART4: results with min num genes
import os
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")

import dill                          
dill.load_session('globalsave_part1.pkl')

big_model_AUCs=valid_auc
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)

gene_mask=coefs_pd.index.values[:12]
print(gene_mask)
X_train=X_train[gene_mask]
X_test=X_test[gene_mask]

best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
                                  fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
                                  l1_ratios=l1_ratios)
best_model.fit(X_train, y_train)
print("Best C: ", best_model.C_)
print("Best l1_ratio: ", best_model.l1_ratio_)
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())


y_pred = best_model.predict(X_train)
y_pred_proba=best_model.predict_proba(X_train)[:,1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train: ", accuracy_score(y_train, y_pred))
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
print("Threshold value is:", optimal_threshold)
for i in range(len(y_pred)):
    if y_pred_proba[i]<optimal_threshold:
        y_pred[i]=0
    else: y_pred[i]=1

print(confusion_matrix(y_train,y_pred))
print(classification_report(y_train,y_pred))
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))


train_pred_proba=pd.DataFrame(y_pred_proba)
train_pred_proba["samples"]=X_train.index.values


y_pred = best_model.predict(X_test)
y_pred_proba=best_model.predict_proba(X_test)[:,1]
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test: ", accuracy_score(y_test, y_pred))
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
for i in range(len(y_pred)):
    if y_pred_proba[i]<optimal_threshold:
        y_pred[i]=0
    else: y_pred[i]=1

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))


y_pred_proba=best_model.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
ax.legend(loc = 'lower right')
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc_with_min_num_genes.pdf")
plt.close('all')


features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
print(features["label"].value_counts())
y_test = np.array(features['label'])
X_test= features.drop('label', axis = 1)

X_test=X_test[gene_mask]

y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
y_pred_proba.columns=["pred_proba"]
y_pred_proba["samples"]=X_test.index.values
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds_with_mingenes.txt", sep='\t',
                    index=False)