a b/part2_Python_ML.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Fri May 28 13:53:46 2021
4
5
@author: Peter
6
"""
7
# -*- coding: utf-8 -*-
8
"""
9
Created on Thu Oct 29 21:12:19 2020
10
11
@author: Peter
12
"""
13
#                     PART1: nested-cv elasticnet
14
import os
15
import pandas as pd
16
import numpy as np
17
from sklearn.model_selection import train_test_split
18
from sklearn.model_selection import StratifiedShuffleSplit
19
from sklearn.linear_model import LogisticRegressionCV
20
from tqdm import tqdm
21
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
22
import matplotlib.pyplot as plt
23
from sklearn.metrics import plot_roc_curve
24
from sklearn import metrics
25
from sklearn.feature_selection import SelectFromModel
26
import dill
27
import statistics
28
from sklearn.metrics import roc_curve
29
import scipy
30
31
szempont="roc_auc"
32
random_state=1
33
cv_outer = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=random_state)
34
cv=5
35
test_size=0.2
36
l1_ratios=[0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
37
38
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
39
40
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
41
features = features[features['label']!="G2"]
42
features=features.replace("G1",0)
43
features=features.replace("G3",1)
44
print(features["label"].value_counts())
45
46
labels = np.array(features['label'])
47
48
features= features.drop('label', axis = 1)
49
X_train, X_test, y_train, y_test = train_test_split(features, labels, 
50
                                                    test_size = test_size, random_state = random_state)
51
X_train=pd.DataFrame(X_train)
52
X_test=pd.DataFrame(X_test)
53
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])
54
55
best_c=[]
56
best_l1=[]
57
max_auc=[]
58
valid_auc=[]
59
tprs = []
60
aucs = []
61
mean_fpr = np.linspace(0, 1, 100)
62
fig, ax = plt.subplots()
63
i=1
64
genes=[]
65
66
for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
67
    train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
68
    train_target, val_target = y_train[train_idx], y_train[val_idx]
69
    
70
    best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", 
71
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
72
                          l1_ratios=l1_ratios)
73
    best_model.fit(train_data, train_target)
74
    
75
    model = SelectFromModel(best_model, prefit=True, threshold=None)
76
    mask=model.get_support()
77
    train_data_genes = train_data.loc[:, mask]
78
    [genes.append(x) for x in train_data_genes.columns.values]
79
    coefs_array=+ best_model.coef_
80
    
81
    y_pred_prob = best_model.predict_proba(val_data)[:,1]
82
    valid_auc.append(roc_auc_score(val_target, y_pred_prob))
83
    best_c.append(best_model.C_)
84
    best_l1.append(best_model.l1_ratio_)
85
    max_auc.append(best_model.scores_[1].mean(axis=0).max())
86
    
87
    viz = plot_roc_curve(best_model, val_data, val_target,
88
                         name='ROC fold {}'.format(i), alpha=0.5, lw=1, ax=ax)
89
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
90
    interp_tpr[0] = 0.0
91
    tprs.append(interp_tpr)
92
    aucs.append(viz.roc_auc)
93
    i=i+1
94
95
96
dill.dump_session('globalsave_part1.pkl')
97
98
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
99
mean_tpr = np.mean(tprs, axis=0)
100
mean_tpr[-1] = 1.0
101
mean_auc = metrics.auc(mean_fpr, mean_tpr)
102
print("Mean AUC: ", mean_auc)
103
std_auc = np.std(aucs)
104
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
105
std_tpr = np.std(tprs, axis=0)
106
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
107
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
108
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
109
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
110
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc.pdf")
111
112
fig, ax = plt.subplots()
113
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
114
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
115
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
116
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
117
ax.legend(loc="lower right")
118
plt.xlabel("False Positive Rate", fontsize=14)
119
plt.ylabel("True Positive Rate", fontsize=14)
120
ax.tick_params(axis='both', which='major', labelsize=14)
121
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc_mean_only.pdf")
122
plt.close('all')
123
124
125
res=pd.DataFrame({"c":best_c, "l1":best_l1, "max auc":max_auc, "valid auc":valid_auc})
126
print(res)
127
print(statistics.mean(max_auc))
128
print(statistics.mean(valid_auc))
129
130
#%%
131
#               PART2: retrain model on whole dataset, predict G2
132
import os
133
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
134
135
import dill                          
136
dill.load_session('globalsave_part1.pkl')
137
138
139
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0)
140
features = features[features['label']!="G2"]
141
features=features.replace("G1",0)
142
features=features.replace("G3",1)
143
print(features["label"].value_counts())
144
145
labels = np.array(features['label'])
146
features= features.drop('label', axis = 1)
147
148
X_train, X_test, y_train, y_test = train_test_split(features, labels,
149
                                                    test_size = test_size, random_state = random_state)
150
X_train=pd.DataFrame(X_train)
151
X_test=pd.DataFrame(X_test)
152
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape])
153
154
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", 
155
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
156
                          l1_ratios=l1_ratios)
157
best_model.fit(X_train,y_train)
158
print("Best C: ", best_model.C_)
159
print("Best l1_ratio: ", best_model.l1_ratio_)
160
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())
161
162
dill.dump_session('globalsave_part2.pkl')
163
164
y_pred = best_model.predict(X_train)
165
y_pred_proba=best_model.predict_proba(X_train)[:,1]
166
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
167
optimal_idx = np.argmax(tpr - fpr)
168
optimal_threshold = thresholds[optimal_idx]
169
print(confusion_matrix(y_train,y_pred))
170
print(classification_report(y_train,y_pred))
171
print("Accuracy train: ", accuracy_score(y_train, y_pred))
172
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
173
print("Threshold value is:", optimal_threshold)
174
for i in range(len(y_pred)):
175
    if y_pred_proba[i]<optimal_threshold:
176
        y_pred[i]=0
177
    else: y_pred[i]=1
178
179
print(confusion_matrix(y_train,y_pred))
180
print(classification_report(y_train,y_pred))
181
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))
182
183
184
y_pred = best_model.predict(X_test)
185
y_pred_proba=best_model.predict_proba(X_test)[:,1]
186
print(confusion_matrix(y_test,y_pred))
187
print(classification_report(y_test,y_pred))
188
print("Accuracy test: ", accuracy_score(y_test, y_pred))
189
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
190
for i in range(len(y_pred)):
191
    if y_pred_proba[i]<optimal_threshold:
192
        y_pred[i]=0
193
    else: y_pred[i]=1
194
195
print(confusion_matrix(y_test,y_pred))
196
print(classification_report(y_test,y_pred))
197
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))
198
199
200
y_pred_proba=best_model.predict_proba(X_test)[:,1]
201
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
202
roc_auc = metrics.auc(fpr, tpr)
203
204
fig, ax = plt.subplots()
205
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
206
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
207
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
208
ax.legend(loc = 'lower right')
209
plt.xlabel("False Positive Rate", fontsize=14)
210
plt.ylabel("True Positive Rate", fontsize=14)
211
ax.tick_params(axis='both', which='major', labelsize=14)
212
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc.pdf")
213
plt.close('all')
214
215
216
217
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
218
print(features["label"].value_counts())
219
y_test = np.array(features['label'])
220
X_test= features.drop('label', axis = 1)
221
222
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
223
y_pred_proba.columns=["pred_proba"]
224
y_pred_proba["samples"]=X_test.index.values
225
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds.txt", sep='\t',
226
                    index=False)
227
228
#%%
229
#                 PART3: important overlapping genes during CV-s
230
import os
231
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
232
233
import dill                          
234
dill.load_session('globalsave_part1.pkl')
235
236
big_model_AUCs=valid_auc
237
coefs_array=coefs_array/100
238
total_coefs=np.absolute(coefs_array).sum()
239
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)
240
241
toplot=pd.DataFrame(data=coefs_array, columns=X_train.columns.values).transpose()
242
toplot.columns=["coefs"]
243
toplot["abs"]=np.absolute(coefs_array).transpose()
244
toplot=toplot.sort_values("abs", axis=0, ascending=False).head(12)
245
toplot.to_csv("to_plot_barchart.txt", index=True, sep="\t")
246
247
ori_X_train=pd.DataFrame(X_train)
248
ori_X_test=pd.DataFrame(X_test)
249
loop_auc=[]
250
loop_auc_test=[]
251
loop_pvals=[]
252
loop_cum_coef=[]
253
254
for i in range(1,200):
255
    valid_auc=[]
256
    valid_auc_test=[]
257
    gene_mask=coefs_pd.index.values[:i]
258
    X_train=ori_X_train[gene_mask]
259
    X_test=ori_X_test[gene_mask]
260
                                
261
    for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)):
262
        train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx]
263
        train_target, val_target = y_train[train_idx], y_train[val_idx]
264
        
265
        best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
266
                          fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
267
                          l1_ratios=l1_ratios)
268
        best_model.fit(train_data, train_target)
269
        
270
        y_pred_prob = best_model.predict_proba(val_data)[:,1]
271
        auc = roc_auc_score(val_target, y_pred_prob)
272
        valid_auc.append(auc)
273
        mean_auc=statistics.mean(valid_auc)
274
        
275
        best_model.fit(X_train, y_train)
276
        y_pred_prob = best_model.predict_proba(X_test)[:,1]
277
        auc_test = roc_auc_score(y_test, y_pred_prob)
278
        valid_auc_test.append(auc_test)
279
        mean_auc_test=statistics.mean(valid_auc_test)
280
        
281
    loop_auc.append(mean_auc)
282
    loop_auc_test.append(mean_auc_test)
283
    loop_pvals.append(scipy.stats.wilcoxon(big_model_AUCs, valid_auc, alternative="two-sided").pvalue)
284
    loop_cum_coef.append(coefs_pd[0][0:i].sum())
285
286
287
dill.dump_session('globalsave_part3.pkl')
288
289
data=pd.DataFrame(loop_auc, columns=["auc"])
290
data["auc_test"]=loop_auc_test
291
data["pval"]=loop_pvals
292
data["cum_coef"]=loop_cum_coef
293
data.to_csv("results.txt", index=False, sep="\t")
294
295
fig, ax = plt.subplots()
296
ax.plot(range(1,200), loop_auc, "brown", label="Mean AUCs")
297
ax.plot(range(1,200), loop_auc_test, "green", label="Test AUCs")
298
plt.ylabel('AUC scores', fontsize=14)
299
plt.xlabel('Number of genes', fontsize=14)
300
plt.legend()
301
plt.title("Results of iterative analysis")
302
ax.tick_params(axis='both', which='major', labelsize=14)
303
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/loop_AUCs_min_num_genes.pdf")
304
plt.close('all')
305
306
307
308
#%%
309
#                   PART4: results with min num genes
310
import os
311
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus")
312
313
import dill                          
314
dill.load_session('globalsave_part1.pkl')
315
316
big_model_AUCs=valid_auc
317
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False)
318
319
gene_mask=coefs_pd.index.values[:12]
320
print(gene_mask)
321
X_train=X_train[gene_mask]
322
X_test=X_test[gene_mask]
323
324
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet",
325
                                  fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, 
326
                                  l1_ratios=l1_ratios)
327
best_model.fit(X_train, y_train)
328
print("Best C: ", best_model.C_)
329
print("Best l1_ratio: ", best_model.l1_ratio_)
330
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max())
331
332
333
y_pred = best_model.predict(X_train)
334
y_pred_proba=best_model.predict_proba(X_train)[:,1]
335
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True)
336
optimal_idx = np.argmax(tpr - fpr)
337
optimal_threshold = thresholds[optimal_idx]
338
print(confusion_matrix(y_train,y_pred))
339
print(classification_report(y_train,y_pred))
340
print("Accuracy train: ", accuracy_score(y_train, y_pred))
341
print("AUC train: ", roc_auc_score(y_train, y_pred_proba))
342
print("Threshold value is:", optimal_threshold)
343
for i in range(len(y_pred)):
344
    if y_pred_proba[i]<optimal_threshold:
345
        y_pred[i]=0
346
    else: y_pred[i]=1
347
348
print(confusion_matrix(y_train,y_pred))
349
print(classification_report(y_train,y_pred))
350
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred))
351
352
353
train_pred_proba=pd.DataFrame(y_pred_proba)
354
train_pred_proba["samples"]=X_train.index.values
355
356
357
y_pred = best_model.predict(X_test)
358
y_pred_proba=best_model.predict_proba(X_test)[:,1]
359
print(confusion_matrix(y_test,y_pred))
360
print(classification_report(y_test,y_pred))
361
print("Accuracy test: ", accuracy_score(y_test, y_pred))
362
print("AUC test: ", roc_auc_score(y_test, y_pred_proba))
363
for i in range(len(y_pred)):
364
    if y_pred_proba[i]<optimal_threshold:
365
        y_pred[i]=0
366
    else: y_pred[i]=1
367
368
print(confusion_matrix(y_test,y_pred))
369
print(classification_report(y_test,y_pred))
370
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred))
371
372
373
y_pred_proba=best_model.predict_proba(X_test)[:,1]
374
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)
375
roc_auc = metrics.auc(fpr, tpr)
376
fig, ax = plt.subplots()
377
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
378
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
379
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic")
380
ax.legend(loc = 'lower right')
381
plt.xlabel("False Positive Rate", fontsize=14)
382
plt.ylabel("True Positive Rate", fontsize=14)
383
ax.tick_params(axis='both', which='major', labelsize=14)
384
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc_with_min_num_genes.pdf")
385
plt.close('all')
386
387
388
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0)
389
print(features["label"].value_counts())
390
y_test = np.array(features['label'])
391
X_test= features.drop('label', axis = 1)
392
393
X_test=X_test[gene_mask]
394
395
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1])
396
y_pred_proba.columns=["pred_proba"]
397
y_pred_proba["samples"]=X_test.index.values
398
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds_with_mingenes.txt", sep='\t',
399
                    index=False)