|
a |
|
b/part2_Python_ML.py |
|
|
1 |
# -*- coding: utf-8 -*- |
|
|
2 |
""" |
|
|
3 |
Created on Fri May 28 13:53:46 2021 |
|
|
4 |
|
|
|
5 |
@author: Peter |
|
|
6 |
""" |
|
|
7 |
# -*- coding: utf-8 -*- |
|
|
8 |
""" |
|
|
9 |
Created on Thu Oct 29 21:12:19 2020 |
|
|
10 |
|
|
|
11 |
@author: Peter |
|
|
12 |
""" |
|
|
13 |
# PART1: nested-cv elasticnet |
|
|
14 |
import os |
|
|
15 |
import pandas as pd |
|
|
16 |
import numpy as np |
|
|
17 |
from sklearn.model_selection import train_test_split |
|
|
18 |
from sklearn.model_selection import StratifiedShuffleSplit |
|
|
19 |
from sklearn.linear_model import LogisticRegressionCV |
|
|
20 |
from tqdm import tqdm |
|
|
21 |
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score |
|
|
22 |
import matplotlib.pyplot as plt |
|
|
23 |
from sklearn.metrics import plot_roc_curve |
|
|
24 |
from sklearn import metrics |
|
|
25 |
from sklearn.feature_selection import SelectFromModel |
|
|
26 |
import dill |
|
|
27 |
import statistics |
|
|
28 |
from sklearn.metrics import roc_curve |
|
|
29 |
import scipy |
|
|
30 |
|
|
|
31 |
szempont="roc_auc" |
|
|
32 |
random_state=1 |
|
|
33 |
cv_outer = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=random_state) |
|
|
34 |
cv=5 |
|
|
35 |
test_size=0.2 |
|
|
36 |
l1_ratios=[0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9] |
|
|
37 |
|
|
|
38 |
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus") |
|
|
39 |
|
|
|
40 |
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0) |
|
|
41 |
features = features[features['label']!="G2"] |
|
|
42 |
features=features.replace("G1",0) |
|
|
43 |
features=features.replace("G3",1) |
|
|
44 |
print(features["label"].value_counts()) |
|
|
45 |
|
|
|
46 |
labels = np.array(features['label']) |
|
|
47 |
|
|
|
48 |
features= features.drop('label', axis = 1) |
|
|
49 |
X_train, X_test, y_train, y_test = train_test_split(features, labels, |
|
|
50 |
test_size = test_size, random_state = random_state) |
|
|
51 |
X_train=pd.DataFrame(X_train) |
|
|
52 |
X_test=pd.DataFrame(X_test) |
|
|
53 |
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape]) |
|
|
54 |
|
|
|
55 |
best_c=[] |
|
|
56 |
best_l1=[] |
|
|
57 |
max_auc=[] |
|
|
58 |
valid_auc=[] |
|
|
59 |
tprs = [] |
|
|
60 |
aucs = [] |
|
|
61 |
mean_fpr = np.linspace(0, 1, 100) |
|
|
62 |
fig, ax = plt.subplots() |
|
|
63 |
i=1 |
|
|
64 |
genes=[] |
|
|
65 |
|
|
|
66 |
for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)): |
|
|
67 |
train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx] |
|
|
68 |
train_target, val_target = y_train[train_idx], y_train[val_idx] |
|
|
69 |
|
|
|
70 |
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", |
|
|
71 |
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, |
|
|
72 |
l1_ratios=l1_ratios) |
|
|
73 |
best_model.fit(train_data, train_target) |
|
|
74 |
|
|
|
75 |
model = SelectFromModel(best_model, prefit=True, threshold=None) |
|
|
76 |
mask=model.get_support() |
|
|
77 |
train_data_genes = train_data.loc[:, mask] |
|
|
78 |
[genes.append(x) for x in train_data_genes.columns.values] |
|
|
79 |
coefs_array=+ best_model.coef_ |
|
|
80 |
|
|
|
81 |
y_pred_prob = best_model.predict_proba(val_data)[:,1] |
|
|
82 |
valid_auc.append(roc_auc_score(val_target, y_pred_prob)) |
|
|
83 |
best_c.append(best_model.C_) |
|
|
84 |
best_l1.append(best_model.l1_ratio_) |
|
|
85 |
max_auc.append(best_model.scores_[1].mean(axis=0).max()) |
|
|
86 |
|
|
|
87 |
viz = plot_roc_curve(best_model, val_data, val_target, |
|
|
88 |
name='ROC fold {}'.format(i), alpha=0.5, lw=1, ax=ax) |
|
|
89 |
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) |
|
|
90 |
interp_tpr[0] = 0.0 |
|
|
91 |
tprs.append(interp_tpr) |
|
|
92 |
aucs.append(viz.roc_auc) |
|
|
93 |
i=i+1 |
|
|
94 |
|
|
|
95 |
|
|
|
96 |
dill.dump_session('globalsave_part1.pkl') |
|
|
97 |
|
|
|
98 |
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) |
|
|
99 |
mean_tpr = np.mean(tprs, axis=0) |
|
|
100 |
mean_tpr[-1] = 1.0 |
|
|
101 |
mean_auc = metrics.auc(mean_fpr, mean_tpr) |
|
|
102 |
print("Mean AUC: ", mean_auc) |
|
|
103 |
std_auc = np.std(aucs) |
|
|
104 |
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8) |
|
|
105 |
std_tpr = np.std(tprs, axis=0) |
|
|
106 |
tprs_upper = np.minimum(mean_tpr + std_tpr, 1) |
|
|
107 |
tprs_lower = np.maximum(mean_tpr - std_tpr, 0) |
|
|
108 |
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.') |
|
|
109 |
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic") |
|
|
110 |
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc.pdf") |
|
|
111 |
|
|
|
112 |
fig, ax = plt.subplots() |
|
|
113 |
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) |
|
|
114 |
ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8) |
|
|
115 |
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.') |
|
|
116 |
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic") |
|
|
117 |
ax.legend(loc="lower right") |
|
|
118 |
plt.xlabel("False Positive Rate", fontsize=14) |
|
|
119 |
plt.ylabel("True Positive Rate", fontsize=14) |
|
|
120 |
ax.tick_params(axis='both', which='major', labelsize=14) |
|
|
121 |
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/crossval_roc_mean_only.pdf") |
|
|
122 |
plt.close('all') |
|
|
123 |
|
|
|
124 |
|
|
|
125 |
res=pd.DataFrame({"c":best_c, "l1":best_l1, "max auc":max_auc, "valid auc":valid_auc}) |
|
|
126 |
print(res) |
|
|
127 |
print(statistics.mean(max_auc)) |
|
|
128 |
print(statistics.mean(valid_auc)) |
|
|
129 |
|
|
|
130 |
#%% |
|
|
131 |
# PART2: retrain model on whole dataset, predict G2 |
|
|
132 |
import os |
|
|
133 |
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus") |
|
|
134 |
|
|
|
135 |
import dill |
|
|
136 |
dill.load_session('globalsave_part1.pkl') |
|
|
137 |
|
|
|
138 |
|
|
|
139 |
features = pd.read_table('uterus_rnaseq_VST.txt', sep="\t", index_col=0) |
|
|
140 |
features = features[features['label']!="G2"] |
|
|
141 |
features=features.replace("G1",0) |
|
|
142 |
features=features.replace("G3",1) |
|
|
143 |
print(features["label"].value_counts()) |
|
|
144 |
|
|
|
145 |
labels = np.array(features['label']) |
|
|
146 |
features= features.drop('label', axis = 1) |
|
|
147 |
|
|
|
148 |
X_train, X_test, y_train, y_test = train_test_split(features, labels, |
|
|
149 |
test_size = test_size, random_state = random_state) |
|
|
150 |
X_train=pd.DataFrame(X_train) |
|
|
151 |
X_test=pd.DataFrame(X_test) |
|
|
152 |
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape]) |
|
|
153 |
|
|
|
154 |
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", |
|
|
155 |
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, |
|
|
156 |
l1_ratios=l1_ratios) |
|
|
157 |
best_model.fit(X_train,y_train) |
|
|
158 |
print("Best C: ", best_model.C_) |
|
|
159 |
print("Best l1_ratio: ", best_model.l1_ratio_) |
|
|
160 |
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max()) |
|
|
161 |
|
|
|
162 |
dill.dump_session('globalsave_part2.pkl') |
|
|
163 |
|
|
|
164 |
y_pred = best_model.predict(X_train) |
|
|
165 |
y_pred_proba=best_model.predict_proba(X_train)[:,1] |
|
|
166 |
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True) |
|
|
167 |
optimal_idx = np.argmax(tpr - fpr) |
|
|
168 |
optimal_threshold = thresholds[optimal_idx] |
|
|
169 |
print(confusion_matrix(y_train,y_pred)) |
|
|
170 |
print(classification_report(y_train,y_pred)) |
|
|
171 |
print("Accuracy train: ", accuracy_score(y_train, y_pred)) |
|
|
172 |
print("AUC train: ", roc_auc_score(y_train, y_pred_proba)) |
|
|
173 |
print("Threshold value is:", optimal_threshold) |
|
|
174 |
for i in range(len(y_pred)): |
|
|
175 |
if y_pred_proba[i]<optimal_threshold: |
|
|
176 |
y_pred[i]=0 |
|
|
177 |
else: y_pred[i]=1 |
|
|
178 |
|
|
|
179 |
print(confusion_matrix(y_train,y_pred)) |
|
|
180 |
print(classification_report(y_train,y_pred)) |
|
|
181 |
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred)) |
|
|
182 |
|
|
|
183 |
|
|
|
184 |
y_pred = best_model.predict(X_test) |
|
|
185 |
y_pred_proba=best_model.predict_proba(X_test)[:,1] |
|
|
186 |
print(confusion_matrix(y_test,y_pred)) |
|
|
187 |
print(classification_report(y_test,y_pred)) |
|
|
188 |
print("Accuracy test: ", accuracy_score(y_test, y_pred)) |
|
|
189 |
print("AUC test: ", roc_auc_score(y_test, y_pred_proba)) |
|
|
190 |
for i in range(len(y_pred)): |
|
|
191 |
if y_pred_proba[i]<optimal_threshold: |
|
|
192 |
y_pred[i]=0 |
|
|
193 |
else: y_pred[i]=1 |
|
|
194 |
|
|
|
195 |
print(confusion_matrix(y_test,y_pred)) |
|
|
196 |
print(classification_report(y_test,y_pred)) |
|
|
197 |
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred)) |
|
|
198 |
|
|
|
199 |
|
|
|
200 |
y_pred_proba=best_model.predict_proba(X_test)[:,1] |
|
|
201 |
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba) |
|
|
202 |
roc_auc = metrics.auc(fpr, tpr) |
|
|
203 |
|
|
|
204 |
fig, ax = plt.subplots() |
|
|
205 |
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) |
|
|
206 |
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) |
|
|
207 |
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic") |
|
|
208 |
ax.legend(loc = 'lower right') |
|
|
209 |
plt.xlabel("False Positive Rate", fontsize=14) |
|
|
210 |
plt.ylabel("True Positive Rate", fontsize=14) |
|
|
211 |
ax.tick_params(axis='both', which='major', labelsize=14) |
|
|
212 |
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc.pdf") |
|
|
213 |
plt.close('all') |
|
|
214 |
|
|
|
215 |
|
|
|
216 |
|
|
|
217 |
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0) |
|
|
218 |
print(features["label"].value_counts()) |
|
|
219 |
y_test = np.array(features['label']) |
|
|
220 |
X_test= features.drop('label', axis = 1) |
|
|
221 |
|
|
|
222 |
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1]) |
|
|
223 |
y_pred_proba.columns=["pred_proba"] |
|
|
224 |
y_pred_proba["samples"]=X_test.index.values |
|
|
225 |
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds.txt", sep='\t', |
|
|
226 |
index=False) |
|
|
227 |
|
|
|
228 |
#%% |
|
|
229 |
# PART3: important overlapping genes during CV-s |
|
|
230 |
import os |
|
|
231 |
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus") |
|
|
232 |
|
|
|
233 |
import dill |
|
|
234 |
dill.load_session('globalsave_part1.pkl') |
|
|
235 |
|
|
|
236 |
big_model_AUCs=valid_auc |
|
|
237 |
coefs_array=coefs_array/100 |
|
|
238 |
total_coefs=np.absolute(coefs_array).sum() |
|
|
239 |
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False) |
|
|
240 |
|
|
|
241 |
toplot=pd.DataFrame(data=coefs_array, columns=X_train.columns.values).transpose() |
|
|
242 |
toplot.columns=["coefs"] |
|
|
243 |
toplot["abs"]=np.absolute(coefs_array).transpose() |
|
|
244 |
toplot=toplot.sort_values("abs", axis=0, ascending=False).head(12) |
|
|
245 |
toplot.to_csv("to_plot_barchart.txt", index=True, sep="\t") |
|
|
246 |
|
|
|
247 |
ori_X_train=pd.DataFrame(X_train) |
|
|
248 |
ori_X_test=pd.DataFrame(X_test) |
|
|
249 |
loop_auc=[] |
|
|
250 |
loop_auc_test=[] |
|
|
251 |
loop_pvals=[] |
|
|
252 |
loop_cum_coef=[] |
|
|
253 |
|
|
|
254 |
for i in range(1,200): |
|
|
255 |
valid_auc=[] |
|
|
256 |
valid_auc_test=[] |
|
|
257 |
gene_mask=coefs_pd.index.values[:i] |
|
|
258 |
X_train=ori_X_train[gene_mask] |
|
|
259 |
X_test=ori_X_test[gene_mask] |
|
|
260 |
|
|
|
261 |
for train_idx, val_idx in tqdm(cv_outer.split(X_train, y_train)): |
|
|
262 |
train_data, val_data = X_train.iloc[train_idx], X_train.iloc[val_idx] |
|
|
263 |
train_target, val_target = y_train[train_idx], y_train[val_idx] |
|
|
264 |
|
|
|
265 |
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", |
|
|
266 |
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, |
|
|
267 |
l1_ratios=l1_ratios) |
|
|
268 |
best_model.fit(train_data, train_target) |
|
|
269 |
|
|
|
270 |
y_pred_prob = best_model.predict_proba(val_data)[:,1] |
|
|
271 |
auc = roc_auc_score(val_target, y_pred_prob) |
|
|
272 |
valid_auc.append(auc) |
|
|
273 |
mean_auc=statistics.mean(valid_auc) |
|
|
274 |
|
|
|
275 |
best_model.fit(X_train, y_train) |
|
|
276 |
y_pred_prob = best_model.predict_proba(X_test)[:,1] |
|
|
277 |
auc_test = roc_auc_score(y_test, y_pred_prob) |
|
|
278 |
valid_auc_test.append(auc_test) |
|
|
279 |
mean_auc_test=statistics.mean(valid_auc_test) |
|
|
280 |
|
|
|
281 |
loop_auc.append(mean_auc) |
|
|
282 |
loop_auc_test.append(mean_auc_test) |
|
|
283 |
loop_pvals.append(scipy.stats.wilcoxon(big_model_AUCs, valid_auc, alternative="two-sided").pvalue) |
|
|
284 |
loop_cum_coef.append(coefs_pd[0][0:i].sum()) |
|
|
285 |
|
|
|
286 |
|
|
|
287 |
dill.dump_session('globalsave_part3.pkl') |
|
|
288 |
|
|
|
289 |
data=pd.DataFrame(loop_auc, columns=["auc"]) |
|
|
290 |
data["auc_test"]=loop_auc_test |
|
|
291 |
data["pval"]=loop_pvals |
|
|
292 |
data["cum_coef"]=loop_cum_coef |
|
|
293 |
data.to_csv("results.txt", index=False, sep="\t") |
|
|
294 |
|
|
|
295 |
fig, ax = plt.subplots() |
|
|
296 |
ax.plot(range(1,200), loop_auc, "brown", label="Mean AUCs") |
|
|
297 |
ax.plot(range(1,200), loop_auc_test, "green", label="Test AUCs") |
|
|
298 |
plt.ylabel('AUC scores', fontsize=14) |
|
|
299 |
plt.xlabel('Number of genes', fontsize=14) |
|
|
300 |
plt.legend() |
|
|
301 |
plt.title("Results of iterative analysis") |
|
|
302 |
ax.tick_params(axis='both', which='major', labelsize=14) |
|
|
303 |
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/loop_AUCs_min_num_genes.pdf") |
|
|
304 |
plt.close('all') |
|
|
305 |
|
|
|
306 |
|
|
|
307 |
|
|
|
308 |
#%% |
|
|
309 |
# PART4: results with min num genes |
|
|
310 |
import os |
|
|
311 |
os.chdir("/data10/working_groups/balint_group/gargya.peter/R/uterus") |
|
|
312 |
|
|
|
313 |
import dill |
|
|
314 |
dill.load_session('globalsave_part1.pkl') |
|
|
315 |
|
|
|
316 |
big_model_AUCs=valid_auc |
|
|
317 |
coefs_pd=pd.DataFrame(data=np.absolute(coefs_array), columns=X_train.columns.values).transpose().sort_values(0, axis=0, ascending=False) |
|
|
318 |
|
|
|
319 |
gene_mask=coefs_pd.index.values[:12] |
|
|
320 |
print(gene_mask) |
|
|
321 |
X_train=X_train[gene_mask] |
|
|
322 |
X_test=X_test[gene_mask] |
|
|
323 |
|
|
|
324 |
best_model = LogisticRegressionCV(random_state=random_state, cv=cv, scoring=szempont, penalty="elasticnet", |
|
|
325 |
fit_intercept=True, solver="saga", Cs=10, n_jobs=15, max_iter=20000, |
|
|
326 |
l1_ratios=l1_ratios) |
|
|
327 |
best_model.fit(X_train, y_train) |
|
|
328 |
print("Best C: ", best_model.C_) |
|
|
329 |
print("Best l1_ratio: ", best_model.l1_ratio_) |
|
|
330 |
print ('Max auc_roc:', best_model.scores_[1].mean(axis=0).max()) |
|
|
331 |
|
|
|
332 |
|
|
|
333 |
y_pred = best_model.predict(X_train) |
|
|
334 |
y_pred_proba=best_model.predict_proba(X_train)[:,1] |
|
|
335 |
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=True) |
|
|
336 |
optimal_idx = np.argmax(tpr - fpr) |
|
|
337 |
optimal_threshold = thresholds[optimal_idx] |
|
|
338 |
print(confusion_matrix(y_train,y_pred)) |
|
|
339 |
print(classification_report(y_train,y_pred)) |
|
|
340 |
print("Accuracy train: ", accuracy_score(y_train, y_pred)) |
|
|
341 |
print("AUC train: ", roc_auc_score(y_train, y_pred_proba)) |
|
|
342 |
print("Threshold value is:", optimal_threshold) |
|
|
343 |
for i in range(len(y_pred)): |
|
|
344 |
if y_pred_proba[i]<optimal_threshold: |
|
|
345 |
y_pred[i]=0 |
|
|
346 |
else: y_pred[i]=1 |
|
|
347 |
|
|
|
348 |
print(confusion_matrix(y_train,y_pred)) |
|
|
349 |
print(classification_report(y_train,y_pred)) |
|
|
350 |
print("Accuracy train after thresholding: ", accuracy_score(y_train, y_pred)) |
|
|
351 |
|
|
|
352 |
|
|
|
353 |
train_pred_proba=pd.DataFrame(y_pred_proba) |
|
|
354 |
train_pred_proba["samples"]=X_train.index.values |
|
|
355 |
|
|
|
356 |
|
|
|
357 |
y_pred = best_model.predict(X_test) |
|
|
358 |
y_pred_proba=best_model.predict_proba(X_test)[:,1] |
|
|
359 |
print(confusion_matrix(y_test,y_pred)) |
|
|
360 |
print(classification_report(y_test,y_pred)) |
|
|
361 |
print("Accuracy test: ", accuracy_score(y_test, y_pred)) |
|
|
362 |
print("AUC test: ", roc_auc_score(y_test, y_pred_proba)) |
|
|
363 |
for i in range(len(y_pred)): |
|
|
364 |
if y_pred_proba[i]<optimal_threshold: |
|
|
365 |
y_pred[i]=0 |
|
|
366 |
else: y_pred[i]=1 |
|
|
367 |
|
|
|
368 |
print(confusion_matrix(y_test,y_pred)) |
|
|
369 |
print(classification_report(y_test,y_pred)) |
|
|
370 |
print("Accuracy test after thresholding: ", accuracy_score(y_test, y_pred)) |
|
|
371 |
|
|
|
372 |
|
|
|
373 |
y_pred_proba=best_model.predict_proba(X_test)[:,1] |
|
|
374 |
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba) |
|
|
375 |
roc_auc = metrics.auc(fpr, tpr) |
|
|
376 |
fig, ax = plt.subplots() |
|
|
377 |
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) |
|
|
378 |
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) |
|
|
379 |
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="Receiver operating characteristic") |
|
|
380 |
ax.legend(loc = 'lower right') |
|
|
381 |
plt.xlabel("False Positive Rate", fontsize=14) |
|
|
382 |
plt.ylabel("True Positive Rate", fontsize=14) |
|
|
383 |
ax.tick_params(axis='both', which='major', labelsize=14) |
|
|
384 |
plt.savefig("/data10/working_groups/balint_group/gargya.peter/R/uterus/test_roc_with_min_num_genes.pdf") |
|
|
385 |
plt.close('all') |
|
|
386 |
|
|
|
387 |
|
|
|
388 |
features = pd.read_table('uterus_rnaseq_VST_G2.txt', sep="\t", index_col=0) |
|
|
389 |
print(features["label"].value_counts()) |
|
|
390 |
y_test = np.array(features['label']) |
|
|
391 |
X_test= features.drop('label', axis = 1) |
|
|
392 |
|
|
|
393 |
X_test=X_test[gene_mask] |
|
|
394 |
|
|
|
395 |
y_pred_proba=pd.DataFrame(best_model.predict_proba(X_test)[:,1]) |
|
|
396 |
y_pred_proba.columns=["pred_proba"] |
|
|
397 |
y_pred_proba["samples"]=X_test.index.values |
|
|
398 |
y_pred_proba.to_csv("/data10/working_groups/balint_group/gargya.peter/R/uterus/G2_preds_with_mingenes.txt", sep='\t', |
|
|
399 |
index=False) |