AttentionMOI/src/utils.py

import os, sys, time, torch, gzip
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score, accuracy_score
from captum.attr import IntegratedGradients
from sklearn.impute import KNNImputer


def clin_read_tsi(in_file, out_file, task, threshold=2):
    """Process a raw clinical dataset downloaded from LinkedOmics (.tsi format).

    Args:
        in_file (string): The raw clinical dataset file, ending with .tsi, a tab-delimited file.
        out_file (string): Output file name prefix; results are written as .csv files.
        task (string): The task type, one of: LST, pan-class.
        threshold (int, optional): Threshold in years used to split patients into long-term and non-long-term survivors. Defaults to 2.

    Returns:
        Writes two .csv files: <out_file>_clinical.csv and <out_file>_label.csv.
    """
    df_clin = pd.read_table(in_file, index_col=0)

    # format the table: samples as rows, drop patients with missing survival data
    df_clin = df_clin.T
    df_clin = df_clin[~df_clin.overall_survival.isna()]
    df_clin = df_clin[~df_clin.status.isna()]
    df_clin.overall_survival = df_clin.overall_survival.astype('int')
    df_clin.status = df_clin.status.astype('int')

    if task == "LST":
        # label patients: 1 = died within threshold years, 0 = survived at least threshold years
        df_clin.loc[(df_clin.overall_survival < threshold*365) & (df_clin.status == 1), 'label'] = 1
        df_clin.loc[(df_clin.overall_survival >= threshold*365), 'label'] = 0
        df_clin = df_clin[~df_clin.label.isna()]
    else:
        # label patients by histological type
        value_map = dict((v, i) for i, v in enumerate(pd.unique(df_clin["histological_type"])))
        print("\nLabels are encoded into {} categories, encoding dictionary is: {}".format(len(value_map), value_map))
        df_clin["label"] = df_clin["histological_type"]
        df_clin = df_clin.replace({"label": value_map})

    # output: clinical file and label file
    df_clin.to_csv(out_file + "_clinical.csv", index=True)
    label = df_clin[["label"]].astype("int64")
    label.to_csv(out_file + "_label.csv", index=True)
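
# Example usage (illustrative sketch; the file names below are hypothetical):
#   clin_read_tsi("GBM_clinical.tsi", "GBM", task="LST", threshold=2)
#   # writes GBM_clinical.csv and GBM_label.csv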
|
|
# clinical feature processing:
# - add mutation info to the clinical features (CGC genes and total number of mutated genes)
# - one-hot encode categorical features
def process_clin(df_clin, df_mut, outfile, task):
    df_mut = df_mut.fillna(0)
    df = df_clin.merge(df_mut, left_index=True, right_index=True, how="left")
    # drop the label and outcome columns so they do not leak into the features
    if task == "LST":
        df = df.drop(["overall_survival", "status", "overallsurvival", "label"], axis=1)
    else:
        df = df.drop(["histological_type", "label", "overallsurvival"], axis=1)
    # one-hot encode the remaining categorical (object-dtype) columns
    categorical = df.columns[(df.dtypes == "object").values].to_list()
    df = pd.concat([df.drop(categorical, axis=1),
                    pd.get_dummies(df[categorical])],
                   axis=1)
    df.to_csv(outfile + "_clinical_tmp.csv", index=True)
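
# Example usage (illustrative sketch; df_clin and df_mut are assumed to be
# patient-indexed DataFrames, and "GBM" is a hypothetical output prefix):
#   process_clin(df_clin, df_mut, "GBM", task="LST")  # writes GBM_clinical_tmp.csv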
|
|
def check_files(files):
    """Check that the given path(s) exist; exit with an error message otherwise.

    files (str or list)
    """
    if isinstance(files, list):
        for f in files:
            if not os.path.exists(f):
                print('[Error] {} not found.'.format(f))
                sys.exit(1)
    elif isinstance(files, str):
        if not os.path.exists(files):
            print('[Error] {} not found.'.format(files))
            sys.exit(1)
    else:
        print('[Error] {} is not a valid file path.'.format(files))
        sys.exit(1)
|
|
# initialize log.txt
def init_log(args):
    with open(os.path.join(args.outdir, 'log.txt'), 'a') as file:
        run_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        head_line = "Perform model training at {}".format(run_time)
        file.write(head_line + '\n\n')
        if args.FSD:
            if args.clin_file:
                file.write("-------Using FSD and clinical file, feature selection method {}, omic group {} clin, and model {}------------\n".format(args.method, args.omic_name, args.model))
            else:
                file.write("-------Using FSD, feature selection method {}, omic group {}, and model {}------------\n".format(args.method, args.omic_name, args.model))
        else:
            file.write("-------No FSD, using feature selection method {}, omic group {}, and model {}------------\n".format(args.method, args.omic_name, args.model))
|
|
# for evaluation
def evaluate(pred_prob, real_label, average="macro"):
    y_pred = np.argmax(pred_prob, 1)
    acc = accuracy_score(real_label, y_pred)
    if pred_prob.shape[1] == 2:
        # binary classification: score AUC on the positive-class probability
        prec = precision_score(real_label, y_pred)
        f1 = f1_score(real_label, y_pred)
        recall = recall_score(real_label, y_pred)
        auc = roc_auc_score(real_label, pred_prob[:, 1])
    else:
        # multiclass: macro-averaged metrics and one-vs-one AUC on the class probabilities
        prec = precision_score(real_label, y_pred, average='macro')
        f1 = f1_score(real_label, y_pred, average=average)
        recall = recall_score(real_label, y_pred, average=average)
        auc = roc_auc_score(real_label, pred_prob, average='macro', multi_class='ovo')
    return acc, prec, f1, auc, recall
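
# Example usage (illustrative sketch; `probs` and `labels` are hypothetical arrays
# of shape (n_samples, n_classes) and (n_samples,) respectively):
#   probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
#   labels = np.array([0, 1, 0])
#   acc, prec, f1, auc, recall = evaluate(probs, labels)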
|
|
# explain a DNN model with Integrated Gradients
def ig(args, model, dataset, feature_names, omic_group, target=1):
    # prepare input data: concatenate all batches into one tensor
    input_tensor_list = [data for data, labels in dataset]
    input_tensor = torch.cat(input_tensor_list, 0)
    input_tensor.requires_grad_()

    # instantiation
    explainer = IntegratedGradients(model)

    # calculate feature importance using IG, averaged over samples
    attr, _ = explainer.attribute(input_tensor, return_convergence_delta=True, target=target)
    attr = attr.detach().numpy()
    feat_importance = np.mean(attr, axis=0)

    # result
    df_imp = pd.DataFrame({'Feature': feature_names,
                           'Omic': omic_group,
                           'Target': [target] * len(feature_names),
                           'Importance_value': feat_importance,
                           'Importance_value_abs': abs(feat_importance)})
    df_imp = df_imp.sort_values('Importance_value_abs', ascending=False)

    # output: the file name records whether FSD and a clinical file were used
    fsd_tag = 'FSD_' if args.FSD else ''
    clin_tag = '_clin' if args.clin_file else ''
    out_name = 'feature_importance_{}{}{}_DNN_{}_target{}.csv'.format(fsd_tag, args.method, clin_tag, args.omic_name, target)
    df_imp.to_csv(os.path.join(args.outdir, out_name), index=False)

    return df_imp
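
# Example usage (illustrative sketch; `args` is the parsed argparse namespace with
# .FSD, .clin_file, .method, .omic_name and .outdir attributes, and `loader` is
# assumed to yield (features, labels) batches for the trained model):
#   df_imp = ig(args, trained_model, loader, feature_names, omic_group, target=1)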
|
|
# explain a Net model (separate DNA and RNA inputs) with Integrated Gradients
def ig_net(args, model, dataset, feature_names, omic_group, target=1):
    # prepare input data: concatenate all batches for each omic separately
    input_tensor_dna, input_tensor_rna = [], []
    for data_dna, data_rna, labels in dataset:
        input_tensor_dna.append(data_dna)
        input_tensor_rna.append(data_rna)
    input_tensor_dna = torch.cat(input_tensor_dna, 0).requires_grad_()
    input_tensor_rna = torch.cat(input_tensor_rna, 0).requires_grad_()

    # instantiation
    explainer = IntegratedGradients(model)

    # calculate feature importance using IG; attr is a tuple (one tensor per input)
    attr, _ = explainer.attribute((input_tensor_dna, input_tensor_rna), return_convergence_delta=True, target=target)
    feat_importance = []
    for tensor in attr:
        tensor = tensor.detach().numpy()
        feat_importance.append(np.mean(tensor, axis=0))

    # result
    df_imp = pd.DataFrame({'Feature': feature_names,
                           'Omic': omic_group,
                           'Target': [target] * len(feature_names),
                           'Importance_value': np.concatenate(feat_importance),
                           'Importance_value_abs': abs(np.concatenate(feat_importance))})
    df_imp = df_imp.sort_values('Importance_value_abs', ascending=False)

    # output: the file name records whether FSD and a clinical file were used
    fsd_tag = 'FSD_' if args.FSD else ''
    clin_tag = '_clin' if args.clin_file else ''
    out_name = 'feature_importance_{}{}{}_Net_{}_target{}.csv'.format(fsd_tag, args.method, clin_tag, args.omic_name, target)
    df_imp.to_csv(os.path.join(args.outdir, out_name), index=False)

    return df_imp
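
# Example usage (illustrative sketch; `loader` is assumed to yield
# (dna_features, rna_features, labels) batches matching the trained network):
#   df_imp = ig_net(args, trained_net, loader, feature_names, omic_group, target=1)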