# AttentionMOI/src/utils.py
1
import os, sys, time, torch, gzip
2
import numpy as np
3
import pandas as pd
4
from sklearn.metrics import f1_score, recall_score, roc_auc_score, precision_score, accuracy_score
5
from captum.attr import IntegratedGradients
6
from sklearn.impute import KNNImputer
7
8
9
def clin_read_tsi(in_file, out_file, task, threshold=2):
    """Process a raw LinkedOmics clinical dataset (.tsi) into clinical and label CSV files.

    The file is read as a tab-delimited table and transposed, so it is expected
    to hold one clinical attribute per row and one patient per column. Patients
    with a missing ``overall_survival`` or ``status`` value are discarded.

    Args:
        in_file (string): The raw clinical dataset file, ending with .tsi, a tab-delimited file.
        out_file (string): Output file prefix. ``<out_file>_clinical.csv`` and
            ``<out_file>_label.csv`` are written.
        task (string): Task type. ``"LST"`` labels patients by survival time; any
            other value (e.g. ``"pan-class"``) labels them by ``histological_type``.
        threshold (int, optional): Number of years (counted as 365 days each) used
            to split patients in the ``"LST"`` task. Defaults to 2.

    Return:
        None. The results are written to the two .csv files.
    """
    # format the table: after transposing, each row is one patient
    df_clin = pd.read_table(in_file, index_col=0).T
    df_clin = df_clin.dropna(subset=["overall_survival", "status"])
    df_clin["overall_survival"] = df_clin["overall_survival"].astype("int")
    df_clin["status"] = df_clin["status"].astype("int")

    if task == "LST":
        # label patients:
        #   1 -> overall_survival below the cutoff AND status == 1
        #   0 -> overall_survival at or above the cutoff
        # Patients matching neither rule keep NaN and are dropped.
        cutoff_days = threshold * 365
        survival = df_clin["overall_survival"]
        df_clin["label"] = np.nan
        df_clin.loc[(survival < cutoff_days) & (df_clin["status"] == 1), "label"] = 1
        df_clin.loc[survival >= cutoff_days, "label"] = 0
        df_clin = df_clin.dropna(subset=["label"])
    else:
        # label patients: integer code per histological type, in order of first appearance
        value_map = {v: i for i, v in enumerate(pd.unique(df_clin["histological_type"]))}
        print("\nLabels are encoded in to {} categories, encoding dictionary is: {}".format(len(value_map), value_map))
        df_clin["label"] = df_clin["histological_type"].map(value_map)

    # output: clinical file and label file
    df_clin.to_csv(out_file + "_clinical.csv", index=True)
    label = df_clin[["label"]].astype("int64")
    label.to_csv(out_file + "_label.csv", index=True)
42
43
44
# clinical feature processing
45
# add mutation info to clinical feature - CGC genes and total mutated gene number
46
# one hot encode of features
47
def process_clin(df_clin, df_mut, outfile, task):
    """Merge mutation features into the clinical table, one-hot encode it and save it.

    Args:
        df_clin (pandas.DataFrame): Clinical table; must contain the columns
            dropped for the chosen task (see below).
        df_mut (pandas.DataFrame): Mutation features, left-joined onto
            ``df_clin`` by index.
        outfile (string): Output prefix; ``<outfile>_clinical_tmp.csv`` is written.
        task (string): ``"LST"`` drops the survival columns and the label; any
            other value drops ``histological_type`` and the label instead.

    Return:
        None. The processed table is written to a .csv file.
    """
    # NOTE(review): NaNs are filled in df_mut *before* the left join, so rows of
    # df_clin with no match in df_mut still end up with NaN mutation features.
    df = df_clin.merge(df_mut.fillna(0), left_index=True, right_index=True, how="left")

    # Columns removed before encoding; the two tasks differ only in this list.
    if task == "LST":
        excluded = ["overall_survival", "status", "overallsurvival", "label"]
    else:
        excluded = ["histological_type", "label", "overallsurvival"]
    df = df.drop(excluded, axis=1)

    # one hot encode of the object-typed (categorical) features
    categorical = df.columns[(df.dtypes == "object").values].to_list()
    df = pd.concat([df.drop(categorical, axis=1),
                    pd.get_dummies(df[categorical])],
                   axis=1)
    df.to_csv(outfile + "_clinical_tmp.csv", index=True)
63
64
65
def check_files(files):
    """Check that the given path(s) exist; print an error and exit with status 1 otherwise.

    Args:
        files (str or list): A single path, or a list of paths. Any other type
            is reported as a wrong file path and also terminates the program.

    Return:
        None when every path exists.
    """
    if isinstance(files, list):
        paths = files
    elif isinstance(files, str):
        paths = [files]
    else:
        print('[Error] {} file path is wrong.'.format(files))
        sys.exit(1)

    # Stop at the first missing path.
    for path in paths:
        if not os.path.exists(path):
            print('[Error] {} not found.'.format(path))
            sys.exit(1)
81
82
83
# init log.txt
84
def init_log(args):
    """Append a run header to ``log.txt`` inside ``args.outdir``.

    Two entries are written: a timestamped headline followed by a blank line,
    and one line summarising the run settings.

    Args:
        args: Object exposing ``outdir``, ``FSD``, ``method``, ``omic_name`` and
            ``model``; ``clin_file`` is only read when ``FSD`` is truthy.

    Return:
        None.
    """
    run_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    head_line = "Perform model training at {}".format(run_time)

    # Pick the settings template; clin_file only matters when FSD is enabled.
    if args.FSD and args.clin_file:
        template = "-------Using FSD and clinical file, feature selection method {}, omic group {} clin, and model {}------------\n"
    elif args.FSD:
        template = "-------Using FSD, feature selection method {}, omic group {}, and model {}------------\n"
    else:
        template = "-------No FSD, using feature selection method {}, omic group {}, and model {}------------\n"
    setting_line = template.format(args.method, args.omic_name, args.model)

    # Append mode keeps the history of earlier runs; the context manager closes the file.
    with open(os.path.join(args.outdir, 'log.txt'), 'a') as file:
        file.write(head_line + '\n\n')
        file.write(setting_line)
97
98
# for evaluation
99
def evaluate(pred_prob, real_label, average="macro"):
    """Compute classification metrics from predicted class probabilities.

    Args:
        pred_prob: 2-D array of per-class scores, indexed as ``pred_prob[:, k]``;
            the predicted class is the argmax along axis 1.
        real_label: True class labels, passed unchanged to the sklearn metrics.
        average (string, optional): Averaging mode for precision, F1 and recall
            in the multiclass case. Defaults to "macro". Ignored when there are
            exactly two classes.

    Return:
        Tuple ``(acc, prec, f1, auc, recall)``.
    """
    n_classes = pred_prob.shape[1]
    y_pred = np.argmax(pred_prob, 1)
    acc = accuracy_score(real_label, y_pred)

    if n_classes == 2:
        # For evaluating binary classification models: sklearn defaults are used,
        # and AUC is computed from the score of class 1.
        prec = precision_score(real_label, y_pred)
        f1 = f1_score(real_label, y_pred)
        recall = recall_score(real_label, y_pred)
        auc = roc_auc_score(real_label, pred_prob[:, 1])
    else:
        # For evaluating multiclass models: precision now honours `average`
        # like F1 and recall (it was previously fixed to 'macro').
        prec = precision_score(real_label, y_pred, average=average)
        f1 = f1_score(real_label, y_pred, average=average)
        recall = recall_score(real_label, y_pred, average=average)
        # AUC stays macro-averaged one-vs-one regardless of `average`.
        auc = roc_auc_score(real_label, pred_prob, average='macro', multi_class='ovo')
    return acc, prec, f1, auc, recall
117
118
119
# for explain dnn model
120
def ig(args, model, dataset, feature_names, omic_group, target=1):
    """Explain a single-input (DNN) model with Integrated Gradients and save the ranking.

    Args:
        args: Object exposing ``outdir``, ``FSD``, ``clin_file``, ``method`` and
            ``omic_name``; these determine the output file name.
        model: Model passed to captum's ``IntegratedGradients``.
        dataset: Iterable of ``(data, labels)`` pairs; the ``data`` tensors are
            concatenated along dimension 0.
        feature_names: Feature names, one per attributed feature.
        omic_group: Omic group of each feature (stored in the ``Omic`` column).
        target (int, optional): Output index to attribute. Defaults to 1.

    Return:
        pandas.DataFrame sorted by absolute importance, also written to
        ``args.outdir`` as a .csv file.
    """
    # prepare input data
    input_tensor = torch.cat([data for data, _ in dataset], 0)
    input_tensor.requires_grad_()

    # instantiation (named `explainer` so the function name `ig` is not shadowed)
    explainer = IntegratedGradients(model)

    # calculating feature importance using IG, averaged over dimension 0
    # (assumes dimension 0 indexes samples — TODO confirm)
    attr, _ = explainer.attribute(input_tensor, return_convergence_delta=True, target=target)
    # .cpu() keeps the NumPy conversion working when the tensors live on an accelerator
    attr = attr.detach().cpu().numpy()
    feat_importance = np.mean(attr, axis=0)

    # result
    df_imp = pd.DataFrame({'Feature': feature_names,
                           'Omic': omic_group,
                           'Target': [target] * len(feature_names),
                           'Importance_value': feat_importance,
                           'Importance_value_abs': np.abs(feat_importance)
                           })
    df_imp = df_imp.sort_values('Importance_value_abs', ascending=False)

    # output: optional "FSD_" and "clin_" tags are inserted around the method name
    file_name = 'feature_importance_{}{}_{}DNN_{}_target{}.csv'.format(
        'FSD_' if args.FSD else '',
        args.method,
        'clin_' if args.clin_file else '',
        args.omic_name,
        target)
    df_imp.to_csv(os.path.join(args.outdir, file_name), index=False)

    return df_imp
156
157
 # for explain net model
158
def ig_net(args, model, dataset, feature_names, omic_group, target=1):
    """Explain a two-input (Net) model with Integrated Gradients and save the ranking.

    Args:
        args: Object exposing ``outdir``, ``FSD``, ``clin_file``, ``method`` and
            ``omic_name``; these determine the output file name.
        model: Model passed to captum's ``IntegratedGradients``; it is attributed
            with a pair of inputs ``(dna, rna)``.
        dataset: Iterable of ``(data_dna, data_rna, labels)`` triples; each input
            is concatenated along dimension 0.
        feature_names: Feature names, ordered as the first input's features
            followed by the second input's features.
        omic_group: Omic group of each feature (stored in the ``Omic`` column).
        target (int, optional): Output index to attribute. Defaults to 1.

    Return:
        pandas.DataFrame sorted by absolute importance, also written to
        ``args.outdir`` as a .csv file.
    """
    # prepare input data (single pass, so both inputs stay aligned per batch)
    dna_batches, rna_batches = [], []
    for data_dna, data_rna, _ in dataset:
        dna_batches.append(data_dna)
        rna_batches.append(data_rna)
    input_tensor_dna = torch.cat(dna_batches, 0).requires_grad_()
    input_tensor_rna = torch.cat(rna_batches, 0).requires_grad_()

    # instantiation (named `explainer` so it does not shadow the sibling function `ig`)
    explainer = IntegratedGradients(model)

    # calculating feature importance using IG: one attribution tensor per input,
    # each averaged over dimension 0 (assumes dimension 0 indexes samples — TODO confirm)
    attr, _ = explainer.attribute((input_tensor_dna, input_tensor_rna), return_convergence_delta=True, target=target)
    # .cpu() keeps the NumPy conversion working when the tensors live on an accelerator
    feat_importance = np.concatenate(
        [np.mean(tensor.detach().cpu().numpy(), axis=0) for tensor in attr])

    # result
    df_imp = pd.DataFrame({'Feature': feature_names,
                           'Omic': omic_group,
                           'Target': [target] * len(feature_names),
                           'Importance_value': feat_importance,
                           'Importance_value_abs': np.abs(feat_importance)
                           })
    df_imp = df_imp.sort_values('Importance_value_abs', ascending=False)

    # output: optional "FSD_" and "clin_" tags are inserted around the method name
    file_name = 'feature_importance_{}{}_{}Net_{}_target{}.csv'.format(
        'FSD_' if args.FSD else '',
        args.method,
        'clin_' if args.clin_file else '',
        args.omic_name,
        target)
    df_imp.to_csv(os.path.join(args.outdir, file_name), index=False)
    return df_imp