|
a |
|
b/AttentionMOI/src/preprocess.py |
|
|
1 |
import sys |
|
|
2 |
import pandas as pd |
|
|
3 |
|
|
|
4 |
|
|
|
5 |
def read_(file):
    """Load a CSV table into a DataFrame, using the first column as the index.

    Args:
        file: Path string of the table to read. Its name must end in
            ``.csv`` or ``.csv.gz``; the latter is read with gzip
            decompression.

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        SystemExit: With exit status 1 when the file name has neither of the
            supported suffixes.
    """
    if file.endswith('.csv'):
        return pd.read_csv(file, index_col=0)
    if file.endswith('.csv.gz'):
        return pd.read_csv(file, compression='gzip', index_col=0)

    # An unsupported format is a failure: report it on stderr and exit with a
    # non-zero status so shells and pipelines do not mistake it for success.
    print('\n[Error]: The program cannot infer the format of {} . Currently, only the csv format is supported, please ensure that the file name suffix is .csv or .csv.gz.'.format(file), file=sys.stderr)
    sys.exit(1)
|
|
15 |
|
|
|
16 |
|
|
|
17 |
def read_omics(args):
    """Read every omics table listed in ``args.omic_file``.

    Each path is loaded with ``read_`` and missing values are replaced by 0.

    Args:
        args: Object exposing ``omic_file``, an iterable of file paths.

    Returns:
        list: One DataFrame per path, in the order the paths were given.
    """
    return [read_(path).fillna(0) for path in args.omic_file]
|
|
24 |
|
|
|
25 |
|
|
|
26 |
def read_label(args):
    """Read the label table and name its first column ``'label'``.

    Args:
        args: Object exposing ``label_file``, the path of the label table.

    Returns:
        pandas.DataFrame: The table loaded by ``read_`` with its first
        column renamed to ``'label'``.
    """
    labels = read_(args.label_file)
    first_column = labels.columns.values[0]
    return labels.rename(columns={first_column: 'label'})
|
|
32 |
|
|
|
33 |
|
|
|
34 |
def read_clin(args):
    """Read the optional clinical-feature table.

    Args:
        args: Object exposing ``clin_file``; either a file path or ``None``
            when no clinical table is supplied.

    Returns:
        pandas.DataFrame or None: The table loaded by ``read_`` with missing
        values replaced by 0, or ``None`` when ``args.clin_file`` is ``None``.
    """
    path = args.clin_file
    if path is None:
        return None
    # fill na
    return read_(path).fillna(0)
|
|
42 |
|
|
|
43 |
def process(df_omics, df_label, df_clin):
    """Restrict all tables to the row labels they have in common.

    The index labels of every omics table, of the label table and (when
    given) of the clinical table are intersected; each table is then reduced
    to those rows and sorted by index.

    Args:
        df_omics: List of DataFrames. Its entries are replaced in place with
            the filtered, sorted frames.
        df_label: DataFrame of labels.
        df_clin: DataFrame of clinical features, or ``None``.

    Returns:
        tuple: ``(df_omics, df_label, df_clin)`` after filtering; ``df_clin``
        stays ``None`` when it was ``None`` on input.
    """
    has_clin = df_clin is not None

    # collect the row labels of every table taking part in the intersection
    id_lists = [frame.index.to_list() for frame in df_omics]
    id_lists.append(df_label.index.to_list())
    if has_clin:
        id_lists.append(df_clin.index.to_list())

    # fold the lists into the labels present in all of them
    shared = id_lists[0]
    for other in id_lists[1:]:
        shared = list(set(shared).intersection(other))

    # keep only the shared rows, ordered by index
    for pos, frame in enumerate(df_omics):
        df_omics[pos] = frame.loc[shared, :].sort_index()
    df_label = df_label.loc[shared, :].sort_index()
    if has_clin:
        df_clin = df_clin.loc[shared, :].sort_index()
    return df_omics, df_label, df_clin
|
|
62 |
|
|
|
63 |
|
|
|
64 |
# api |
|
|
65 |
def read_dataset(args):
    """Load the omics, label and clinical tables and align them.

    Args:
        args: Object passed unchanged to ``read_omics``, ``read_label`` and
            ``read_clin``.

    Returns:
        tuple: ``(df_omics, df_label, df_clin)`` as returned by ``process``.
    """
    # 1. read the raw tables: omics first, then labels, then clinical features
    raw_omics = read_omics(args)
    raw_label = read_label(args)
    raw_clin = read_clin(args)

    # 2. align the tables and hand back the clean dataset
    return process(raw_omics, raw_label, raw_clin)