Switch to unified view

a b/AttentionMOI/src/preprocess.py
1
import sys
2
import pandas as pd
3
4
5
def read_(file):
    """Load a CSV table into a DataFrame, using its first column as the index.

    Args:
        file: Path string whose name ends in ``.csv`` or ``.csv.gz``.

    Returns:
        The ``pandas.DataFrame`` parsed from ``file``.

    Raises:
        SystemExit: With exit status 1 when the file suffix is neither
            ``.csv`` nor ``.csv.gz``.
    """
    if file.endswith('.csv'):
        return pd.read_csv(file, index_col=0)
    if file.endswith('.csv.gz'):
        return pd.read_csv(file, compression='gzip', index_col=0)

    # Unsupported suffix: report on stderr and exit with a non-zero status so
    # that shells and pipelines see the failure (status 0 would mean success).
    print('\n[Error]: The program cannot infer the format of {} . Currently, only the csv format is supported, please ensure that the file name suffix is .csv or .csv.gz.'.format(file), file=sys.stderr)
    sys.exit(1)
15
16
17
def read_omics(args):
    """Read every omics table listed in ``args.omic_file``.

    Each file is loaded with ``read_`` and its missing values are replaced
    by 0.

    Args:
        args: Object exposing ``omic_file``, an iterable of file paths.

    Returns:
        A list of DataFrames, one per path, in the order given.
    """
    return [read_(path).fillna(0) for path in args.omic_file]
24
25
26
def read_label(args):
    """Read the label table and name its first data column ``label``.

    Args:
        args: Object exposing ``label_file``, the path of the label table.

    Returns:
        The DataFrame loaded by ``read_`` with its first column renamed to
        ``'label'``.
    """
    table = read_(args.label_file)
    first_column = table.columns.values[0]
    return table.rename(columns={first_column: 'label'})
32
33
34
def read_clin(args):
    """Read the optional clinical-feature table.

    Args:
        args: Object exposing ``clin_file``; either a file path or ``None``.

    Returns:
        ``None`` when ``args.clin_file`` is ``None``; otherwise the DataFrame
        loaded by ``read_`` with missing values replaced by 0.
    """
    path = args.clin_file
    if path is None:
        return None
    # fill na
    return read_(path).fillna(0)
42
43
def process(df_omics, df_label, df_clin):
    """Restrict all tables to the patients they share and sort them by index.

    The shared patients are the index labels present in every omics table,
    in the label table and, when given, in the clinical table.

    Args:
        df_omics: List of omics DataFrames indexed by patient id.
        df_label: Label DataFrame indexed by patient id.
        df_clin: Clinical DataFrame indexed by patient id, or ``None``.

    Returns:
        A tuple ``(omics, label, clin)`` where ``omics`` is a new list of
        filtered, index-sorted DataFrames (the list passed in is left
        untouched), ``label`` is the filtered, index-sorted label table and
        ``clin`` is the filtered, index-sorted clinical table or ``None``.
    """
    # Collect the patient ids of every table taking part in the alignment.
    tables = list(df_omics)
    tables.append(df_label)
    if df_clin is not None:
        tables.append(df_clin)

    # Narrow a single running set instead of rebuilding a list per table.
    shared = set(tables[0].index.to_list())
    for table in tables[1:]:
        shared.intersection_update(table.index.to_list())
    patients_shared = list(shared)

    # Build a new list rather than overwriting the caller's list in place;
    # sort_index makes the row order deterministic and equal across tables.
    aligned_omics = [
        table.loc[patients_shared, :].sort_index() for table in df_omics
    ]
    df_label = df_label.loc[patients_shared, :].sort_index()
    if df_clin is not None:
        df_clin = df_clin.loc[patients_shared, :].sort_index()
    return aligned_omics, df_label, df_clin
62
63
64
# api
65
def read_dataset(args):
    """Load the omics, label and clinical tables and align them on patients.

    Args:
        args: Object passed through to ``read_omics``, ``read_label`` and
            ``read_clin``.

    Returns:
        The tuple ``(df_omics, df_label, df_clin)`` produced by ``process``
        from the freshly read tables.
    """
    # Step 1: load the raw tables (omics, then labels, then clinical features).
    raw_omics = read_omics(args)
    raw_label = read_label(args)
    raw_clin = read_clin(args)

    # Step 2: restrict every table to the shared patients.
    clean_omics, clean_label, clean_clin = process(raw_omics, raw_label, raw_clin)

    # Step 3: hand back the cleaned dataset.
    return clean_omics, clean_label, clean_clin