Diff of /data_preprocess.py [000000] .. [2d53aa]

Switch to unified view

a b/data_preprocess.py
1
import numpy as np
2
import pandas as pd
3
4
5
def data_preprocess(input_path, file_delimiter='\t', file_header=None, file_index_col=None, file_transpose=False,
                    norm_0_1=True, probe_filter=None, statistic_select_num=0, statistic_metrics='std'):
    """Load a numeric table from ``../data/``, clean it, and write the result back.

    Processing steps, in order:

    1. Read ``'../data/' + input_path`` with every data column as float32.
    2. Optionally drop the features (probes) listed in ``probe_filter``.
    3. Drop features, then samples, that have fewer than 90% non-missing
       values.
    4. Fill the remaining missing values with the mean of their feature.
    5. Optionally min-max scale the whole table to the range 0-1 using the
       global minimum and maximum.
    6. Optionally keep only the ``statistic_select_num`` features with the
       largest dispersion.
    7. Save the table as a tab-separated file next to the input.

    Args:
        input_path: File name relative to ``../data/``. The text before the
            first ``.`` is used as the stem of the output file name.
        file_delimiter: Field separator passed to ``pandas.read_csv``.
        file_header: ``header`` argument passed to ``pandas.read_csv``.
        file_index_col: ``index_col`` argument passed to ``pandas.read_csv``.
        file_transpose: If true, features are the rows of the file and
            samples are the columns; otherwise features are the columns.
        norm_0_1: If true, apply the global min-max scaling.
        probe_filter: Optional path of a tab-delimited text file holding the
            feature labels to remove.
        statistic_select_num: Number of features to keep; ``0`` (or less)
            disables the selection.
        statistic_metrics: ``'mad'`` ranks features by mean absolute
            deviation; any other value ranks them by standard deviation.
            The value is also embedded in the output file name.

    Returns:
        The pre-processed ``pandas.DataFrame`` (the same data that was
        written to disk).
    """
    print('Loading the input data...')
    full_input_path = '../data/' + input_path
    # Peek at a few rows only to learn the data column labels, so the full
    # read can request float32 for each of them and save memory.
    preview_df = pd.read_csv(full_input_path, sep=file_delimiter, header=file_header, index_col=file_index_col,
                             nrows=10)
    cols_f32 = {col: np.float32 for col in preview_df}
    data_df = pd.read_csv(full_input_path, sep=file_delimiter, header=file_header, index_col=file_index_col,
                          dtype=cols_f32)

    print('Pre-processing the input data...')
    # feature_axis: the axis whose labels are features (probes).
    # sample_axis: the other axis; reducing over it yields one value per feature.
    feature_axis = 0 if file_transpose else 1
    sample_axis = 1 - feature_axis

    # Delete selected probes
    if probe_filter:
        # atleast_1d: a filter file with a single entry loads as a 0-d array.
        filter_list = np.atleast_1d(np.loadtxt(probe_filter, delimiter='\t', dtype=str))
        data_df = data_df.drop(filter_list, axis=feature_axis)

    # Deal with nan value: keep features, then samples, that are at least 90%
    # populated. The sample threshold uses the feature count left after the
    # first pass. Rounding up keeps the "at least 90%" meaning while passing
    # the integer that dropna expects.
    min_samples = int(np.ceil(data_df.shape[sample_axis] * 0.9))
    data_df = data_df.dropna(axis=feature_axis, thresh=min_samples)
    min_features = int(np.ceil(data_df.shape[feature_axis] * 0.9))
    data_df = data_df.dropna(axis=sample_axis, thresh=min_features)

    # Use feature average to fill na
    feature_mean = data_df.mean(axis=sample_axis)
    if file_transpose:
        # Features are rows: fill each sample column from the per-row means.
        # Building a new frame (rather than an in-place fillna on a column
        # selection) guarantees the filled values end up in data_df.
        data_df = data_df.apply(lambda sample: sample.fillna(feature_mean))
    else:
        data_df = data_df.fillna(feature_mean)

    # Normalize the dataframe to the range of 0-1
    if norm_0_1:
        # Min-max normalization over the whole table
        global_min = data_df.min().min()
        value_range = data_df.max().max() - global_min
        data_df = data_df - global_min
        # A constant table has zero range; leave it at 0 instead of dividing by zero.
        if value_range > 0:
            data_df = data_df / value_range

    input_path_name = input_path.split('.')[0]

    # Select certain number of probes according to some statistic metrics
    if statistic_select_num > 0:
        if statistic_metrics == 'mad':
            scores = _mean_abs_deviation(data_df, sample_axis)
        else:
            scores = data_df.std(axis=sample_axis)
        selected = scores.sort_values(ascending=False).index[:statistic_select_num]
        if file_transpose:
            data_df = data_df.loc[selected]
        else:
            data_df = data_df.loc[:, selected]
        output_path = '../data/' + input_path_name + '_' + str(statistic_select_num) + '_' + statistic_metrics + '.tsv'
    else:
        output_path = '../data/' + input_path_name + '_preprocessed.tsv'

    data_df.to_csv(output_path, sep='\t')
    return data_df


def _mean_abs_deviation(frame, axis):
    """Return the mean absolute deviation of ``frame`` reduced over ``axis``.

    Stand-in for ``DataFrame.mad``, which is not available in pandas 2.x.
    """
    centre = frame.mean(axis=axis)
    # The per-feature means are labelled along the opposite axis, so align there.
    return frame.sub(centre, axis=1 - axis).abs().mean(axis=axis)