# Extracted from a diff view of: preprocessing/preprocessing.py
1
"""
Helper methods for reading data.
Leon Zheng
"""
5
6
import pandas as pd
7
import numpy as np
8
from sklearn import preprocessing
9
10
def read_input(file_radiomics, file_clinical):
    """
    Read radiomics and clinical features and return them as one dataframe.

    Both CSV files are read with their first column as the index. The clinical
    table is passed through ``cleaning_clinical`` and then concatenated
    column-wise (``axis=1``) with the radiomics table, so rows are aligned on
    the index.

    :param file_radiomics: filename of the radiomics CSV
    :param file_clinical: filename of the clinical CSV
    :return: dataframe holding the radiomics columns followed by the cleaned
        clinical columns
    """
    radiomics = pd.read_csv(file_radiomics, index_col=0)
    clinical = cleaning_clinical(pd.read_csv(file_clinical, index_col=0))
    # Named ``features`` rather than ``input`` so the builtin is not shadowed.
    features = pd.concat([radiomics, clinical], axis=1)
    return features
22
23
def normalizing_input(x_train, x_test):
    """
    Min-max scale the training and testing features together.

    The two dataframes are stacked row-wise, every column is rescaled with
    scikit-learn's ``MinMaxScaler`` (default settings), and the result is
    split back into a training part and a testing part.

    :param x_train: dataframe of training features
    :param x_test: dataframe of testing features
    :return: tuple ``(scaled_train, scaled_test)`` of dataframes keeping the
        row order, index labels and columns of the stacked input
    """
    n_train = len(x_train)
    all_x = pd.concat([x_train, x_test])
    min_max_scaler = preprocessing.MinMaxScaler()
    # NOTE(review): the scaler is fitted on train and test rows together, so
    # test-set minima/maxima influence how the training data is scaled.
    # Confirm this is intended before using the output for model evaluation.
    x_scaled = min_max_scaler.fit_transform(all_x.values)
    df = pd.DataFrame(x_scaled, index=all_x.index, columns=all_x.columns)
    # Split by position instead of by label: a label-based ``.loc`` lookup
    # returns extra rows whenever train and test share index labels.
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()
30
31
def cleaning_clinical(clinical):
    """
    Clean the clinical dataframe.

    Steps, in order:

    1. ``SourceDataset`` is label-encoded to integers.
    2. Three histology spellings are normalised to lower-case names (the
       replacement is applied to the whole frame, as exact cell matches).
    3. ``pd.get_dummies`` one-hot encodes the remaining non-numeric columns
       (presumably only Histology -- verify against the data).
    4. Missing ``age`` values are filled with the mean age.

    The input dataframe is not modified; a new dataframe is returned.

    :param clinical: clinical dataframe with at least the ``SourceDataset``
        and ``age`` columns
    :return: cleaned dataframe
    """
    # Work on a copy so the caller's dataframe is left untouched.
    clinical = clinical.copy()

    # Encoding label for SourceDataset
    le = preprocessing.LabelEncoder()
    clinical['SourceDataset'] = le.fit_transform(clinical['SourceDataset'])

    # Cleaning Histology
    histology_names = (
        ("NSCLC NOS (not otherwise specified)", "nos"),
        ("Adenocarcinoma", "adenocarcinoma"),
        ("Squamous cell carcinoma", "squamous cell carcinoma"),
    )
    for raw_name, clean_name in histology_names:
        clinical = clinical.replace(raw_name, clean_name)

    # Dummies for Histology
    clinical = pd.get_dummies(clinical)

    # Fill age nan. Assign the result back instead of calling
    # ``fillna(inplace=True)`` on the selected column: the chained in-place
    # form does not update the frame under pandas Copy-on-Write.
    clinical['age'] = clinical['age'].fillna(clinical['age'].mean())

    return clinical
52
53
def read_output(file_output):
    """
    Return dataframe for event + survival time.

    The CSV is read with its first row as the header and its first column as
    the index.

    :param file_output: filename of the label CSV
    :return: dataframe with the remaining columns of the file
    """
    return pd.read_csv(file_output, index_col=0, header=0)
61
62
def clean_clinical_data(file, newfile):
    """
    Clean a clinical data file and write the result to a new CSV file.

    The input is read with its first column as the index, passed through
    ``cleaning_clinical``, and saved with ``DataFrame.to_csv``.

    :param file: clinical data file
    :param newfile: path for the cleaned data file.
    :return: None
    """
    clinical = pd.read_csv(file, index_col=0)
    cleaning_clinical(clinical).to_csv(newfile)
72
73
def y_dataframe_to_rsf_input(y_df):
    """
    Input for random survival forest.

    Converts the label dataframe into a one-dimensional NumPy structured
    array with one record per row. Each record holds the second column cast
    to ``bool`` followed by the first column as ``float``; the fields are
    named after those dataframe columns.

    :param y_df: event + survival time dataframe; the first column is used
        as the float field and the second column as the boolean field.
    :return: structured array of ``(second_column, first_column)`` records
    """
    time_name = str(y_df.columns[0])
    event_name = str(y_df.columns[1])
    records = [(bool(row[1]), row[0]) for row in y_df.to_numpy()]
    # Use the builtin ``bool``/``float``: the ``np.bool`` and ``np.float``
    # aliases were removed from NumPy and raise AttributeError there.
    return np.array(records, dtype=[(event_name, bool), (time_name, float)])
85
86
def load_owkin_data(radiomics_path_train="data/train/features/radiomics.csv",
                    clinical_path_train="data/train/features/clinical_data.csv",
                    label_path_train='data/train/y_train.csv',
                    radiomics_path_test="data/test/features/radiomics.csv",
                    clinical_path_test="data/test/features/clinical_data.csv"):
    """
    Load Owkin data.

    :param radiomics_path_train: radiomics CSV of the training set
    :param clinical_path_train: clinical CSV of the training set
    :param label_path_train: event + survival time CSV of the training set
    :param radiomics_path_test: radiomics CSV of the testing set
    :param clinical_path_test: clinical CSV of the testing set
    :return: tuple ``(input_train, output_train, input_test)``:
        PyRadiomics + clinical features of the training set,
        event + time of the training set, and
        PyRadiomics + clinical features of the testing set,
        each as a dataframe.
    """
    input_train = read_input(radiomics_path_train, clinical_path_train)
    output_train = read_output(label_path_train)
    input_test = read_input(radiomics_path_test, clinical_path_test)
    return input_train, output_train, input_test
100
101
if __name__ == '__main__':
    # Manual smoke check: load the datasets from their default paths and
    # print the three resulting dataframes.
    input_train, output_train, input_test = load_owkin_data()
    print(input_train, output_train, input_test)