|
a |
|
b/preprocessing/preprocessing.py |
|
|
1 |
""" |
|
|
2 |
Helper methods for reading data |
|
|
3 |
Leon Zheng |
|
|
4 |
""" |
|
|
5 |
|
|
|
6 |
import pandas as pd |
|
|
7 |
import numpy as np |
|
|
8 |
from sklearn import preprocessing |
|
|
9 |
|
|
|
10 |
def read_input(file_radiomics, file_clinical):
    """
    Load the radiomics and clinical feature files and join them column-wise.
    :param file_radiomics: path of the radiomics CSV file (first column is used as index)
    :param file_clinical: path of the clinical CSV file (first column is used as index)
    :return: dataframe with the radiomics columns followed by the cleaned clinical columns
    """
    radiomics_features = pd.read_csv(file_radiomics, index_col=0)
    # The clinical table goes through cleaning_clinical before being joined.
    clinical_features = cleaning_clinical(pd.read_csv(file_clinical, index_col=0))
    # axis=1: rows are aligned on the index, columns are put side by side.
    return pd.concat([radiomics_features, clinical_features], axis=1)
|
|
22 |
|
|
|
23 |
def normalizing_input(x_train, x_test):
    """
    Min-max scale the training and testing features.

    Both sets are stacked and scaled together, so each column is scaled with
    the minimum and maximum observed over train and test combined.
    :param x_train: dataframe of training features
    :param x_test: dataframe of testing features
    :return: tuple (scaled training dataframe, scaled testing dataframe), each
             keeping the row index of the corresponding input
    """
    # NOTE(review): fitting the scaler on train + test lets test statistics
    # leak into the training features; fit on x_train only if a strict
    # evaluation protocol is required.
    all_x = pd.concat([x_train, x_test])
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(all_x.values)
    df = pd.DataFrame(x_scaled, index=all_x.index, columns=all_x.columns)
    # Split by position, not by label: a label lookup with .loc returns extra
    # rows as soon as an index value is present in both sets (or repeated).
    n_train = len(x_train)
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()
|
|
30 |
|
|
|
31 |
def cleaning_clinical(clinical):
    """
    Cleaning the clinical dataframe.

    Steps: label-encode the 'SourceDataset' column, harmonise the histology
    spellings, one-hot encode the remaining string/categorical columns and
    fill missing 'age' values with the mean age.
    :param clinical: raw clinical dataframe; must contain the 'SourceDataset'
                     and 'age' columns
    :return: a new, cleaned dataframe (the dataframe passed in is not modified)
    """
    # Work on a copy so the caller's dataframe is left untouched.
    clinical = clinical.copy()
    # Encoding label for SourceDataset
    le = preprocessing.LabelEncoder()
    clinical['SourceDataset'] = le.fit_transform(clinical['SourceDataset'])
    # Cleaning Histology: map the spelling variants onto one lowercase name.
    # The replacement is applied to the whole dataframe, not to one column.
    clinical = clinical.replace({
        "NSCLC NOS (not otherwise specified)": "nos",
        "Adenocarcinoma": "adenocarcinoma",
        "Squamous cell carcinoma": "squamous cell carcinoma",
    })
    # Dummies for Histology
    clinical = pd.get_dummies(clinical)
    # Fill age nan with the mean age. Assign the result back: an inplace
    # fillna on clinical['age'] acts on a temporary object and does not
    # update the dataframe when pandas Copy-on-Write is active.
    clinical['age'] = clinical['age'].fillna(clinical['age'].mean())

    return clinical
|
|
52 |
|
|
|
53 |
def read_output(file_output):
    """
    Load the event + survival time table.
    :param file_output: path of the CSV file; its first row holds the column
                        names and its first column is used as index
    :return: dataframe read from the file
    """
    return pd.read_csv(file_output, header=0, index_col=0)
|
|
61 |
|
|
|
62 |
def clean_clinical_data(file, newfile):
    """
    Read a clinical data file, clean it and save the result.
    :param file: path of the clinical CSV file (first column is used as index)
    :param newfile: path where the cleaned CSV file is written
    :return: None
    """
    raw_clinical = pd.read_csv(file, index_col=0)
    cleaning_clinical(raw_clinical).to_csv(newfile)
|
|
72 |
|
|
|
73 |
def y_dataframe_to_rsf_input(y_df):
    """
    Input for random survival forest.

    Builds a one-dimensional structured array with one record per row of
    y_df. Each record is (event, time): the event is taken from the second
    column and cast to bool, the time from the first column and stored as
    float. The two fields are named after the corresponding columns.
    :param y_df: event + survival time dataframe (time in column 0, event in column 1).
    :return: numpy structured array of (event, time) records
    """
    time_name, event_name = str(y_df.columns[0]), str(y_df.columns[1])
    # Builtin bool/float replace the np.bool/np.float aliases, which were
    # removed from NumPy in version 1.24.
    record_dtype = [(event_name, bool), (time_name, float)]
    records = [
        (bool(event), time)
        for time, event in zip(y_df.iloc[:, 0], y_df.iloc[:, 1])
    ]
    return np.array(records, dtype=record_dtype)
|
|
85 |
|
|
|
86 |
def load_owkin_data(radiomics_path_train="data/train/features/radiomics.csv",
                    clinical_path_train="data/train/features/clinical_data.csv",
                    label_path_train='data/train/y_train.csv',
                    radiomics_path_test="data/test/features/radiomics.csv",
                    clinical_path_test="data/test/features/clinical_data.csv"):
    """
    Load the Owkin data set from CSV files.
    :param radiomics_path_train: radiomics features file of the training set
    :param clinical_path_train: clinical features file of the training set
    :param label_path_train: event + survival time file of the training set
    :param radiomics_path_test: radiomics features file of the testing set
    :param clinical_path_test: clinical features file of the testing set
    :return: tuple of three dataframes: training features (radiomics + clinical),
             training event + time, testing features (radiomics + clinical)
    """
    # Tuple items are evaluated left to right: train features, train labels,
    # then test features.
    return (
        read_input(radiomics_path_train, clinical_path_train),
        read_output(label_path_train),
        read_input(radiomics_path_test, clinical_path_test),
    )
|
|
100 |
|
|
|
101 |
if __name__ == '__main__':
    # Manual check: load everything from the default paths and print the
    # training features, training labels and testing features.
    print(*load_owkin_data())