[785f18]: / preprocessing / preprocessing.py

Download this file

104 lines (93 with data), 3.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
"""
Helper methods for reading data
Leon Zheng
"""
import pandas as pd
import numpy as np
from sklearn import preprocessing
def read_input(file_radiomics, file_clinical):
    """
    Load the radiomics and clinical feature tables and join them side by side.
    :param file_radiomics: CSV file of radiomics features (first column is the index)
    :param file_clinical: CSV file of clinical features (first column is the index)
    :return: dataframe with the radiomics columns followed by the cleaned
        clinical columns, aligned on the index
    """
    # Radiomics is read first, then clinical, exactly as before.
    radiomics_features = pd.read_csv(file_radiomics, index_col=0)
    clinical_features = cleaning_clinical(pd.read_csv(file_clinical, index_col=0))
    # Column-wise concatenation aligns the two tables on their index labels.
    return pd.concat([radiomics_features, clinical_features], axis="columns")
def normalizing_input(x_train, x_test):
    """
    Min-max scale the train and test features with one shared scaler.

    Both frames are stacked, scaled column by column with sklearn's
    MinMaxScaler (default settings), and split back into two frames.
    :param x_train: training feature dataframe
    :param x_test: testing feature dataframe
    :return: (scaled train dataframe, scaled test dataframe), each keeping the
        row order and index labels of the corresponding input
    """
    # NOTE(review): the scaler is fitted on train and test together, so test
    # statistics influence the scaling of the training data - confirm this
    # is intended.
    all_x = pd.concat([x_train, x_test])
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(all_x.values)
    df = pd.DataFrame(x_scaled, index=all_x.index, columns=all_x.columns)
    # Split by position, not by label: label lookup returns rows from both
    # sets as soon as train and test share index labels. The train rows are
    # always the first len(x_train) rows of the stacked frame.
    n_train = len(x_train)
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()
def cleaning_clinical(clinical):
    """
    Cleaning the clinical dataframe.

    The work is done on a copy, so the dataframe passed in is not modified.
    :param clinical: clinical dataframe; must contain the 'SourceDataset' and
        'age' columns
    :return: new dataframe with 'SourceDataset' label-encoded to integers,
        histology spellings normalised, every object/category column expanded
        into dummy columns (pd.get_dummies defaults) and missing ages replaced
        by the mean age
    """
    # Copy first so the encoding and replacements below never leak into the
    # caller's dataframe.
    clinical = clinical.copy()
    # Encoding label for SourceDataset (a single fit_transform pass).
    le = preprocessing.LabelEncoder()
    clinical['SourceDataset'] = le.fit_transform(clinical['SourceDataset'])
    # Cleaning Histology: collapse spelling variants so they end up in the
    # same dummy column. The replacement is applied to the whole frame.
    histology_spellings = (
        ("NSCLC NOS (not otherwise specified)", "nos"),
        ("Adenocarcinoma", "adenocarcinoma"),
        ("Squamous cell carcinoma", "squamous cell carcinoma"),
    )
    for old_value, new_value in histology_spellings:
        clinical = clinical.replace(old_value, new_value)
    # Dummies for Histology
    clinical = pd.get_dummies(clinical)
    # Fill age nan. Assign the result back instead of calling
    # fillna(inplace=True) on the selected column: that chained form warns
    # in pandas 2.x and has no effect under Copy-on-Write.
    clinical['age'] = clinical['age'].fillna(clinical['age'].mean())
    return clinical
def read_output(file_output):
    """
    Load the event + survival time table.
    :param file_output: CSV file (or file-like object) whose first row holds
        the column names and whose first column is the index
    :return: dataframe indexed by the first CSV column
    """
    return pd.read_csv(file_output, header=0, index_col=0)
def clean_clinical_data(file, newfile):
    """
    Clean a clinical CSV file and save the result as a new CSV file.
    :param file: clinical data file (first column is used as the index)
    :param newfile: path where the cleaned data file is written
    :return: None
    """
    raw_clinical = pd.read_csv(file, index_col=0)
    cleaning_clinical(raw_clinical).to_csv(newfile)
def y_dataframe_to_rsf_input(y_df):
    """
    Input for random survival forest.

    Converts the label dataframe into a one-dimensional NumPy structured
    array with one record per row. Each record holds the second column cast
    to bool, followed by the first column as float; the fields are named
    after those columns.
    :param y_df: event + survival time dataframe with at least two columns
        (presumably survival time first and event indicator second - confirm
        against the label file)
    :return: structured array of dtype [(<second column>, bool), (<first column>, float)]
    """
    first_col, second_col = y_df.columns[0], y_df.columns[1]
    # The builtin bool/float are used because the np.bool / np.float aliases
    # were removed in NumPy 1.24 and raise AttributeError there.
    record_dtype = [(str(second_col), bool), (str(first_col), float)]
    records = [(bool(row[1]), row[0]) for row in y_df.to_numpy()]
    return np.array(records, dtype=record_dtype)
def load_owkin_data(radiomics_path_train="data/train/features/radiomics.csv",
                    clinical_path_train="data/train/features/clinical_data.csv",
                    label_path_train='data/train/y_train.csv',
                    radiomics_path_test="data/test/features/radiomics.csv",
                    clinical_path_test="data/test/features/clinical_data.csv"):
    """
    Load the Owkin training and testing data.
    :param radiomics_path_train: radiomics feature file of the training set
    :param clinical_path_train: clinical feature file of the training set
    :param label_path_train: event + time file of the training set
    :param radiomics_path_test: radiomics feature file of the testing set
    :param clinical_path_test: clinical feature file of the testing set
    :return: tuple of three dataframes: radiomics + clinical features of the
        training set, event + time of the training set, and radiomics +
        clinical features of the testing set
    """
    train_features = read_input(radiomics_path_train, clinical_path_train)
    train_labels = read_output(label_path_train)
    test_features = read_input(radiomics_path_test, clinical_path_test)
    return train_features, train_labels, test_features
if __name__ == '__main__':
    # Script entry point: load the data from the default paths and print
    # the three resulting dataframes.
    loaded = load_owkin_data()
    input_train, output_train, input_test = loaded
    print(*loaded)