--- a +++ b/preprocessing/preprocessing.py @@ -0,0 +1,103 @@ +""" +Helper methods for reading data +Leon Zheng +""" + +import pandas as pd +import numpy as np +from sklearn import preprocessing + +def read_input(file_radiomics, file_clinical): + """ + Read radiomics and clinical feature and return dataframe. + :param file_radiomics: filename + :param file_clinical: filename + :return: + """ + radiomics = pd.read_csv(file_radiomics, index_col=0) + clinical = pd.read_csv(file_clinical, index_col=0) + clinical = cleaning_clinical(clinical) + input = pd.concat([radiomics, clinical], axis=1) + return input + +def normalizing_input(x_train, x_test): + all_x = pd.concat([x_train, x_test]) + x_val = all_x.values + min_max_scaler = preprocessing.MinMaxScaler() + x_scaled = min_max_scaler.fit_transform(x_val) + df = pd.DataFrame(x_scaled, index=all_x.index, columns=all_x.columns) + return df.loc[x_train.index], df.loc[x_test.index] + +def cleaning_clinical(clinical): + """ + Cleaning the clinical dataframe. + :param clinical: + :return: + """ + # Encoding label for SourceDataset + le = preprocessing.LabelEncoder() + le.fit(clinical['SourceDataset']) + le.transform(clinical['SourceDataset']) + clinical['SourceDataset'] = le.transform(clinical['SourceDataset']) + # Cleaning Histology + clinical.replace("NSCLC NOS (not otherwise specified)", "nos", inplace=True) + clinical.replace("Adenocarcinoma", "adenocarcinoma", inplace=True) + clinical.replace("Squamous cell carcinoma", "squamous cell carcinoma", inplace=True) + # Dummies for Histology + clinical = pd.get_dummies(clinical) + # Fill age nan + clinical['age'].fillna((clinical['age'].mean()), inplace=True) + + return clinical + +def read_output(file_output): + """ + Return dataframe for event + survival time. + :param file_output: + :return: + """ + output = pd.read_csv(file_output, index_col=0, header=0) + return output + +def clean_clinical_data(file, newfile): + """ + Cleaning clinical data. + :param file: clinical data file + :param newfile: path for the cleaned data file. + :return: + """ + clinical = pd.read_csv(file, index_col=0) + cleaned = cleaning_clinical(clinical) + cleaned.to_csv(newfile) + +def y_dataframe_to_rsf_input(y_df): + """ + Input for random survival forest. + :param y_df: event + survival time dataframe. + :return: + """ + y_array = [] + Y = y_df.to_numpy() + for y in Y: + tuple = (bool(y[1]), y[0]) + y_array.append(tuple) + return np.array(y_array, dtype = [(f'{y_df.columns[1]}', np.bool), (f'{y_df.columns[0]}', np.float)]) + +def load_owkin_data(radiomics_path_train="data/train/features/radiomics.csv", + clinical_path_train="data/train/features/clinical_data.csv", + label_path_train='data/train/y_train.csv', + radiomics_path_test="data/test/features/radiomics.csv", + clinical_path_test="data/test/features/clinical_data.csv"): + """ + Load Owkin data: return PyRadiomics + clinical features of training set in dataframe, + event + time of training set in dataframe, and + PyRadiomics + clinical features of testing set in dataframe, + """ + input_train = read_input(radiomics_path_train, clinical_path_train) + output_train = read_output(label_path_train) + input_test = read_input(radiomics_path_test, clinical_path_test) + return input_train, output_train, input_test + +if __name__ == '__main__': + input_train, output_train, input_test = load_owkin_data() + print(input_train, output_train, input_test)