--- a +++ b/Excel_Processing.py @@ -0,0 +1,33 @@ +import pandas as pd + +def ProcessSpreadsheets(dataset_info_path, scan_info_path) -> pd.DataFrame: + r""" + Method for processing ImageCHD's two Excel spreadsheets. + SPREADSHEETS NEED TO BE PRE-PROCESSED THEMSELVES BEFORE CALLING THIS METHOD! + + Arguments: + dataset_info_path (string): path to the imageCHD_dataset_info.xlsx file + scan_info_path (string): path to the imageCHD_dataset_image_info.xlsx file + """ + dataset_info = pd.read_excel(io = dataset_info_path, sheet_name = 'classification dataset') + scan_info = pd.read_excel(io = scan_info_path, sheet_name = 'Sheet1') + dataset_info = pd.concat([dataset_info, scan_info], axis = 1) + + ignore = dataset_info[dataset_info["IGNORED"] > 0].index + drop_cols = ["DORV", "CAT", "APVC", "DSVC", "AAH", "DAA", "IAA", "PAS", + "NORMAL", "ONLYFIRST8", "FIRST8+MORE", "NORMAL.1", "IGNORED", + "idx", "PatientBirthDate1", "AcquisitionDate1", + "PixelSpacing1", "PixelSpacing2", "calculate_z_thick", + "ManufacturerModelName", "AGE", "UNKNOWN"] + + dataset_info = dataset_info.drop(drop_cols, axis = 1) \ + .drop(ignore) \ + .reset_index() \ + .drop("level_0", axis = 1) + + nan_cols = ["ASD", "VSD", "AVSD", "ToF", "TGA", "CA", "PA", "PDA"] + for col in nan_cols: + mask = dataset_info[dataset_info[col] != 1].index + dataset_info.loc[mask, col] = 0 + + return dataset_info \ No newline at end of file