[b52eda]: / Excel_Processing.py

Download this file

33 lines (27 with data), 1.5 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import pandas as pd
def ProcessSpreadsheets(dataset_info_path, scan_info_path) -> pd.DataFrame:
    r"""
    Load ImageCHD's two Excel spreadsheets and merge them into a single table.

    SPREADSHEETS NEED TO BE PRE-PROCESSED THEMSELVES BEFORE CALLING THIS METHOD!

    The 'classification dataset' sheet of the first workbook and the 'Sheet1'
    sheet of the second workbook are concatenated column-wise (rows are
    aligned on the index produced by ``pd.read_excel``). Rows whose "IGNORED"
    value is greater than zero are removed, a fixed list of columns is
    dropped, the index is renumbered from 0, and in the remaining label
    columns every value other than 1 (missing values included) becomes 0.

    Arguments:
        dataset_info_path (string): path to the imageCHD_dataset_info.xlsx file
        scan_info_path (string): path to the imageCHD_dataset_image_info.xlsx file

    Returns:
        pd.DataFrame: the merged and cleaned table.

    Raises:
        KeyError: if "IGNORED", any of the dropped columns, or any of the
            label columns is absent from the merged table.
    """
    dataset_info = pd.read_excel(io=dataset_info_path, sheet_name='classification dataset')
    scan_info = pd.read_excel(io=scan_info_path, sheet_name='Sheet1')
    merged = pd.concat([dataset_info, scan_info], axis=1)

    # Row labels must be collected now: "IGNORED" is one of the columns
    # removed below.
    ignored_rows = merged[merged["IGNORED"] > 0].index

    drop_cols = [
        "DORV", "CAT", "APVC", "DSVC", "AAH", "DAA", "IAA", "PAS",
        "NORMAL", "ONLYFIRST8", "FIRST8+MORE", "NORMAL.1", "IGNORED",
        "idx", "PatientBirthDate1", "AcquisitionDate1",
        "PixelSpacing1", "PixelSpacing2", "calculate_z_thick",
        "ManufacturerModelName", "AGE", "UNKNOWN",
    ]
    merged = (
        merged.drop(columns=drop_cols)
        .drop(index=ignored_rows)
        # drop=True discards the old index instead of inserting it as a new
        # column. The previous reset_index() + drop("level_0") sequence only
        # worked when the table already had a column named "index" (that is
        # the only case in which pandas names the inserted column "level_0")
        # and raised KeyError otherwise.
        .reset_index(drop=True)
    )

    # Label columns: anything that is not exactly 1 (NaN, other numbers)
    # is normalised to 0.
    nan_cols = ["ASD", "VSD", "AVSD", "ToF", "TGA", "CA", "PA", "PDA"]
    for col in nan_cols:
        merged.loc[merged[col] != 1, col] = 0
    return merged