|
a |
|
b/Excel_Processing.py |
|
|
1 |
import pandas as pd |
|
|
2 |
|
|
|
3 |
def ProcessSpreadsheets(dataset_info_path, scan_info_path) -> pd.DataFrame:
    r"""
    Method for processing ImageCHD's two Excel spreadsheets.
    SPREADSHEETS NEED TO BE PRE-PROCESSED THEMSELVES BEFORE CALLING THIS METHOD!

    Arguments:
        dataset_info_path (string): path to the imageCHD_dataset_info.xlsx file
        scan_info_path (string): path to the imageCHD_dataset_image_info.xlsx file

    Returns:
        pd.DataFrame: the 'classification dataset' sheet and 'Sheet1' joined
        column-wise, with rows whose "IGNORED" value is > 0 removed, the
        bookkeeping columns listed in ``drop_cols`` removed, a fresh 0..n-1
        index, and every value other than 1 (including missing values) in
        the ASD/VSD/AVSD/ToF/TGA/CA/PA/PDA columns replaced by 0.

    Raises:
        KeyError: if "IGNORED", any column in ``drop_cols`` or any column in
        ``nan_cols`` is missing from the combined spreadsheets.
    """
    dataset_info = pd.read_excel(io=dataset_info_path, sheet_name='classification dataset')
    scan_info = pd.read_excel(io=scan_info_path, sheet_name='Sheet1')

    # Put the two sheets side by side; rows are matched on their index labels.
    dataset_info = pd.concat([dataset_info, scan_info], axis=1)

    # Row labels of the scans flagged to be ignored. Collected before the
    # "IGNORED" column itself is dropped below.
    ignore = dataset_info[dataset_info["IGNORED"] > 0].index
    drop_cols = ["DORV", "CAT", "APVC", "DSVC", "AAH", "DAA", "IAA", "PAS",
                 "NORMAL", "ONLYFIRST8", "FIRST8+MORE", "NORMAL.1", "IGNORED",
                 "idx", "PatientBirthDate1", "AcquisitionDate1",
                 "PixelSpacing1", "PixelSpacing2", "calculate_z_thick",
                 "ManufacturerModelName", "AGE", "UNKNOWN"]

    # reset_index(drop=True) renumbers the rows without inserting the old
    # index as a column. The previous reset_index() + drop("level_0") only
    # worked when the frame already had a column named "index" (the only
    # situation in which pandas names the inserted column "level_0"), and
    # raised KeyError otherwise.
    dataset_info = (
        dataset_info
        .drop(drop_cols, axis=1)
        .drop(ignore)
        .reset_index(drop=True)
    )

    # Anything that is not exactly 1 (NaN included, since NaN != 1 is True)
    # is overwritten with 0 in these columns.
    nan_cols = ["ASD", "VSD", "AVSD", "ToF", "TGA", "CA", "PA", "PDA"]
    for col in nan_cols:
        dataset_info.loc[dataset_info[col] != 1, col] = 0

    return dataset_info