Diff of /Excel_Processing.py [000000] .. [b52eda]

Switch to unified view

a b/Excel_Processing.py
1
import pandas as pd
2
3
def ProcessSpreadsheets(dataset_info_path, scan_info_path) -> pd.DataFrame:
    r"""
    Method for processing ImageCHD's two Excel spreadsheets.
    SPREADSHEETS NEED TO BE PRE-PROCESSED THEMSELVES BEFORE CALLING THIS METHOD!

    The 'classification dataset' sheet of the first workbook and 'Sheet1' of
    the second are read and concatenated column-wise (rows are matched by
    their positional row index). Rows whose "IGNORED" value is greater than
    zero are removed, a fixed list of columns is dropped, the row index is
    renumbered from 0, and every value other than 1 (including NaN) in the
    remaining label columns is replaced by 0.

    Arguments:
      dataset_info_path (string): path to the imageCHD_dataset_info.xlsx file
      scan_info_path (string): path to the imageCHD_dataset_image_info.xlsx file

    Returns:
      pd.DataFrame: the merged and cleaned table.

    Raises:
      KeyError: if "IGNORED", any column listed for removal, or any of the
        label columns is missing from the concatenated sheets.
    """
    labels_sheet = pd.read_excel(
        io=dataset_info_path, sheet_name="classification dataset"
    )
    scans_sheet = pd.read_excel(io=scan_info_path, sheet_name="Sheet1")
    merged = pd.concat([labels_sheet, scans_sheet], axis=1)

    # Index labels of the rows flagged for exclusion. NaN compares False
    # against `> 0`, so rows without a flag are kept.
    ignored_rows = merged.index[merged["IGNORED"] > 0]

    drop_cols = [
        "DORV", "CAT", "APVC", "DSVC", "AAH", "DAA", "IAA", "PAS",
        "NORMAL", "ONLYFIRST8", "FIRST8+MORE", "NORMAL.1", "IGNORED",
        "idx", "PatientBirthDate1", "AcquisitionDate1",
        "PixelSpacing1", "PixelSpacing2", "calculate_z_thick",
        "ManufacturerModelName", "AGE", "UNKNOWN",
    ]

    # reset_index(drop=True) discards the old row labels outright. The
    # previous `.reset_index().drop("level_0", axis=1)` relied on pandas
    # naming the inserted column "level_0", which only happens when the data
    # already contains a column called "index"; otherwise it raised KeyError.
    cleaned = (
        merged.drop(columns=drop_cols)
        .drop(index=ignored_rows)
        .reset_index(drop=True)
    )

    # Label columns: anything that is not exactly 1 (blank cells are read as
    # NaN) becomes 0.
    label_cols = ["ASD", "VSD", "AVSD", "ToF", "TGA", "CA", "PA", "PDA"]
    for col in label_cols:
        cleaned.loc[cleaned[col] != 1, col] = 0

    return cleaned