--- a
+++ b/eda/create_label_file.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# Builds a master label CSV from the stage 1 training labels and writes
# balanced train/validation splits; also holds project-level paths used
# for ingesting and accessing the raw data.

import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()


CSV_DIRECTORY = "."
DATA_DIRECTORY = "../../data/stage_1_train_images/"
# Known duplicate DICOM files in the stage 1 training set
DUPLICATES = ['ID_a64d5deed.dcm', 'ID_921490062.dcm', 'ID_489ae4179.dcm', 'ID_854fba667.dcm']
MASTER_CSV = os.path.join(CSV_DIRECTORY, "master_train.csv")


def split_targets(x):
    # The "targets" column round-trips through CSV as the string repr of a
    # numpy array, e.g. "[1. 0. 0. 0. 1. 1.]"; strip the brackets and split
    # on whitespace to recover the six per-class floats.
    targets = x["targets"].strip("[]").split()
    x["epidural"] = float(targets[0])
    x["intraparenchymal"] = float(targets[1])
    x["intraventricular"] = float(targets[2])
    x["subarachnoid"] = float(targets[3])
    x["subdural"] = float(targets[4])
    x["any"] = float(targets[5])
    return x


if os.path.exists(MASTER_CSV):
    master_df = pd.read_csv(MASTER_CSV)

    # Split the stringified targets back into one column per class
    master_df = master_df.progress_apply(split_targets, axis=1)

else:
    train_df = pd.read_csv(os.path.join(CSV_DIRECTORY, "stage_1_train.csv"))
    # Each ID looks like "ID_<hash>_<type>"; recover the DICOM filename and label type
    train_df["filename"] = train_df["ID"].apply(lambda st: "ID_" + st.split("_")[1] + ".dcm")
    train_df["type"] = train_df["ID"].apply(lambda st: st.split("_")[2])

    # New pandas dataframe with one row per image and one column per label type
    master_df = train_df[["Label", "filename", "type"]]\
        .drop_duplicates()\
        .pivot(index="filename", columns="type", values="Label")\
        .reset_index()

    # Collect the six per-class labels into a single numpy target vector
    master_df["targets"] = master_df.progress_apply(lambda x: np.array([float(x["epidural"]),
                                                                        float(x["intraparenchymal"]),
                                                                        float(x["intraventricular"]),
                                                                        float(x["subarachnoid"]),
                                                                        float(x["subdural"]),
                                                                        float(x["any"])]), axis=1)

    master_df["any"] = master_df["any"].astype(float)

    # Save the master df as a csv (use MASTER_CSV so the existence check above
    # finds it on the next run)
    master_df.to_csv(MASTER_CSV, index=False)
    print("Created and saved a master training CSV to disk. You're welcome...")

# We have a master DF; split it into one sub-DF per class on the "any" label
class1_df = master_df.loc[master_df['any'] == 1]  # 97103 class 1 rows (14% of the data)
class0_df = master_df.loc[master_df['any'] == 0]  # 577155 class 0 rows


assert class0_df.shape[0] + class1_df.shape[0] == master_df.shape[0]

# Shuffle and randomly undersample class 0 down to the class 1 count
class0_df = class0_df.sample(frac=1, random_state=13).reset_index(drop=True)
class0_df = class0_df.sample(n=class1_df.shape[0], random_state=13)  # 50/50 split

# Reconstitute the balanced dataset, then shuffle the whole thing
balanced_df = pd.concat([class1_df, class0_df], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=13).reset_index(drop=True)

# Create random train/validation sets and save to csv
train_df = balanced_df.sample(frac=0.90, random_state=13)  # random_state is a seed value
validation_df = balanced_df.drop(train_df.index)
test_df = None  # TODO: add if we require or want this, possibly for CV; for now train/val are fine.
assert train_df.shape[0] + validation_df.shape[0] == balanced_df.shape[0]

train_df.to_csv("../src/training.csv", index=False, header=True)
validation_df.to_csv("../src/validation.csv", index=False, header=True)
# test_df.to_csv("../src/test.csv", index=False, header=True)
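
For reference, a minimal sketch of how a downstream loader might consume the training.csv this patch writes; the parse_targets helper is an assumption for illustration, not part of the patch. Because pandas writes the numpy target vector to CSV as its string repr (e.g. "[1. 0. 0. 0. 1. 1.]"), a consumer has to parse it back into floats:

# Hypothetical downstream usage sketch (names assumed, not part of this patch)
import pandas as pd

train_df = pd.read_csv("../src/training.csv")  # path as written by create_label_file.py

def parse_targets(s):
    # "[1. 0. 0. 0. 1. 1.]" -> [1.0, 0.0, 0.0, 0.0, 1.0, 1.0]
    return [float(v) for v in s.strip("[]").split()]

train_df["targets"] = train_df["targets"].apply(parse_targets)
print(train_df[["filename", "targets"]].head())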