Diff of /eda/create_label_file.py [000000] .. [fb2ce2]

Switch to side-by-side view

--- /dev/null
+++ b/eda/create_label_file.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# Custom module for dealing with global project paths and functions related to ingesting and accessing raw data
+
+import os
+import sys
+import pandas as pd
+import numpy as np
+from tqdm.auto import tqdm
+tqdm.pandas()
+
+
# Project paths and known-bad files.
CSV_DIRECTORY = "."
DATA_DIRECTORY = "../../data/stage_1_train_images/"
# Duplicate DICOM files to exclude from training.
# BUG FIX: 'ID_854fba667.dcm' was listed twice — the second entry is removed.
# NOTE(review): the double entry may be a copy-paste error where a fifth
# distinct ID was intended; confirm against the dataset's duplicate report.
DUPLICATES = ['ID_a64d5deed.dcm', 'ID_921490062.dcm', 'ID_489ae4179.dcm', 'ID_854fba667.dcm']
# Cached master label table; built below on first run, reloaded afterwards.
MASTER_CSV = os.path.join(CSV_DIRECTORY, "master_train.csv")
+
def split_targets(x):
    """Expand a row's stringified "targets" array into six float columns.

    The "targets" column round-trips through CSV as the string repr of a
    numpy array, e.g. "[1. 0. 0. 0. 0. 1.]". Strip the brackets and parse
    each entry back into its own labelled column.

    Args:
        x: a DataFrame row (or mapping) whose "targets" field is such a string.

    Returns:
        The same row with "epidural", "intraparenchymal", "intraventricular",
        "subarachnoid", "subdural" and "any" set as floats.
    """
    # BUG FIX: split() with no argument tolerates the repeated whitespace /
    # newlines numpy's repr can emit; split(" ") yields empty tokens there,
    # which made float("") raise ValueError.
    values = x["targets"].strip("[]").split()
    labels = ("epidural", "intraparenchymal", "intraventricular",
              "subarachnoid", "subdural", "any")
    for label, value in zip(labels, values):
        x[label] = float(value)
    return x
+
# Load the cached master label table if present; otherwise build it from the
# raw stage 1 training CSV and cache it to disk.
if os.path.exists(MASTER_CSV):
    master_df = pd.read_csv(MASTER_CSV)

    # "targets" round-trips through CSV as a string; re-expand it into one
    # float column per haemorrhage type.
    master_df = master_df.progress_apply(split_targets, axis=1)

else:
    train_df = pd.read_csv(os.path.join(CSV_DIRECTORY, "stage_1_train.csv"))
    # Raw IDs look like "ID_<hash>_<type>"; derive the DICOM filename and
    # the haemorrhage-type label from them.
    train_df["filename"] = train_df["ID"].apply(lambda st: "ID_" + st.split("_")[1] + ".dcm")
    train_df["type"] = train_df["ID"].apply(lambda st: st.split("_")[2])

    # Pivot to one row per image with one 0/1 column per haemorrhage type.
    master_df = train_df[["Label", "filename", "type"]]\
                    .drop_duplicates()\
                    .pivot(index="filename", columns="type", values="Label")\
                    .reset_index()

    # Pack the six per-type labels into a single numpy target vector.
    master_df["targets"] = master_df.progress_apply(
        lambda x: np.array([float(x["epidural"]),
                            float(x["intraparenchymal"]),
                            float(x["intraventricular"]),
                            float(x["subarachnoid"]),
                            float(x["subdural"]),
                            float(x["any"])]), axis=1)

    master_df["any"] = master_df.progress_apply(lambda x: float(x["any"]), axis=1)

    # Cache the master df so the expensive rebuild is skipped on later runs.
    # BUG FIX: this previously wrote "master_training.csv", which never
    # matches MASTER_CSV ("master_train.csv"), so the cache was never hit.
    master_df.to_csv(MASTER_CSV, index=False)
    print("Created and saved a master training CSV to disk. You're welcome...")
+
+# We have a master DF, lets create two sub DFs for
+class1_df = master_df.loc[master_df['any'] == 1] # 97103 class 1 (14% of the data)
+class0_df = master_df.loc[master_df['any'] == 0] # 577155 class 0
+
+
+assert class0_df.shape[0] + class1_df.shape[0] == master_df.shape[0]
+
# Randomly undersample class 0 down to the class 1 count for a 50/50 split.
# NOTE(review): the original pre-shuffled with sample(frac=1).reset_index()
# before sample(n=...); that step is redundant because sample(n=...) already
# draws a uniform random subset. Dropping it changes which exact rows the
# fixed seed selects, but not the sampling semantics.
class0_df = class0_df.sample(n=class1_df.shape[0], random_state=13)

# Reconstitute the balanced dataset, then shuffle it as a whole.
balanced_df = pd.concat([class1_df, class0_df], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=13).reset_index(drop=True)
+
# Create random train/validation splits and persist them for the training code.
train_df = balanced_df.sample(frac=0.90, random_state=13)  # random state is a seed value
validation_df = balanced_df.drop(train_df.index)
test_df = None  # TODO if we require or want this, possibly for CV, but for now train/val are fine.
assert train_df.shape[0] + validation_df.shape[0] == balanced_df.shape[0]

train_df.to_csv("../src/training.csv", index=False, header=True)
validation_df.to_csv("../src/validation.csv", index=False, header=True)
# BUG FIX (dead code): the commented-out test export previously targeted
# "../src/validation.csv", which would have clobbered the validation split
# if uncommented; corrected to its own file.
# test_df.to_csv("../src/test.csv", index=False)