Diff of /src/data_loader.py [000000] .. [fb2ce2]

Switch to side-by-side view

--- a
+++ b/src/data_loader.py
@@ -0,0 +1,179 @@
+import numpy as np
+import pandas as pd
+import pydicom
+import os
+
+from datetime import datetime
+
+from math import ceil, floor, log
+import cv2
+
+import tensorflow as tf
+import tensorflow.keras as keras
+import keras as K
+
+
+def correct_dcm(dcm):
+    """Shift and wrap the pixel values of ``dcm`` in place.
+
+    Every pixel is increased by 1000; values that reach 4096 or more are
+    reduced by 4096. The result is written back to ``dcm.PixelData`` and
+    ``dcm.RescaleIntercept`` is set to -1000.
+
+    Args:
+        dcm: object exposing ``pixel_array``, ``PixelData`` and
+            ``RescaleIntercept`` (a pydicom dataset elsewhere in this file).
+    """
+    px_mode = 4096
+    shifted = dcm.pixel_array + 1000
+    # Build the overflow mask once and reuse it for read and write.
+    overflow = shifted >= px_mode
+    shifted[overflow] = shifted[overflow] - px_mode
+    dcm.PixelData = shifted.tobytes()
+    dcm.RescaleIntercept = -1000
+
+def window_image(dcm, window_center, window_width):
+    """Return the rescaled pixel data of ``dcm`` clipped to a window.
+
+    ``correct_dcm`` is applied first when ``BitsStored`` is 12,
+    ``PixelRepresentation`` is 0 and ``int(RescaleIntercept)`` is above -100.
+
+    Args:
+        dcm: dataset providing ``pixel_array``, ``RescaleSlope``,
+            ``RescaleIntercept``, ``BitsStored`` and ``PixelRepresentation``.
+        window_center: centre of the window.
+        window_width: full width of the window; half of it (floor division)
+            is taken on each side of the centre.
+
+    Returns:
+        ``pixel_array * RescaleSlope + RescaleIntercept`` clipped to
+        ``[center - width // 2, center + width // 2]``.
+    """
+    needs_correction = (
+        dcm.BitsStored == 12
+        and dcm.PixelRepresentation == 0
+        and int(dcm.RescaleIntercept) > -100
+    )
+    if needs_correction:
+        correct_dcm(dcm)
+
+    half_width = window_width // 2
+    rescaled = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
+    return np.clip(rescaled, window_center - half_width, window_center + half_width)
+
+def bsb_window(dcm):
+    """Stack three windowed, normalised views of ``dcm`` as image channels.
+
+    The windows (center, width) are (40, 80), (80, 200) and (40, 380), held
+    in the original code as ``brain``, ``subdural`` and ``soft``. Each
+    clipped image has the window's lower bound subtracted and is divided by
+    the window width.
+
+    Args:
+        dcm: dataset accepted by ``window_image``.
+
+    Returns:
+        Array whose leading channel axis has been moved last via
+        ``transpose(1, 2, 0)``.
+    """
+    # (center, width) per output channel, in channel order.
+    windows = ((40, 80), (80, 200), (40, 380))
+    channels = []
+    for center, width in windows:
+        clipped = window_image(dcm, center, width)
+        lower = center - width // 2  # 0, -20 and -150 respectively
+        channels.append((clipped - lower) / width)
+
+    return np.array(channels).transpose(1, 2, 0)
+
+# %% [code]
+def window_with_correction(dcm, window_center, window_width):
+    """Window ``dcm`` after conditionally applying ``correct_dcm``.
+
+    The correction runs only when ``BitsStored`` is 12,
+    ``PixelRepresentation`` is 0 and ``int(RescaleIntercept)`` exceeds -100.
+
+    Args:
+        dcm: dataset providing ``pixel_array``, ``RescaleSlope``,
+            ``RescaleIntercept``, ``BitsStored`` and ``PixelRepresentation``.
+        window_center: centre of the clipping window.
+        window_width: full width of the clipping window.
+
+    Returns:
+        The slope/intercept-rescaled pixels clipped to the window.
+    """
+    twelve_bit = dcm.BitsStored == 12
+    if twelve_bit and dcm.PixelRepresentation == 0 and int(dcm.RescaleIntercept) > -100:
+        correct_dcm(dcm)
+
+    lower = window_center - window_width // 2
+    upper = window_center + window_width // 2
+    values = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
+    return np.clip(values, lower, upper)
+
+def window_without_correction(dcm, window_center, window_width):
+    """Window ``dcm`` using its pixel data as-is (no ``correct_dcm`` step).
+
+    Args:
+        dcm: dataset providing ``pixel_array``, ``RescaleSlope`` and
+            ``RescaleIntercept``.
+        window_center: centre of the clipping window.
+        window_width: full width of the clipping window.
+
+    Returns:
+        ``pixel_array * RescaleSlope + RescaleIntercept`` clipped to
+        ``[center - width // 2, center + width // 2]``.
+    """
+    half_width = window_width // 2
+    bounds = (window_center - half_width, window_center + half_width)
+    values = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
+    return np.clip(values, *bounds)
+
+def window_testing(img, window):
+    """Build the three-channel windowed image with a pluggable window function.
+
+    Args:
+        img: object passed unchanged as the first argument of ``window``.
+        window: callable ``window(img, center, width)`` returning an array;
+            it is called with (40, 80), (80, 200) and (40, 380), in that order.
+
+    Returns:
+        The three results, each shifted by its lower bound (0, -20, -150) and
+        divided by its width, stacked and transposed with ``(1, 2, 0)``.
+    """
+    # (center, width, lower bound) for each channel, in channel order.
+    specs = ((40, 80, 0), (80, 200, -20), (40, 380, -150))
+
+    # All window calls happen before any normalisation, as in the original.
+    clipped = [window(img, center, width) for center, width, _ in specs]
+    scaled = [
+        (channel - lower) / width
+        for channel, (_, width, lower) in zip(clipped, specs)
+    ]
+
+    return np.array(scaled).transpose(1, 2, 0)
+
+
+# %% [code]
+def _read(path, desired_size):
+    """Load one DICOM file as a windowed image resized for the model.
+
+    Used by ``DataGenerator``.
+
+    Args:
+        path: location of the file, passed to ``pydicom.dcmread``.
+        desired_size: target shape. The full tuple is the shape of the
+            all-zero placeholder used when windowing fails; its first two
+            entries are handed to ``cv2.resize`` as ``dsize``.
+
+    Returns:
+        The output of ``cv2.resize`` (linear interpolation) applied to the
+        windowed image, or to the zero placeholder if windowing failed.
+
+    Raises:
+        Whatever ``pydicom.dcmread`` raises for an unreadable file; only the
+        windowing step is best-effort.
+    """
+    dcm = pydicom.dcmread(path)
+
+    try:
+        img = bsb_window(dcm)
+    except Exception:
+        # Best effort: a slice that cannot be windowed becomes a blank image.
+        # Catch Exception rather than using a bare ``except`` so that
+        # KeyboardInterrupt and SystemExit still propagate.
+        img = np.zeros(desired_size)
+
+    return cv2.resize(img, desired_size[:2], interpolation=cv2.INTER_LINEAR)
+
+
+# %% [code]
+class DataGenerator(K.utils.Sequence):
+    """Keras ``Sequence`` that yields batches of windowed DICOM images.
+
+    With ``labels`` set, each batch is ``(X, Y)`` and the sample order is
+    re-drawn (undersampled and shuffled) by ``on_epoch_end``. Without
+    labels, each batch is ``X`` alone and every ID is visited in order.
+
+    Args:
+        list_IDs: indexable collection of image IDs; the file for an ID is
+            ``img_dir + ID + ".dcm"``.
+        labels: optional DataFrame indexed by ID. Its first column drives
+            the undersampling; each row must hold 6 values.
+        batch_size: maximum number of samples per batch.
+        img_size: shape of one sample as stored in ``X``.
+        img_dir: string prefix prepended to every ID (no separator is added).
+        *args, **kwargs: accepted for call compatibility and ignored.
+    """
+
+    def __init__(self, list_IDs, labels=None, batch_size=1, img_size=(512, 512, 1),
+                 img_dir="", *args, **kwargs):
+        # Let the Keras base class initialise its own state.
+        super().__init__()
+
+        self.list_IDs = list_IDs
+        self.labels = labels
+        self.batch_size = batch_size
+        self.img_size = img_size
+        self.img_dir = img_dir
+        self.on_epoch_end()
+
+    def __len__(self):
+        """Return the number of batches, counting a shorter final batch."""
+        return int(ceil(len(self.indices) / self.batch_size))
+
+    def __getitem__(self, index):
+        """Return batch ``index``: ``(X, Y)`` with labels, otherwise ``X``."""
+        start = index * self.batch_size
+        indices = self.indices[start:start + self.batch_size]
+        list_IDs_temp = [self.list_IDs[k] for k in indices]
+        return self.__data_generation(list_IDs_temp)
+
+    def on_epoch_end(self):
+        """Recompute ``self.indices`` for the next pass over the data."""
+        if self.labels is not None:  # for training phase we undersample and shuffle
+            # keep probability of any=0 and any=1
+            keep_prob = self.labels.iloc[:, 0].map({0: 0.35, 1: 0.5})
+            keep = (keep_prob > np.random.rand(len(keep_prob)))
+            self.indices = np.arange(len(self.list_IDs))[keep]
+            np.random.shuffle(self.indices)
+        else:
+            self.indices = np.arange(len(self.list_IDs))
+
+    def __data_generation(self, list_IDs_temp):
+        """Read the images (and labels, if any) for one batch of IDs."""
+        # Size the arrays to the IDs actually supplied. The final batch can
+        # be shorter than batch_size, and np.empty would otherwise leave
+        # uninitialised rows at the end of X and Y.
+        n_samples = len(list_IDs_temp)
+        X = np.empty((n_samples, *self.img_size))
+        Y = None
+        if self.labels is not None:  # training phase
+            Y = np.empty((n_samples, 6), dtype=np.float32)
+
+        for i, ID in enumerate(list_IDs_temp):
+            X[i,] = _read(self.img_dir+ID+".dcm", self.img_size)
+            if Y is not None:
+                Y[i,] = self.labels.loc[ID].values
+
+        if Y is None:  # test phase
+            return X
+        return X, Y
+
+
+def read_testset(filename="../../stage_1_sample_submission.csv"):
+    """Load a submission-style CSV and pivot it to one row per image.
+
+    The ``ID`` column is split into ``Image`` (its first 12 characters) and
+    ``Diagnosis`` (everything from character 13 on).
+
+    Args:
+        filename: path of a CSV with ``ID`` and ``Label`` columns.
+
+    Returns:
+        DataFrame indexed by ``Image`` whose columns are a
+        ``("Label", <diagnosis>)`` MultiIndex.
+    """
+    table = pd.read_csv(filename)
+    ids = table["ID"]
+    table["Image"] = ids.str[:12]
+    table["Diagnosis"] = ids.str[13:]
+
+    long_form = table[["Label", "Diagnosis", "Image"]]
+    return long_form.set_index(["Image", "Diagnosis"]).unstack()
+
+# Row labels that are removed as duplicates when no explicit list is given.
+# They are specific to the default training CSV.
+_STAGE_1_DUPLICATE_ROWS = (
+    1598538, 1598539, 1598540, 1598541, 1598542, 1598543,
+    312468,  312469,  312470,  312471,  312472,  312473,
+    2708700, 2708701, 2708702, 2708703, 2708704, 2708705,
+    3032994, 3032995, 3032996, 3032997, 3032998, 3032999,
+)
+
+
+def read_trainset(filename="../../data/stage_1_train.csv", duplicates_to_remove=None):
+    """Load a training CSV, drop known duplicate rows, pivot to one row per image.
+
+    The ``ID`` column is split into ``Image`` (its first 12 characters) and
+    ``Diagnosis`` (everything from character 13 on).
+
+    Args:
+        filename: path of a CSV with ``ID`` and ``Label`` columns.
+        duplicates_to_remove: row labels (positions in the freshly read CSV)
+            to drop before pivoting. ``None`` uses the built-in list for the
+            default training file; pass an empty list to drop nothing.
+
+    Returns:
+        DataFrame indexed by ``Image`` whose columns are a
+        ``("Label", <diagnosis>)`` MultiIndex.
+
+    Raises:
+        KeyError: if any requested row label is not present in the CSV.
+    """
+    if duplicates_to_remove is None:
+        duplicates_to_remove = _STAGE_1_DUPLICATE_ROWS
+
+    df = pd.read_csv(filename)
+    df["Image"] = df["ID"].str.slice(stop=12)
+    df["Diagnosis"] = df["ID"].str.slice(start=13)
+
+    # The pivot below fails on repeated (Image, Diagnosis) pairs, so the
+    # duplicate rows must be removed first.
+    df = df.drop(index=list(duplicates_to_remove))
+    df = df.reset_index(drop=True)
+
+    df = df.loc[:, ["Label", "Diagnosis", "Image"]]
+    df = df.set_index(['Image', 'Diagnosis']).unstack(level=-1)
+
+    return df