"""Dataset and sampler classes for the RSNA intracranial hemorrhage data."""
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
import os
import torch
from torch.utils.data import Dataset, Sampler
import sys
# The project package is imported from a machine-specific checkout, so the
# path must be appended before the project import below.
sys.path.append("/home/anjum/PycharmProjects/kaggle")
# sys.path.append("/home/anjum/rsna_code")  # GCP
from rsna_intracranial_hemorrhage_detection.data_prep import linear_windowing, sigmoid_windowing
# Ask PIL to load truncated image files rather than raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True


# Root folder holding the competition CSV files and the png/npy image trees.
data_path = "/mnt/storage_dimm2/kaggle_data/rsna-intracranial-hemorrhage-detection/"
# data_path = "/home/anjum/rsna_data/"  # GCP


class ICHDataset(Dataset):
    """Image/label dataset built from the competition CSV files.

    Each item is ``(image, label_tensor)``. For the ``"train"`` dataset the
    label comes from the CSV; for the test datasets a dummy ``[0]`` label is
    returned so the item shape stays the same.
    """

    def __init__(self, dataset, phase=1, image_filter=None, transforms=None, image_folder=None, png=True):
        """Read the label CSV and work out where the images live.

        :param dataset: One of ``"train"``, ``"test1"`` or ``"test2"``.
            Any other value raises ``KeyError``.
        :param phase: Selects the label columns. ``0`` keeps every pivoted
            column, ``1`` keeps only ``"any"`` (as a column vector), any
            other value keeps the five subtype columns.
        :param image_filter: Optional index labels passed to ``.loc`` to
            restrict the pivoted table to a subset of image ids.
        :param transforms: Optional callable applied to each loaded image.
        :param image_folder: Sub-folder below the per-dataset image folder.
            ``None`` means the images sit directly in that folder.
        :param png: Read ``.png`` files when true, ``.npy`` files otherwise.
        """
        df_paths = {
            "train": os.path.join(data_path, "stage_1_train.csv"),
            "test1": os.path.join(data_path, "stage_1_sample_submission.csv"),
            "test2": os.path.join(data_path, "stage_2_sample_submission.csv")
        }
        split_dirs = {
            "train": "train",
            "test1": "test_stage_1",
            "test2": "test_stage_2"
        }

        self.png = png
        self.dataset = dataset
        self.phase = phase
        self.transforms = transforms

        # The png and npy trees share one layout, so only the format folder
        # differs. os.path.join() rejects None, so the declared default
        # image_folder=None is handled by leaving that component out.
        path_parts = [data_path, "png" if self.png else "npy", split_dirs[dataset]]
        if image_folder is not None:
            path_parts.append(image_folder)
        self.image_dir = os.path.join(*path_parts)

        self.df = pd.read_csv(df_paths[dataset]).drop_duplicates()
        # The first 12 characters of ID are the image id and everything from
        # index 13 onwards is the diagnosis name (index 12 is presumably a
        # separator character - confirm against the CSV).
        self.df['ImageID'] = self.df['ID'].str.slice(stop=12)
        self.df['Diagnosis'] = self.df['ID'].str.slice(start=13)
        # One row per image, one column per diagnosis.
        self.df_pivot = self.df.pivot(index="ImageID", columns="Diagnosis", values="Label")

        if image_filter is not None:
            self.df_pivot = self.df_pivot.loc[image_filter]

        if self.phase == 0:
            self.labels = self.df_pivot.values
        elif self.phase == 1:
            self.labels = self.df_pivot["any"].values.reshape(-1, 1)
        else:
            self.labels = self.df_pivot[["epidural", "intraparenchymal",
                                         "intraventricular", "subarachnoid", "subdural"]].values

        self.image_ids = self.df_pivot.index.values
        # Per-column mean of the labels.
        self.class_weights = np.mean(self.labels, axis=0)

    def load_image(self, image_name):
        """Load one image by id and apply linear windowing.

        :param image_name: File name without extension, relative to
            ``self.image_dir``.
        :return: The result of ``linear_windowing``, or an all-zero
            ``(512, 512, 3)`` uint8 array when a ``.npy`` image has a
            zero-sized first or second dimension.
        """
        # NOTE(review): the second value is passed as the third positional
        # argument of linear_windowing; it looks like a window level/centre
        # rather than a length - confirm against data_prep.
        # window_width, window_length = 80, 40  # Brain window
        window_width, window_length = 200, 80  # Subdural window
        # window_width, window_length = 130, 50  # Subdural window

        if self.png:
            # Context manager so the file handle is released deterministically.
            with Image.open(os.path.join(self.image_dir, image_name+".png")) as pil_img:
                img = np.array(pil_img.convert("RGB"))
            return linear_windowing(img, window_width, window_length)
            # return sigmoid_windowing(img, window_width, window_length)
        else:
            img = np.load(os.path.join(self.image_dir, image_name+".npy"))

            # PIL doesn't work with 16-bit RGB images :(
            # Should be ok though since the useful HU interval is between 0-255
            if img.shape[0] == 0 or img.shape[1] == 0:
                # Empty array on disk: substitute a blank image.
                return np.zeros(shape=(512, 512, 3), dtype=np.uint8)
            else:
                # return np.clip(img, 0, 255).astype(np.uint8)  # Use this with the Windowing module
                return linear_windowing(img, window_width, window_length)
                # return sigmoid_windowing(img, window_width, window_length)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        The label is a float32 tensor: the CSV labels for ``"train"``,
        otherwise a single-element zero placeholder.
        """
        img_id = self.image_ids[idx]
        img = self.load_image(img_id)

        if self.transforms is not None:
            img = self.transforms(img)

        if self.dataset == "train":
            return img, torch.tensor(self.labels[idx], dtype=torch.float32)
        else:
            return img, torch.tensor([0], dtype=torch.float32)

    def __len__(self):
        """Return the number of images in the (optionally filtered) table."""
        return len(self.image_ids)


class BalancedRandomSampler(Sampler):
    def __init__(self, data_source):
        """
        Balances the negative and positive samples.
        All of the positive samples are used, but a random subset of
        the negative samples are used to create a 50:50 dataset.

        Positive/negative is decided by the first label column only.
        If there are fewer negatives than positives, the negatives are drawn
        with replacement so the 50:50 ratio is kept. If there are no
        negatives at all, only the positives are yielded.
        :param data_source: An ICHDataset
        """
        # NOTE(review): the Sampler base-class constructor signature differs
        # between PyTorch releases - confirm this call against the installed
        # version.
        super().__init__(data_source)
        self.labels = data_source.labels
        self.ids_pos = np.where(self.labels[:, 0] == 1)[0]
        self.ids_neg = np.where(self.labels[:, 0] == 0)[0]

    def _num_neg_samples(self):
        """Return how many negative indices one pass over the sampler draws."""
        if self.ids_neg.shape[0] == 0:
            return 0
        return self.ids_pos.shape[0]

    def __iter__(self):
        """Yield a freshly shuffled, balanced sequence of dataset indices."""
        n_neg = self._num_neg_samples()
        if n_neg:
            # Drawing without replacement raises ValueError when more samples
            # are requested than exist, so only then fall back to replacement.
            oversample = n_neg > self.ids_neg.shape[0]
            ids_neg_sampled = np.random.choice(self.ids_neg, n_neg, replace=oversample)
        else:
            ids_neg_sampled = self.ids_neg[:0]
        ids = np.concatenate([self.ids_pos, ids_neg_sampled])
        np.random.shuffle(ids)
        return iter(ids)

    def __len__(self):
        """Return the number of indices produced by one pass of ``__iter__``."""
        return self.ids_pos.shape[0] + self._num_neg_samples()