Diff of /model/data_loader.py [000000] .. [39fb2b]

--- a
+++ b/model/data_loader.py
@@ -0,0 +1,278 @@
+import os
+
+from PIL import Image
+import torch
+from torch.utils.data import Dataset, DataLoader
+import torchvision.transforms as transforms
+
+import pandas as pd
+import numpy as np
+
+# borrowed from http://pytorch.org/tutorials/advanced/neural_style_tutorial.html
+# and http://pytorch.org/tutorials/beginner/data_loading_tutorial.html
+# defines image loaders that specify the transforms applied to images; see the PyTorch documentation for details
+
+def get_tfms_3d(split, params):
+    if split == "train":
+        def tfms(x):
+            # x = random_3d_crop(x, params.n_crop_vox)
+            
+            x = normalize(x, params)
+            x = random_crop(x, params.n_crop_vox)
+            # batchgenerators transforms expect a batch dim and a channel dim;
+            # add these here and squeeze them off afterwards
+            x = np.expand_dims(np.expand_dims(x, 0), 0)
+            # x = transforms3d.spatial_transforms.augment_mirroring(x)[0]
+            x = np.squeeze(x)
+
+            return x
+    else:
+        def tfms(x):
+            x = normalize(x, params)
+            x = unpad(x, int(params.n_crop_vox/2))
+            return x
+    
+    return tfms
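+
+# a minimal usage sketch (assuming a `params` object with hu_min, hu_max,
+# pix_mean and n_crop_vox attributes, as used elsewhere in this file):
+#
+#   tfms = get_tfms_3d("train", params)
+#   vol = np.random.uniform(-1000, 600, size=(64, 64, 64)).astype(np.float32)
+#   out = tfms(vol)  # normalized, with n_crop_vox voxels cropped off each axis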
+
+def get_tfms(split="train", size=51):
+    if split == "train":
+        tfms = transforms.Compose([
+            # previously tried here: CenterCrop(70), Resize, RandomRotation,
+            # RandomResizedCrop and RandomAffine augmentations
+            transforms.RandomCrop(size),
+            transforms.RandomHorizontalFlip(),
+            transforms.RandomVerticalFlip(),
+            transforms.ToTensor()  # transform it into a torch tensor
+            # normalize_2d
+            ])
+    
+    else:
+        tfms = transforms.Compose([
+            transforms.CenterCrop(size),
+            # transforms.Resize((size, size)),
+            transforms.ToTensor()
+            # normalize_2d
+            ])
+    
+    return tfms
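+
+# a minimal usage sketch (the file name is hypothetical):
+#
+#   tfms = get_tfms("train", size=51)
+#   img = Image.open("slices/nodule_0.png").convert("L")
+#   x = tfms(img)  # torch.FloatTensor of shape (1, 51, 51), scaled to [0, 1]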
+
+def normalize_2d(x):
+    return x / 255
+
+def normalize(x, params=None):
+    """Window and scale Hounsfield units to roughly [0, 1], then clip."""
+    if params is None:
+        MIN_BOUND = -1000.; MAX_BOUND = 600.0; PIXEL_MEAN = .25
+    else:
+        MIN_BOUND = params.hu_min; MAX_BOUND = params.hu_max; PIXEL_MEAN = params.pix_mean
+    x = (x - MIN_BOUND) / (MAX_BOUND - MIN_BOUND)
+    x[x > (1 - PIXEL_MEAN)] = 1.
+    x[x < (0 - PIXEL_MEAN)] = 0.
+    return x
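+
+# e.g. with the default bounds, HU -1000 maps to 0.0 and HU 600 to 1.0; values
+# above 1 - PIXEL_MEAN (= 0.75) are clipped to 1 and values below
+# 0 - PIXEL_MEAN (= -0.25) are clipped to 0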
+
+def random_crop(x, num_vox=3):
+    """Crop num_vox entries off each axis, divided randomly over the two ends."""
+    starts = np.random.choice(range(num_vox), replace=True, size=(x.ndim,))
+    ends = x.shape - (num_vox - starts)
+    for i in range(x.ndim):
+        x = x.take(indices=range(starts[i], ends[i]), axis=i)
+    return x
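+
+# e.g. a (64, 64, 64) volume with num_vox=3 always comes out as (61, 61, 61);
+# the 3 dropped voxels per axis are divided randomly over the two ends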
+
+def unpad(x, n=2):
+    """
+    Skim n entries off both ends of each of the three dimensions.
+    """
+    assert isinstance(x, np.ndarray)
+    if n > 0:
+        x = x[n:-n, n:-n, n:-n]
+    return x
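+
+# e.g. unpad(np.zeros((10, 10, 10)), n=2).shape == (6, 6, 6)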
+
+class LIDCDataset(Dataset):
+    """
+    A standard PyTorch definition of Dataset which defines the functions __len__ and __getitem__.
+    """
+    def __init__(self, data_dir, transform, df, setting, params):
+        """
+        Store the filenames of the images to use and the transforms to apply to them.
+
+        Args:
+            data_dir: (string) directory containing the dataset
+            transform: callable applied to each image (a torchvision transform in
+                2D mode, or a function on numpy volumes in 3D mode)
+            df: (pd.DataFrame) dataframe with at least a "name" column and the outcome column
+            setting: experiment settings (mode3d, covar_mode, fase, outcome)
+            params: (Params) hyperparameters
+        """
+        self.setting = setting
+        self.params = params
+        self.data_dir = data_dir
+        self.transform = transform
+        self.df = df
+        self.mode3d = setting.mode3d
+        self.covar_mode = setting.covar_mode
+        self.fase = setting.fase
+
+
+        assert ("name" in df.columns)
+
+        self.name_col  = df.columns.get_loc("name")
+        self.label_col = df.columns.get_loc(setting.outcome[0])
+        self.data_cols = list(set(range(len(self.df.columns))) - 
+                                set([self.name_col, self.label_col]))
+
+        # covariate data: the part of the dataframe that is neither name nor label
+        if self.covar_mode:
+            self.data = self.df.loc[:, "t"].values
+        # if len(self.data_cols) > 0:
+        #     self.data = self.df.iloc[:,self.data_cols]
+        df['x_true'] = df.x
+
+        # apply a transformation to x to assess the robustness of the method
+        # to different size measurements
+        df['x'] = df.x + params.size_offset
+        if params.size_measurement == 'area':
+            pass
+        elif params.size_measurement == 'diameter':
+            df['x'] = df.x.values ** (1/2)
+        elif params.size_measurement == 'volume':
+            df['x'] = df.x.values ** (3/2)
+        else:
+            raise ValueError(f"don't know how to measure size as {params.size_measurement}; pick area, diameter or volume")
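+        # the exponents presumably treat x as an area: for a disc with
+        # diameter d, area scales as d^2 and volume as d^3, so
+        # diameter ~ area^(1/2) and volume ~ area^(3/2)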
+        # standardize x so that the MSE is comparable whatever measurement is used
+        df['x'] = (df.x - df.x.mean()) / df.x.std()
+
+
+    def __len__(self):
+        # return size of dataset
+        return self.df.shape[0]
+
+    def __getitem__(self, idx):
+        """
+        Fetch index idx image and labels from dataset. Perform transforms on image.
+
+        Args:
+            idx: (int) index in [0, 1, ..., size_of_dataset-1]
+
+        Returns:
+            image: (Tensor) transformed image
+            label: (int) corresponding label of image
+        """
+        # image = Image.open(self.fpath_dict[self.idx_to_id[idx]]).convert("RGB")  # PIL image
+        if self.mode3d:
+            image = np.load(os.path.join(self.data_dir, self.df.iloc[idx, self.name_col]))
+            image = image.astype(np.float32)
+            image = self.transform(image)
+            image = torch.from_numpy(image).unsqueeze(0)
+
+        else:
+            img_name = os.path.join(self.data_dir, 
+                                    self.df.iloc[idx, self.name_col])
+            image = Image.open(img_name).convert("L")  # "L" for grayscale; use "RGB" for resnet compatibility
+            image = self.transform(image)
+
+        label = torch.from_numpy(np.array(self.df.iloc[idx, self.label_col], dtype = np.float32))
+
+        sample = {"image": image, 'label': label}
+        
+        for variable in ["x", "y", "z", "t", 'x_true']:
+            if variable in self.df.columns:
+                sample[variable] = self.df[variable].values[idx].astype(np.float32)
+
+        if self.setting.fase == "feature":
+            sample[self.setting.outcome[0]] = self.df[self.setting.outcome[0]].values[idx].astype(np.float32)
+
+        return sample
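+
+# a minimal sketch of what __getitem__ returns (hypothetical setup):
+#
+#   ds = LIDCDataset("slices", get_tfms("train"), df, setting, params)
+#   sample = ds[0]
+#   sample["image"]  # torch.Tensor
+#   sample["label"]  # scalar torch.Tensor
+#   sample["x"], sample["x_true"]  # float covariates, when present in df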
+
+def fetch_dataloader(args, params, setting, types = ["train"], df = None):
+    """
+    Fetches the DataLoader object for each type in types from data_dir.
+
+    Args:
+        types: (list) has one or more of 'train', 'val', 'test' depending on which data is required
+        data_dir: (string) directory containing the dataset
+        df: pandas dataframe containing at least name, label and split
+        params: (Params) hyperparameters
+
+    Returns:
+        data: (dict) contains the DataLoader object for each type in types
+    """
+    if setting.gen_model == "":
+        if setting.mode3d:
+            data_dir = "data"
+        else:
+            data_dir = "slices"
+    else:
+        data_dir = os.path.join(setting.home, "data")
+
+    if df is None:
+        df = pd.read_csv(os.path.join(data_dir, "labels.csv"))
+    dataloaders = {}
+
+    # make sure the dataframe has a fresh consecutive index
+    df_cols = df.columns
+    df = df.reset_index()
+    df = df[df_cols]
+
+    try:
+        assert setting.outcome[0] in df.columns
+    except AssertionError:
+        print(f"outcome {setting.outcome[0]} not in df.columns:")
+        print("\n".join(df.columns))
+        raise
+
+    if "split" in df.columns:
+        splits = [x for x in types if x in df.split.unique().tolist()]
+    else:
+        df["split"] = types[0]
+        splits = types
+
+    df_grp = df.groupby("split")
+
+    for split, df_split in df_grp:
+        df_split = df_split.drop("split", axis = 1)
+        if split in types:
+            path = data_dir
+            if setting.mode3d:
+                tfms = get_tfms_3d(split, params)
+            else:
+                tfms = get_tfms(split, params.size)
+
+            # shuffle only the training data; the split-specific transforms were chosen above
+            dl = DataLoader(LIDCDataset(path, tfms, df_split, setting, params),
+                            batch_size=params.batch_size,
+                            num_workers=params.num_workers,
+                            shuffle=(split == "train"),
+                            pin_memory=params.cuda)
+
+            dataloaders[split] = dl
+
+    return dataloaders
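+
+# a minimal usage sketch (assuming `args`, `params` and `setting` objects as
+# configured elsewhere in this repo):
+#
+#   dataloaders = fetch_dataloader(args, params, setting, types=["train", "val"])
+#   for batch in dataloaders["train"]:
+#       images, labels = batch["image"], batch["label"]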