Diff of /prepare_data.py [000000] .. [7b3b92]

--- a
+++ b/prepare_data.py
@@ -0,0 +1,166 @@
+import os
+import tables
+import numpy as np
+import nibabel as nib
+from tqdm import tqdm
+from glob import glob
+from config import cfg
+
+def read_brain(brain_dir, mode='train', x0=42, x1=194, y0=29, y1=221, z0=2, z1=146):
+
+    """
+    A function that reads and crops a brain modalities (nii.gz format)
+    
+    Parameters
+    ----------
+    brain_dir : string
+        The path to a folder that contains MRI modalities of a specific brain
+    mode : string
+        'train' or 'validation' mode. The default is 'train'.
+    x0, x1, y0, y1, z0, z1 : int
+        The coordinates to crop brain volumes. For example, a brain volume with the 
+        shape [x,y,z,modalites] is cropped [x0:x1, y0:y1, z0:z1, :] to have the shape
+        [x1-x0, y1-y0, z1-z0, modalities]. One can calculate the x0,x1,... by calculating
+        none zero pixels through dataset. Note that the final three shapes must be divisible
+        by the network downscale rate.
+        
+    Returns
+    -------
+    all_modalities : array
+        The cropped modalities (+ gt if mode='train')
+    brain_affine : array
+        The affine matrix of the input brain volume
+    brain_name : str
+        The name of the input brain volume
+
+    """
+    
+    brain_dir = os.path.normpath(brain_dir)
+    flair     = glob(os.path.join(brain_dir, '*_flair*.nii.gz'))
+    t1        = glob(os.path.join(brain_dir, '*_t1*.nii.gz'))
+    t1ce      = glob(os.path.join(brain_dir, '*_t1ce*.nii.gz'))
+    t2        = glob(os.path.join(brain_dir, '*_t2*.nii.gz'))
+    
+    if mode=='train':
+        gt             = glob( os.path.join(brain_dir, '*_seg*.nii.gz'))
+        modalities_dir = [flair[0], t1[0], t1ce[0], t2[0], gt[0]]
+        
+    elif mode=='validation':
+        modalities_dir = [flair[0], t1[0], t1ce[0], t2[0]]
+
+    else:
+        raise ValueError("mode must be 'train' or 'validation', got '{}'".format(mode))
+    
+    all_modalities = []    
+    for modality in modalities_dir:      
+        nifti_file   = nib.load(modality)
+        brain_numpy  = np.asarray(nifti_file.dataobj)    
+        all_modalities.append(brain_numpy)
+        
+    # all modalities share the same affine, so we keep one of them (the last one loaded);
+    # the affine is only saved so that predictions can be written back to nii.gz later.
+    brain_affine   = nifti_file.affine
+    all_modalities = np.array(all_modalities)
+    all_modalities = np.rint(all_modalities).astype(np.int16)
+    all_modalities = all_modalities[:, x0:x1, y0:y1, z0:z1]
+    # np.transpose with no axes argument reverses the axis order: [modalities, x, y, z]
+    # becomes [z, y, x, modalities], i.e. channels-last as expected by the Keras model
+    all_modalities = np.transpose(all_modalities)
+    # tumor grade + name
+    brain_name     = os.path.basename(os.path.split(brain_dir)[0]) + '_' + os.path.basename(brain_dir) 
+
+    return all_modalities, brain_affine, brain_name
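+
+
+# A minimal sketch (not part of the original script) of how the default crop coordinates
+# above could be derived: it scans one reference modality per case and keeps the tightest
+# bounding box that still contains all nonzero voxels. Using FLAIR as the reference
+# modality and the function name itself are assumptions; dataset_dir is the same glob
+# pattern passed to create_table below (one folder per case).
+def estimate_crop_coordinates(dataset_dir):
+    x0, y0, z0 = np.inf, np.inf, np.inf
+    x1, y1, z1 = 0, 0, 0
+    for case_dir in glob(dataset_dir):
+        flair_path = glob(os.path.join(case_dir, '*_flair*.nii.gz'))[0]
+        volume     = np.asarray(nib.load(flair_path).dataobj)
+        xs, ys, zs = np.nonzero(volume)
+        x0, x1 = min(x0, xs.min()), max(x1, xs.max() + 1)
+        y0, y1 = min(y0, ys.min()), max(y1, ys.max() + 1)
+        z0, z1 = min(z0, zs.min()), max(z1, zs.max() + 1)
+    # if needed, widen the box so each span is divisible by the network's downscaling rate
+    return {'x0': int(x0), 'x1': int(x1), 'y0': int(y0), 'y1': int(y1),
+            'z0': int(z0), 'z1': int(z1)}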
+    
+
+
+def create_table(dataset_dir, table_data_shape, save_dir, crop_coordinates, data_channels, k_fold=None):
+    
+    """
+    Reads and saves all brain volumes into a single table file. 
+    
+    Parameters
+    ----------
+    dataset_dir : 
+        The path to all brain volumes (ex: suppose we have a folder 'BraTS2019' that 
+        contains two HGG and LGG folders each of which contains some folders so:
+        dataset_dir="./BraTS2019/*/*")
+    table_data_shape : tuple
+        A tuple which shows the final brain volume shape in the table
+    data_channels : int
+        Number of data channels/modalities
+    save_dir : str
+        The path to save table.
+    crop_coordinates : dict
+    k_fold : int
+        k-fold cross-validation
+        if specified, k .npy files will be saved. Each of these files shows the indexes of 
+        brain volumes in that fold, which will be used for training the model.
+
+    Returns
+    -------
+    None
+
+    """
+    
+    all_brains_dir = glob(dataset_dir)
+    all_brains_dir.sort()
+    
+    hdf5_file    = tables.open_file(os.path.join(save_dir, 'data.hdf5'), mode='w')
+    filters      = tables.Filters(complevel=5, complib='blosc')
+    data_shape   = tuple([0] + list(table_data_shape) + [data_channels])
+    truth_shape  = tuple([0] + list(table_data_shape))
+    affine_shape = tuple([0] + [4, 4])
+    
+    data_storage   = hdf5_file.create_earray(hdf5_file.root, 'data', tables.UInt16Atom(), shape=data_shape,
+                                               filters=filters, expectedrows=len(all_brains_dir))
+    truth_storage  = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
+                                                filters=filters, expectedrows=len(all_brains_dir))
+    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine', tables.Float32Atom(), shape=affine_shape,
+                                                filters=filters, expectedrows=len(all_brains_dir))
+     
+    brain_names = []
+    for brain_dir in tqdm(all_brains_dir):
+        all_modalities, brain_affine, brain_name = read_brain(brain_dir, mode='train', **crop_coordinates)
+        brain    = all_modalities[..., :4]
+        gt       = all_modalities[..., -1]
+        
+        # BraTS 2017, 2018 and 2019 use labels {0, 1, 2, 4}; remap label 4 to 3 so the labels are contiguous
+        gt[gt==4]  = 3
+        brain_names.append(brain_name)   
+        data_storage.append(brain[np.newaxis,...])
+        truth_storage.append(gt[np.newaxis,...])
+        affine_storage.append(brain_affine[np.newaxis,...])
+        
+    hdf5_file.create_array(hdf5_file.root, 'brain_names', obj=brain_names)
+    hdf5_file.close()
+         
+    if k_fold:
+        validation_split = 1 / k_fold  # fraction of cases held out in each fold (e.g. 0.2 for 5-fold)
+        all_HGG_names = [i for i in brain_names if 'HGG' in i]
+        all_LGG_names = [i for i in brain_names if 'LGG' in i]
+              
+        np.random.seed(100)
+        np.random.shuffle(all_HGG_names)
+        np.random.shuffle(all_LGG_names)
+              
+        HGG_val_size = int(validation_split * len(all_HGG_names))
+        LGG_val_size = int(validation_split * len(all_LGG_names))
+        
+        for fold in range(k_fold):
+            chosen_HGG_val = all_HGG_names[fold*HGG_val_size:(fold+1)*HGG_val_size]
+            chosen_LGG_val = all_LGG_names[fold*LGG_val_size:(fold+1)*LGG_val_size]
+        
+            chosen_HGG_train = [i for i in all_HGG_names if i not in chosen_HGG_val]
+            chosen_LGG_train = [i for i in all_LGG_names if i not in chosen_LGG_val]
+        
+            # saving train_idx is enough; the validation indices of a fold are simply the complement
+            train = chosen_HGG_train + chosen_LGG_train    
+            train_idx = [brain_names.index(i) for i in train]    
+            train_idx.sort()
+        
+            np.save(os.path.join(save_dir, 'fold{}_idx.npy'.format(fold)), train_idx)
+    
+   
+    
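+# A minimal sketch (not part of the original script) showing how the table and a fold
+# index file written by create_table could be read back for training. The file names
+# ('data.hdf5', 'fold{}_idx.npy') match the ones used above; the function name load_fold
+# is only illustrative.
+def load_fold(save_dir, fold=0):
+    hdf5_file = tables.open_file(os.path.join(save_dir, 'data.hdf5'), mode='r')
+    data      = hdf5_file.root.data    # shape: (n_brains, *table_data_shape, data_channels)
+    truth     = hdf5_file.root.truth   # shape: (n_brains, *table_data_shape)
+    train_idx = np.load(os.path.join(save_dir, 'fold{}_idx.npy'.format(fold)))
+    # the validation indices of this fold are the complement of train_idx
+    val_idx   = np.setdiff1d(np.arange(data.shape[0]), train_idx)
+    return hdf5_file, data, truth, train_idx, val_idx
+
+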
+if __name__ == '__main__':
+      
+    create_table(cfg['data_dir'], cfg['table_data_shape'], cfg['save_data_dir'], 
+                 cfg['crop_coord'], cfg['data_channels'], cfg['k_fold'])
+    
+
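+# For reference, an illustrative cfg (an assumption; the actual values live in config.py).
+# With the default crop in read_brain and the axis-reversing transpose, each stored volume
+# has spatial shape (z1-z0, y1-y0, x1-x0) = (144, 192, 152):
+#
+# cfg = {'data_dir':         './BraTS2019/*/*',
+#        'table_data_shape': (144, 192, 152),
+#        'save_data_dir':    './data/',
+#        'crop_coord':       {'x0': 42, 'x1': 194, 'y0': 29, 'y1': 221, 'z0': 2, 'z1': 146},
+#        'data_channels':    4,
+#        'k_fold':           5}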