Diff of /prepare_data.py [000000] .. [7b3b92]

Switch to unified view

a b/prepare_data.py
1
import os
2
import tables
3
import numpy as np
4
import nibabel as nib
5
from tqdm import tqdm
6
from glob import glob
7
from config import cfg
8
9
def _find_modality(brain_dir, tag, exclude=()):
    """Return the path of the '*_<tag>*.nii.gz' file inside `brain_dir`.

    Matches whose file name contains any substring in `exclude` are dropped,
    and the remaining matches are sorted so the choice is deterministic.
    Raises FileNotFoundError when nothing matches.
    """
    pattern = os.path.join(brain_dir, '*_{}*.nii.gz'.format(tag))
    matches = sorted(
        path for path in glob(pattern)
        if not any(bad in os.path.basename(path) for bad in exclude)
    )
    if not matches:
        raise FileNotFoundError(
            "No '{}' volume matching {!r} was found".format(tag, pattern))
    return matches[0]


def read_brain(brain_dir, mode='train', x0=42, x1=194, y0=29, y1=221, z0=2, z1=146):
    """
    Read and crop the MRI modalities (nii.gz format) of a single brain.

    Parameters
    ----------
    brain_dir : string
        The path to a folder that contains the MRI modalities of a specific brain
    mode : string
        'train' or 'validation' mode. The default is 'train'. In 'train' mode the
        segmentation ('*_seg*') volume is loaded as an extra, last channel.
    x0, x1, y0, y1, z0, z1 : int
        The coordinates used to crop the brain volumes. The stacked volumes
        [modalities, x, y, z] are cropped as [:, x0:x1, y0:y1, z0:z1]. One can
        calculate x0, x1, ... from the non-zero voxels of the dataset. Note that
        the three spatial sizes must be divisible by the network downscale rate.

    Returns
    -------
    all_modalities : array
        The cropped modalities (+ gt if mode='train'), rounded and cast to int16.
        Because every axis is reversed by the final transpose, the shape is
        [z1-z0, y1-y0, x1-x0, modalities] (channels last).
    brain_affine : array
        The affine matrix of the input brain volume
    brain_name : str
        '<parent folder name>_<brain folder name>'

    Raises
    ------
    ValueError
        If `mode` is neither 'train' nor 'validation'.
    FileNotFoundError
        If one of the required modality files is missing from `brain_dir`.
    """
    if mode not in ('train', 'validation'):
        raise ValueError(
            "mode must be 'train' or 'validation', got {!r}".format(mode))

    brain_dir = os.path.normpath(brain_dir)

    # '*_t1*' also matches the '_t1ce' file, so it is excluded explicitly;
    # otherwise the T1 channel could silently be filled with the T1ce volume.
    modalities_dir = [
        _find_modality(brain_dir, 'flair'),
        _find_modality(brain_dir, 't1', exclude=('_t1ce',)),
        _find_modality(brain_dir, 't1ce'),
        _find_modality(brain_dir, 't2'),
    ]
    if mode == 'train':
        modalities_dir.append(_find_modality(brain_dir, 'seg'))

    all_modalities = []
    for modality in modalities_dir:
        nifti_file = nib.load(modality)
        all_modalities.append(np.asarray(nifti_file.dataobj))

    # all modalities have the same affine, so we take one of them (the last one
    # loaded); it is only kept for writing the predicted nii.gz file later.
    brain_affine = nifti_file.affine

    all_modalities = np.array(all_modalities)
    all_modalities = np.rint(all_modalities).astype(np.int16)
    all_modalities = all_modalities[:, x0:x1, y0:y1, z0:z1]
    # to fit a keras channels-last model: reverses every axis,
    # [modalities, x, y, z] -> [z, y, x, modalities]
    all_modalities = np.transpose(all_modalities)

    # tumor grade (parent folder) + name (brain folder)
    grade = os.path.basename(os.path.split(brain_dir)[0])
    brain_name = grade + '_' + os.path.basename(brain_dir)

    return all_modalities, brain_affine, brain_name
69
    
70
71
72
def create_table(dataset_dir, table_data_shape, save_dir, crop_coordinates, data_channels, k_fold=None):
    """
    Reads and saves all brain volumes into a single table file.

    Parameters
    ----------
    dataset_dir : str
        A glob pattern matching all brain folders (ex: suppose we have a folder
        'BraTS2019' that contains two HGG and LGG folders each of which contains
        some folders so: dataset_dir="./BraTS2019/*/*")
    table_data_shape : tuple
        A tuple which shows the final brain volume shape in the table
    save_dir : str
        The folder in which 'data.hdf5' (and the fold index files) are saved.
    crop_coordinates : dict
        Keyword arguments forwarded to `read_brain` (x0, x1, y0, y1, z0, z1).
    data_channels : int
        Number of data channels/modalities
    k_fold : int
        k-fold cross-validation
        if specified, k 'fold<i>_idx.npy' files will be saved. Each of these files
        holds the sorted indexes of the brain volumes used for training in that fold.

    Returns
    -------
    None
    """
    all_brains_dir = sorted(glob(dataset_dir))
    n_brains = len(all_brains_dir)

    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0] + list(table_data_shape) + [data_channels])
    truth_shape = tuple([0] + list(table_data_shape))
    affine_shape = (0, 4, 4)

    brain_names = []
    # Join save_dir and the file name properly, so a save_dir without a trailing
    # separator still puts the table inside the folder (as the fold files are).
    hdf5_file = tables.open_file(os.path.join(save_dir, 'data.hdf5'), mode='w')
    try:
        # NOTE(review): read_brain returns int16 data while the table stores
        # uint16 -- assumes intensities are non-negative; confirm.
        data_storage = hdf5_file.create_earray(
            hdf5_file.root, 'data', tables.UInt16Atom(), shape=data_shape,
            filters=filters, expectedrows=n_brains)
        truth_storage = hdf5_file.create_earray(
            hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
            filters=filters, expectedrows=n_brains)
        affine_storage = hdf5_file.create_earray(
            hdf5_file.root, 'affine', tables.Float32Atom(), shape=affine_shape,
            filters=filters, expectedrows=n_brains)

        for brain_dir in tqdm(all_brains_dir):
            all_modalities, brain_affine, brain_name = read_brain(
                brain_dir, mode='train', **crop_coordinates)
            # the last channel is the ground truth, everything before it is data
            brain = all_modalities[..., :-1]
            gt = all_modalities[..., -1]

            # in BraTS 2017, 2018, 2019 there is no '3' label!
            gt[gt == 4] = 3
            brain_names.append(brain_name)
            data_storage.append(brain[np.newaxis, ...])
            truth_storage.append(gt[np.newaxis, ...])
            affine_storage.append(brain_affine[np.newaxis, ...])

        hdf5_file.create_array(hdf5_file.root, 'brain_names', obj=brain_names)
    finally:
        # always release the file handle, even if reading a brain fails
        hdf5_file.close()

    if k_fold:
        # each fold holds out 1/k_fold of the HGG and of the LGG brains
        validation_split = 1 / k_fold
        all_HGG_names = [name for name in brain_names if 'HGG' in name]
        all_LGG_names = [name for name in brain_names if 'LGG' in name]

        np.random.seed(100)
        np.random.shuffle(all_HGG_names)
        np.random.shuffle(all_LGG_names)

        HGG_val_size = int(validation_split * len(all_HGG_names))
        LGG_val_size = int(validation_split * len(all_LGG_names))

        # first-occurrence index of every name (same result as list.index)
        name_to_idx = {}
        for idx, name in enumerate(brain_names):
            name_to_idx.setdefault(name, idx)

        for fold in range(k_fold):
            chosen_HGG_val = set(
                all_HGG_names[fold * HGG_val_size:(fold + 1) * HGG_val_size])
            chosen_LGG_val = set(
                all_LGG_names[fold * LGG_val_size:(fold + 1) * LGG_val_size])

            chosen_HGG_train = [n for n in all_HGG_names if n not in chosen_HGG_val]
            chosen_LGG_train = [n for n in all_LGG_names if n not in chosen_LGG_val]

            # saving train_idx is enough
            train = chosen_HGG_train + chosen_LGG_train
            train_idx = sorted(name_to_idx[n] for n in train)

            np.save(os.path.join(save_dir, 'fold{}_idx.npy'.format(fold)), train_idx)
158
    
159
   
160
    
161
if __name__ == '__main__':
    # Build the HDF5 table (and the optional k-fold index files) using the
    # settings read from the project configuration.
    create_table(
        dataset_dir=cfg['data_dir'],
        table_data_shape=cfg['table_data_shape'],
        save_dir=cfg['save_data_dir'],
        crop_coordinates=cfg['crop_coord'],
        data_channels=cfg['data_channels'],
        k_fold=cfg['k_fold'],
    )
165
    
166