|
prepare_data.py

import os
import tables
import numpy as np
import nibabel as nib
from tqdm import tqdm
from glob import glob
from config import cfg
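
# Prepare the BraTS data: crop every case, stack its modalities, and write everything into a
# single compressed HDF5 table (plus optional k-fold index files) for training.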
|
|
def read_brain(brain_dir, mode='train', x0=42, x1=194, y0=29, y1=221, z0=2, z1=146):
    """
    Read and crop the MRI modalities of a single brain (nii.gz format).

    Parameters
    ----------
    brain_dir : str
        The path to a folder that contains the MRI modalities of a specific brain.
    mode : str
        'train' or 'validation' mode. The default is 'train'.
    x0, x1, y0, y1, z0, z1 : int
        The coordinates used to crop the brain volumes. For example, a brain volume with
        shape [x, y, z, modalities] is cropped to [x0:x1, y0:y1, z0:z1, :], giving the shape
        [x1-x0, y1-y0, z1-z0, modalities]. The coordinates can be found by locating the
        non-zero voxels across the dataset. Note that each of the three cropped dimensions
        must be divisible by the network's downscale rate.

    Returns
    -------
    all_modalities : ndarray
        The cropped modalities (plus the ground truth if mode='train').
    brain_affine : ndarray
        The affine matrix of the input brain volume.
    brain_name : str
        The name of the input brain volume.
    """

    brain_dir = os.path.normpath(brain_dir)
    flair = glob(os.path.join(brain_dir, '*_flair*.nii.gz'))
    t1 = glob(os.path.join(brain_dir, '*_t1*.nii.gz'))
    # '*_t1*' also matches the t1ce file, so drop any t1ce match from the t1 list
    t1 = [f for f in t1 if 't1ce' not in os.path.basename(f)]
    t1ce = glob(os.path.join(brain_dir, '*_t1ce*.nii.gz'))
    t2 = glob(os.path.join(brain_dir, '*_t2*.nii.gz'))

    if mode == 'train':
        gt = glob(os.path.join(brain_dir, '*_seg*.nii.gz'))
        modalities_dir = [flair[0], t1[0], t1ce[0], t2[0], gt[0]]
    elif mode == 'validation':
        modalities_dir = [flair[0], t1[0], t1ce[0], t2[0]]
    else:
        raise ValueError("mode must be 'train' or 'validation'")

    all_modalities = []
    for modality in modalities_dir:
        nifti_file = nib.load(modality)
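        # read the voxel data through nibabel's array proxy (.dataobj) instead of
        # get_fdata(), avoiding the cached float64 copy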
        brain_numpy = np.asarray(nifti_file.dataobj)
        all_modalities.append(brain_numpy)

    # All modalities share the same affine, so we keep one of them (the last one here);
    # the affine is only saved so that predicted nii.gz files can be written back later.
    brain_affine = nifti_file.affine
    all_modalities = np.array(all_modalities)
    all_modalities = np.rint(all_modalities).astype(np.int16)
    all_modalities = all_modalities[:, x0:x1, y0:y1, z0:z1]
    # Put the channel axis last to fit Keras channels-last models; np.transpose with no
    # axes reverses every axis, so the spatial order also becomes (z, y, x).
    all_modalities = np.transpose(all_modalities)
    # brain_name = tumor grade + case name, e.g. 'HGG_<case folder>'
    brain_name = os.path.basename(os.path.split(brain_dir)[0]) + '_' + os.path.basename(brain_dir)

    return all_modalities, brain_affine, brain_name
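

# A minimal usage sketch (the case path below is hypothetical). With the default crop the
# block is 152 x 192 x 144 voxels, and the transpose above makes it channels-last:
#   modalities, affine, name = read_brain('./BraTS2019/HGG/BraTS19_SOME_CASE_1', mode='train')
#   modalities.shape  # (144, 192, 152, 5) -> (z, y, x, [flair, t1, t1ce, t2, seg])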


def create_table(dataset_dir, table_data_shape, save_dir, crop_coordinates, data_channels, k_fold=None):
    """
    Read all brain volumes and save them into a single HDF5 table file.

    Parameters
    ----------
    dataset_dir : str
        A glob pattern covering all brain folders (for example, for a 'BraTS2019' folder
        that contains HGG and LGG subfolders, each holding one folder per case, use
        dataset_dir="./BraTS2019/*/*").
    table_data_shape : tuple
        The spatial shape of each cropped brain volume as stored in the table.
    save_dir : str
        The directory in which to save the table.
    crop_coordinates : dict
        The crop coordinates (x0, x1, y0, y1, z0, z1) passed to read_brain.
    data_channels : int
        Number of data channels/modalities.
    k_fold : int, optional
        Number of folds for k-fold cross-validation. If specified, k .npy files are saved;
        each file holds the indexes of the brain volumes used for training in that fold.

    Returns
    -------
    None
    """

    all_brains_dir = glob(dataset_dir)
    all_brains_dir.sort()

    hdf5_file = tables.open_file(os.path.join(save_dir, 'data.hdf5'), mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0] + list(table_data_shape) + [data_channels])
    truth_shape = tuple([0] + list(table_data_shape))
    affine_shape = tuple([0] + [4, 4])
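
    # The leading 0 marks the extendable dimension: the EArrays below grow one brain at a
    # time via append(), compressed with the blosc filters defined above.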
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data', tables.UInt16Atom(), shape=data_shape,
                                           filters=filters, expectedrows=len(all_brains_dir))
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=len(all_brains_dir))
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine', tables.Float32Atom(), shape=affine_shape,
                                             filters=filters, expectedrows=len(all_brains_dir))
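
    # Crop every case and append it to the table row by row; the last channel returned by
    # read_brain(mode='train') is the segmentation, which goes into the 'truth' array.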
    brain_names = []
    for brain_dir in tqdm(all_brains_dir):
        all_modalities, brain_affine, brain_name = read_brain(brain_dir, mode='train', **crop_coordinates)
        brain = all_modalities[..., :4]
        gt = all_modalities[..., -1]

        # BraTS 2017/2018/2019 use labels {0, 1, 2, 4}; remap label 4 to 3 so the labels are contiguous
        gt[gt == 4] = 3
        brain_names.append(brain_name)
        data_storage.append(brain[np.newaxis, ...])
        truth_storage.append(gt[np.newaxis, ...])
        affine_storage.append(brain_affine[np.newaxis, ...])

    hdf5_file.create_array(hdf5_file.root, 'brain_names', obj=brain_names)
    hdf5_file.close()

    if k_fold:
        validation_split = 1 / k_fold  # e.g. 0.2 for 5-fold cross-validation
        all_HGG_names = [i for i in brain_names if 'HGG' in i]
        all_LGG_names = [i for i in brain_names if 'LGG' in i]
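
        # fixed seed so that the shuffles (and therefore the folds) are reproducible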
        np.random.seed(100)
        np.random.shuffle(all_HGG_names)
        np.random.shuffle(all_LGG_names)

        HGG_val_size = int(validation_split * len(all_HGG_names))
        LGG_val_size = int(validation_split * len(all_LGG_names))
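
        # Each fold takes a different contiguous slice of the shuffled HGG/LGG names as its
        # validation set (stratified by tumor grade); only the training indexes are saved.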
        for fold in range(k_fold):
            chosen_HGG_val = all_HGG_names[fold*HGG_val_size:(fold+1)*HGG_val_size]
            chosen_LGG_val = all_LGG_names[fold*LGG_val_size:(fold+1)*LGG_val_size]

            chosen_HGG_train = [i for i in all_HGG_names if i not in chosen_HGG_val]
            chosen_LGG_train = [i for i in all_LGG_names if i not in chosen_LGG_val]

            # saving the training indexes is enough; the validation indexes are their complement
            train = chosen_HGG_train + chosen_LGG_train
            train_idx = [brain_names.index(i) for i in train]
            train_idx.sort()

            np.save(os.path.join(save_dir, 'fold{}_idx.npy'.format(fold)), train_idx)


if __name__ == '__main__':

    create_table(cfg['data_dir'], cfg['table_data_shape'], cfg['save_data_dir'],
                 cfg['crop_coord'], cfg['data_channels'], cfg['k_fold'])
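
# For reference, config.py is expected to define cfg with the keys used above. A purely
# illustrative example (values here are assumptions, not taken from this repository's config):
#   cfg = {'data_dir': './BraTS2019/*/*',
#          'table_data_shape': (144, 192, 152),
#          'save_data_dir': './data/',
#          'crop_coord': {'x0': 42, 'x1': 194, 'y0': 29, 'y1': 221, 'z0': 2, 'z1': 146},
#          'data_channels': 4,
#          'k_fold': 5}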
|
|