Diff of /data_prep.py [000000] .. [77dc1e]

Switch to unified view

a b/data_prep.py
1
import numpy as np
2
import pandas as pd
3
from PIL import Image, ImageFile
4
from scipy import ndimage
5
import pydicom
6
import os
7
from tqdm import tqdm
8
from time import time
9
ImageFile.LOAD_TRUNCATED_IMAGES = True
10
11
12
data_path = "/mnt/storage_dimm2/kaggle_data/rsna-intracranial-hemorrhage-detection/"
13
14
15
def get_metadata(image_dir):
16
17
    labels = [
18
        'BitsAllocated', 'BitsStored', 'Columns', 'HighBit',
19
        'ImageOrientationPatient_0', 'ImageOrientationPatient_1', 'ImageOrientationPatient_2',
20
        'ImageOrientationPatient_3', 'ImageOrientationPatient_4', 'ImageOrientationPatient_5',
21
        'ImagePositionPatient_0', 'ImagePositionPatient_1', 'ImagePositionPatient_2',
22
        'Modality', 'PatientID', 'PhotometricInterpretation', 'PixelRepresentation',
23
        'PixelSpacing_0', 'PixelSpacing_1', 'RescaleIntercept', 'RescaleSlope', 'Rows', 'SOPInstanceUID',
24
        'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', 'StudyInstanceUID',
25
        'WindowCenter', 'WindowWidth', 'Image',
26
    ]
27
28
    data = {l: [] for l in labels}
29
30
    for image in tqdm(os.listdir(image_dir)):
31
        data["Image"].append(image[:-4])
32
33
        ds = pydicom.dcmread(os.path.join(image_dir, image))
34
35
        for metadata in ds.dir():
36
            if metadata != "PixelData":
37
                metadata_values = getattr(ds, metadata)
38
                if type(metadata_values) == pydicom.multival.MultiValue and metadata not in ["WindowCenter", "WindowWidth"]:
39
                    for i, v in enumerate(metadata_values):
40
                        data[f"{metadata}_{i}"].append(v)
41
                else:
42
                    if type(metadata_values) == pydicom.multival.MultiValue and metadata in ["WindowCenter", "WindowWidth"]:
43
                        data[metadata].append(metadata_values[0])
44
                    else:
45
                        data[metadata].append(metadata_values)
46
47
    return pd.DataFrame(data).set_index("Image")
48
49
50
def build_triplets(metadata):
51
    metadata.sort_values(by="ImagePositionPatient_2", inplace=True, ascending=False)
52
    studies = metadata.groupby("StudyInstanceUID")
53
    triplets = []
54
55
    for study_name, study_df in tqdm(studies):
56
        padded_names = np.pad(study_df.index, (1, 1), 'edge')
57
58
        for i, img in enumerate(padded_names[1:-1]):
59
            t = [padded_names[i], img, padded_names[i + 2]]
60
            triplets.append(t)
61
62
    return pd.DataFrame(triplets, columns=["red", "green", "blue"])
63
64
65
class CropHead(object):
66
    def __init__(self, offset=10):
67
        """
68
        Crops the head by labelling the objects in an image and keeping the second largest object (the largest object
69
        is the background). This method removes most of the headrest
70
71
        Originally made as a image transform for use with PyTorch, but too slow to run on the fly :(
72
        :param offset: Pixel offset to apply to the crop so that it isn't too tight
73
        """
74
        self.offset = offset
75
76
    def crop_extents(self, img):
77
        try:
78
            if type(img) != np.array:
79
                img_array = np.array(img)
80
            else:
81
                img_array = img
82
83
            labeled_blobs, number_of_blobs = ndimage.label(img_array)
84
            blob_sizes = np.bincount(labeled_blobs.flatten())
85
            head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1  # The number of the head blob
86
            head_blob = np.max(head_blob, axis=-1)
87
88
            mask = head_blob == 0
89
            rows = np.flatnonzero((~mask).sum(axis=1))
90
            cols = np.flatnonzero((~mask).sum(axis=0))
91
92
            x_min = max([rows.min() - self.offset, 0])
93
            x_max = min([rows.max() + self.offset + 1, img_array.shape[0]])
94
            y_min = max([cols.min() - self.offset, 0])
95
            y_max = min([cols.max() + self.offset + 1, img_array.shape[1]])
96
97
            return x_min, x_max, y_min, y_max
98
        except ValueError:
99
            return 0, 0, -1, -1
100
101
    def __call__(self, img):
102
        """
103
        Crops a CT image to so that as much black area is removed as possible
104
        :param img: PIL image
105
        :return: Cropped image
106
        """
107
108
        x_min, x_max, y_min, y_max = self.crop_extents(img)
109
110
        try:
111
            if type(img) != np.array:
112
                img_array = np.array(img)
113
            else:
114
                img_array = img
115
116
            return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max]))
117
        except ValueError:
118
            return img
119
120
    def __repr__(self):
121
        return self.__class__.__name__ + '(offset={})'.format(self.offset)
122
123
124
def prepare_dicom(dcm, default_window=False):
125
    """
126
    Converts a DICOM object to a 16-bit Numpy array (in Housnfield units) or a uint8 image if the default window is used
127
    :param dcm: DICOM Object
128
    :param default_window: Flag to use the window settings specified in the metadata
129
    :return: Numpy array in either int16 or uint8
130
    """
131
132
    try:
133
        # https://www.kaggle.com/jhoward/cleaning-the-data-for-rapid-prototyping-fastai
134
        if dcm.BitsStored == 12 and dcm.PixelRepresentation == 0 and dcm.RescaleIntercept > -100:
135
            x = dcm.pixel_array + 1000
136
            px_mode = 4096
137
            x[x >= px_mode] = x[x >= px_mode] - px_mode
138
            dcm.PixelData = x.tobytes()
139
            dcm.RescaleIntercept = -1000
140
141
        pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept
142
    except ValueError as e:
143
        print("ValueError with", dcm.SOPInstanceUID, e)
144
        return np.zeros((512, 512))
145
146
    # Pad the image if it isn't square
147
    if pixels.shape[0] != pixels.shape[1]:
148
        (a, b) = pixels.shape
149
        if a > b:
150
            padding = ((0, 0), ((a - b) // 2, (a - b) // 2))
151
        else:
152
            padding = (((b - a) // 2, (b - a) // 2), (0, 0))
153
        pixels = np.pad(pixels, padding, mode='constant', constant_values=0)
154
155
    # Return image windows as per the metadata parameters
156
    if default_window:
157
        width = dcm.WindowWidth
158
        if type(width) != pydicom.valuerep.DSfloat:
159
            width = width[0]
160
161
        level = dcm.WindowCenter
162
        if type(level) != pydicom.valuerep.DSfloat:
163
            level = level[0]
164
165
        img_windowed = linear_windowing(pixels, level, width)
166
        return img_windowed
167
    # Return array Hounsfield units only
168
    else:
169
        return pixels.astype(np.int16)
170
171
172
def prepare_png(dataset, folder_name, channels=(0, 1, 2), crop=False):
173
    """
174
    Create PNG images using 3 specified window settings
175
    :param dataset: One of "train", "test_stage_1" or "test_stage_2"
176
    :param folder_name: Name of the output folder
177
    :param channels: Tuple to specifiy what windows to use for RGB channels
178
    :param crop: Flag to crop image to only the head
179
    :return:
180
    """
181
182
    start = time()
183
184
    image_dirs = {
185
        "train": os.path.join(data_path, "stage_1_train_images"),
186
        "test_stage_1": os.path.join(data_path, "stage_1_test_images"),
187
        "test_stage_2": os.path.join(data_path, "stage_2_test_images")
188
    }
189
190
    windows = [
191
        (None, None),  # No windowing
192
        (80, 40),  # Brain
193
        (200, 80),  # Subdural
194
        (40, 40),  # Stroke
195
        (2800, 600),  # Temporal bone
196
        (380, 40),  # Soft tissue
197
        (2000, 600),  # Bone
198
    ]
199
200
    output_path = os.path.join(data_path, "png", dataset, f"{folder_name}")
201
    crop_head = CropHead()
202
203
    if not os.path.exists(output_path):
204
        os.makedirs(output_path)
205
206
    for image_name in tqdm(os.listdir(image_dirs[dataset])):
207
        ds = pydicom.dcmread(os.path.join(image_dirs[dataset], image_name))
208
209
        rgb = []
210
        for c in channels:
211
            if c == 0:
212
                ch = prepare_dicom(ds, default_window=True)
213
            else:
214
                ch = prepare_dicom(ds)
215
                ch = linear_windowing(ch, windows[c][0], windows[c][1])
216
            rgb.append(ch)
217
218
        img = np.stack(rgb, -1)
219
220
        if crop:
221
            x_min, x_max, y_min, y_max = crop_head.crop_extents(img > 0)
222
            img = img[x_min:x_max, y_min:y_max]
223
224
            if img.shape[0] == 0 or img.shape[1] == 0:
225
                img = np.zeros(shape=(512, 512, 3), dtype=np.uint8)
226
227
        im = Image.fromarray(img.astype(np.uint8))
228
        im.save(os.path.join(output_path, image_name[:-4] + ".png"))
229
230
    print("Done in", (time() - start) // 60, "minutes")
231
232
233
def prepare_png_adjacent(dataset, folder_name, crop=True):
234
    """
235
    Prepare 3 channel adjacent images in Hounsfield units clipped between 0-255 HU
236
    The target image is the green channel. The reg and blue channels are spatially adjacent slices
237
    :param dataset: One of "train", "test_stage_1" or "test_stage_2"
238
    :param folder_name: Name of the output folder
239
    :param crop: Flag to crop image to only the head
240
    """
241
    start = time()
242
243
    triplet_dfs = {
244
        "train": os.path.join(data_path, "train_triplets.csv"),
245
        "test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv"),
246
        "test_stage_2": os.path.join(data_path, "stage_2_test_triplets.csv")
247
    }
248
249
    image_dirs = {
250
        "train": os.path.join(data_path, "stage_1_train_images"),
251
        "test_stage_1": os.path.join(data_path, "stage_1_test_images"),
252
        "test_stage_2": os.path.join(data_path, "stage_2_test_images")
253
    }
254
255
    output_path = os.path.join(data_path, "png", dataset, f"{folder_name}")
256
257
    if not os.path.exists(output_path):
258
        os.makedirs(output_path)
259
260
    triplets = pd.read_csv(triplet_dfs[dataset])
261
    crop_head = CropHead()
262
263
    for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset):
264
265
        rgb = []
266
        for ch in ["red", "green", "blue"]:
267
            dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row[ch] + ".dcm"))
268
            rgb.append(prepare_dicom(dcm))
269
270
        img = np.stack(rgb, -1)
271
        img = np.clip(img, 0, 255).astype(np.uint8)
272
273
        if crop:
274
            x_min, x_max, y_min, y_max = crop_head.crop_extents(img > 0)
275
            img = img[x_min:x_max, y_min:y_max]
276
277
            if img.shape[0] == 0 or img.shape[1] == 0:
278
                img = np.zeros(shape=(512, 512, 3), dtype=np.uint8)
279
280
        im = Image.fromarray(img)
281
        im.save(os.path.join(output_path, row["green"] + ".png"))
282
283
    print("Done in", (time() - start) // 60, "minutes")
284
285
286
def dicom_to_npy(dataset, folder_name):
287
    """
288
    Saves DICOM images as 16-bit Numpy arrays
289
    :param dataset: One of "train", "test_stage_1" or "test_stage_2"
290
    :param folder_name: Name of the output folder
291
    """
292
293
    image_dirs = {
294
        "train": os.path.join(data_path, "stage_1_train_images"),
295
        "test_stage_1": os.path.join(data_path, "stage_1_test_images"),
296
        "test_stage_2": os.path.join(data_path, "stage_2_test_images")
297
    }
298
299
    output_path = os.path.join(data_path, "npy", dataset, f"{folder_name}")
300
    print("Saving slices to", output_path)
301
302
    if not os.path.exists(output_path):
303
        os.makedirs(output_path)
304
305
    for image_name in tqdm(os.listdir(image_dirs[dataset])):
306
        dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], image_name))
307
        np.save(os.path.join(output_path, image_name[:-4]), prepare_dicom(dcm))
308
309
310
def prepare_npy_adjacent(dataset, folder_name, crop=True):
311
    """
312
    Prepare 3 channel adjacent images in Hounsfield units (unclipped)
313
    :param dataset: One of "train", "test_stage_1" or "test_stage_2"
314
    :param folder_name: Name of the output folder
315
    :param crop: Flag to crop image to only the head
316
    """
317
    start = time()
318
319
    triplet_dfs = {
320
        "train": os.path.join(data_path, "train_triplets.csv"),
321
        "test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv"),
322
        "test_stage_2": os.path.join(data_path, "stage_2_test_triplets.csv")
323
    }
324
325
    image_dirs = {
326
        "train": os.path.join(data_path, "npy", dataset, "single_hu_slices"),
327
        "test_stage_1": os.path.join(data_path, "npy", dataset, "single_hu_slices"),
328
        "test_stage_2": os.path.join(data_path, "npy", dataset, "single_hu_slices")
329
    }
330
331
    output_path = os.path.join(data_path, "npy", dataset, f"{folder_name}")
332
333
    if not os.path.exists(output_path):
334
        os.makedirs(output_path)
335
336
    triplets = pd.read_csv(triplet_dfs[dataset])
337
    crop_head = CropHead()
338
339
    for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset):
340
        r = np.load(os.path.join(image_dirs[dataset], row["red"] + ".npy"))
341
        g = np.load(os.path.join(image_dirs[dataset], row["green"] + ".npy"))
342
        b = np.load(os.path.join(image_dirs[dataset], row["blue"] + ".npy"))
343
        img = np.stack([r, g, b], -1)
344
345
        if crop:
346
            x_min, x_max, y_min, y_max = crop_head.crop_extents(img > 0)
347
            img = img[x_min:x_max, y_min:y_max]
348
349
            if img.shape[0] == 0 or img.shape[1] == 0:
350
                img = np.zeros(shape=(512, 512, 3), dtype=np.int16)
351
352
        np.save(os.path.join(output_path, row["green"]), img)
353
354
    print("Done in", (time() - start) // 60, "minutes")
355
356
357
def sigmoid(x):
358
    return 1 / (1 + np.exp(-x))
359
360
361
def linear_windowing(img, window_width, window_length):
362
    """
363
    Applies a linear window on an array
364
    :param img: Image array (in Hounsfield units)
365
    :param window_width:
366
    :param window_length:
367
    :return:
368
    """
369
    if window_width and window_length:
370
        lower = window_length - (window_width / 2)
371
        upper = window_length + (window_width / 2)
372
        img = np.clip(img, lower, upper)
373
        img = (img - lower) / (upper - lower)
374
        return (img*255).astype(np.uint8)
375
    else:
376
        return img
377
378
379
def sigmoid_windowing(img, window_width, window_length, u=255, epsilon=255):
380
    """
381
    Applies a sigmoid window on an array
382
    From Practical Window Setting Optimization for Medical Image Deep Learning https://arxiv.org/pdf/1812.00572.pdf
383
    :param img: Image array (in Hounsfield units)
384
    :param window_width:
385
    :param window_length:
386
    :param u:
387
    :param epsilon:
388
    :return:
389
    """
390
    if window_width and window_length:
391
        weight = (2 / window_width) * np.log((u / epsilon) - 1)
392
        bias = (-2 * window_length / window_width) * np.log((u / epsilon) - 1)
393
        img = u * sigmoid(weight * img + bias)
394
        return img.astype(np.uint8)
395
    else:
396
        return img
397
398
399
if __name__ == '__main__':
400
    # Generate metadata dataframes
401
    train_metadata = get_metadata(os.path.join(data_path, "stage_1_train_images"))
402
    test_metadata = get_metadata(os.path.join(data_path, "stage_1_test_images"))
403
    train_metadata.to_parquet(f'{data_path}/train_metadata.parquet.gzip', compression='gzip')
404
    test_metadata.to_parquet(f'{data_path}/stage_1_test_metadata.parquet.gzip', compression='gzip')
405
406
    # Build triplets of adjacent images
407
    train_triplets = build_triplets(train_metadata)
408
    test_triplets = build_triplets(test_metadata)
409
    train_triplets.to_csv(os.path.join(data_path, "train_triplets.csv"))
410
    test_triplets.to_csv(os.path.join(data_path, "stage_1_test_triplets.csv"))
411
412
    # Prepare adjacent images
413
    prepare_png_adjacent("train", "adjacent_hu_cropped")
414
    prepare_png_adjacent("test_stage_1", "adjacent_hu_cropped")
415
416
    # Prepare 3 window images (brain-subdural-bone)
417
    prepare_png("train", "brain-subdural-bone", channels=(1, 2, 6), crop=True)
418
    prepare_png("test_stage_1", "brain-subdural-bone", channels=(1, 2, 6), crop=True)
419
420
    # Stage 2 preparations
421
    test_metadata = get_metadata(os.path.join(data_path, "stage_2_test_images"))
422
    test_metadata.to_parquet(f'{data_path}/stage_2_test_metadata.parquet.gzip', compression='gzip')
423
    test_triplets = build_triplets(test_metadata)
424
    test_triplets.to_csv(os.path.join(data_path, "stage_2_test_triplets.csv"))
425
    prepare_png_adjacent("test_stage_2", "adjacent_hu_cropped")
426
    prepare_png("test_stage_2", "brain-subdural-bone", channels=(1, 2, 6), crop=True)