Diff of /src/preprocessing2.py [000000] .. [95f789]

Switch to unified view

a b/src/preprocessing2.py
1
import numpy as np
2
import pandas as pd
3
from PIL import Image, ImageFile
4
from scipy import ndimage
5
import pydicom
6
import os
7
from tqdm import tqdm
8
from time import time
9
from joblib import Parallel, delayed
10
ImageFile.LOAD_TRUNCATED_IMAGES = True
11
12
13
data_path = "/home/lab/bac/kaggle_data/rsna/"
14
15
16
class CropHead(object):
17
    def __init__(self, offset=5):
18
        """
19
        Originally made as a image transform, but too slow to run on the fly :(
20
        :param offset: Pixel offset to apply to the crop so that it isn't too tight
21
        """
22
        self.offset = offset
23
24
    def __call__(self, img):
25
        """
26
        Crops a CT image to so that as much black area is removed as possible
27
        :param img: PIL image
28
        :return: Cropped image
29
        """
30
        try:
31
            if type(img) != np.array:
32
                img_array = np.array(img)
33
34
            labeled_blobs, number_of_blobs = ndimage.label(img_array)
35
            blob_sizes = np.bincount(labeled_blobs.flatten())
36
            head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1  # The number of the head blob
37
            head_blob = np.max(head_blob, axis=-1)
38
39
            mask = head_blob == 0
40
            rows = np.flatnonzero((~mask).sum(axis=1))
41
            cols = np.flatnonzero((~mask).sum(axis=0))
42
43
            x_min = max([rows.min() - self.offset, 0])
44
            x_max = min([rows.max() + self.offset + 1, img_array.shape[0]])
45
            y_min = max([cols.min() - self.offset, 0])
46
            y_max = min([cols.max() + self.offset + 1, img_array.shape[1]])
47
48
            return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max]))
49
        except ValueError:
50
            return img
51
52
    def __repr__(self):
53
        return self.__class__.__name__ + '(offset={})'.format(self.offset)
54
55
56
def window_img(dcm, width=None, level=None, norm=True):
57
    try:
58
        pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept
59
    except ValueError:
60
        return np.zeros((512, 512))
61
62
    # Pad the image if it isn't square
63
    if pixels.shape[0] != pixels.shape[1]:
64
        (a, b) = pixels.shape
65
        if a > b:
66
            padding = ((0, 0), ((a - b) // 2, (a - b) // 2))
67
        else:
68
            padding = (((b - a) // 2, (b - a) // 2), (0, 0))
69
        pixels = np.pad(pixels, padding, mode='constant', constant_values=0)
70
71
    if not width:
72
        width = dcm.WindowWidth
73
        if type(width) != pydicom.valuerep.DSfloat:
74
            width = width[0]
75
    if not level:
76
        level = dcm.WindowCenter
77
        if type(level) != pydicom.valuerep.DSfloat:
78
            level = level[0]
79
    lower = level - (width / 2)
80
    upper = level + (width / 2)
81
    img = np.clip(pixels, lower, upper)
82
83
    if norm:
84
        return (img - lower) / (upper - lower)
85
    else:
86
        return img
87
88
89
def dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path):
90
    r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
91
    g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
92
    b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
93
    r = window_img(r_dcm, width, level)
94
    g = window_img(g_dcm, width, level)
95
    b = window_img(b_dcm, width, level)
96
    img = np.stack([r, g, b], -1)
97
    img = (img * 255).astype(np.uint8)
98
    im = Image.fromarray(img)
99
100
    if crop:
101
        im = crop_head(im)
102
103
    im.save(os.path.join(output_path, row["green"] + ".png"))
104
105
106
def prepare_png_images(dataset, folder_name, width=None, level=None, crop=True):
107
    start = time()
108
109
    triplet_dfs = {
110
        "train": os.path.join(data_path, "train_triplets.csv"),
111
        "test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv")
112
    }
113
114
    image_dirs = {
115
        "train": os.path.join(data_path, "stage_1_train_images"),
116
        "test_stage_1": os.path.join(data_path, "stage_1_test_images")
117
    }
118
119
    output_path = os.path.join(data_path, "png", dataset, f"{folder_name}")
120
121
    if not os.path.exists(output_path):
122
        os.makedirs(output_path)
123
124
    triplets = pd.read_csv(triplet_dfs[dataset])
125
    crop_head = CropHead()
126
127
    Parallel(n_jobs=4)(delayed(dcm_to_png)(row, image_dirs, dataset, width, level, crop, crop_head, output_path) for _, row in tqdm(
128
        tqdm(triplets.iterrows(), total=len(triplets), desc=dataset)
129
    ))
130
131
    # for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset):
132
    #     r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
133
    #     g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
134
    #     b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
135
    #     r = window_img(r_dcm, width, level)
136
    #     g = window_img(g_dcm, width, level)
137
    #     b = window_img(b_dcm, width, level)
138
    #     img = np.stack([r, g, b], -1)
139
    #     img = (img * 255).astype(np.uint8)
140
    #     im = Image.fromarray(img)
141
    #
142
    #     if crop:
143
    #         im = crop_head(im)
144
    #
145
    #     im.save(os.path.join(output_path, row["green"] + ".png"))
146
147
    print("Done in", (time() - start) // 60, "minutes")
148
149
150
if __name__ == '__main__':
151
    prepare_png_images("train", "adjacent-brain-cropped", 80, 40, crop=True)
152
    # prepare_png_images("test_stage_1", "adjacent-brain-cropped", 80, 40, crop=True)