[95f789]: / src / preprocessing2.py

Download this file

153 lines (121 with data), 5.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from scipy import ndimage
import pydicom
import os
from tqdm import tqdm
from time import time
from joblib import Parallel, delayed
ImageFile.LOAD_TRUNCATED_IMAGES = True
data_path = "/home/lab/bac/kaggle_data/rsna/"
class CropHead(object):
def __init__(self, offset=5):
"""
Originally made as a image transform, but too slow to run on the fly :(
:param offset: Pixel offset to apply to the crop so that it isn't too tight
"""
self.offset = offset
def __call__(self, img):
"""
Crops a CT image to so that as much black area is removed as possible
:param img: PIL image
:return: Cropped image
"""
try:
if type(img) != np.array:
img_array = np.array(img)
labeled_blobs, number_of_blobs = ndimage.label(img_array)
blob_sizes = np.bincount(labeled_blobs.flatten())
head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1 # The number of the head blob
head_blob = np.max(head_blob, axis=-1)
mask = head_blob == 0
rows = np.flatnonzero((~mask).sum(axis=1))
cols = np.flatnonzero((~mask).sum(axis=0))
x_min = max([rows.min() - self.offset, 0])
x_max = min([rows.max() + self.offset + 1, img_array.shape[0]])
y_min = max([cols.min() - self.offset, 0])
y_max = min([cols.max() + self.offset + 1, img_array.shape[1]])
return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max]))
except ValueError:
return img
def __repr__(self):
return self.__class__.__name__ + '(offset={})'.format(self.offset)
def window_img(dcm, width=None, level=None, norm=True):
try:
pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept
except ValueError:
return np.zeros((512, 512))
# Pad the image if it isn't square
if pixels.shape[0] != pixels.shape[1]:
(a, b) = pixels.shape
if a > b:
padding = ((0, 0), ((a - b) // 2, (a - b) // 2))
else:
padding = (((b - a) // 2, (b - a) // 2), (0, 0))
pixels = np.pad(pixels, padding, mode='constant', constant_values=0)
if not width:
width = dcm.WindowWidth
if type(width) != pydicom.valuerep.DSfloat:
width = width[0]
if not level:
level = dcm.WindowCenter
if type(level) != pydicom.valuerep.DSfloat:
level = level[0]
lower = level - (width / 2)
upper = level + (width / 2)
img = np.clip(pixels, lower, upper)
if norm:
return (img - lower) / (upper - lower)
else:
return img
def dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path):
r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
r = window_img(r_dcm, width, level)
g = window_img(g_dcm, width, level)
b = window_img(b_dcm, width, level)
img = np.stack([r, g, b], -1)
img = (img * 255).astype(np.uint8)
im = Image.fromarray(img)
if crop:
im = crop_head(im)
im.save(os.path.join(output_path, row["green"] + ".png"))
def prepare_png_images(dataset, folder_name, width=None, level=None, crop=True):
start = time()
triplet_dfs = {
"train": os.path.join(data_path, "train_triplets.csv"),
"test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv")
}
image_dirs = {
"train": os.path.join(data_path, "stage_1_train_images"),
"test_stage_1": os.path.join(data_path, "stage_1_test_images")
}
output_path = os.path.join(data_path, "png", dataset, f"{folder_name}")
if not os.path.exists(output_path):
os.makedirs(output_path)
triplets = pd.read_csv(triplet_dfs[dataset])
crop_head = CropHead()
Parallel(n_jobs=4)(delayed(dcm_to_png)(row, image_dirs, dataset, width, level, crop, crop_head, output_path) for _, row in tqdm(
tqdm(triplets.iterrows(), total=len(triplets), desc=dataset)
))
# for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset):
# r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
# g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
# b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
# r = window_img(r_dcm, width, level)
# g = window_img(g_dcm, width, level)
# b = window_img(b_dcm, width, level)
# img = np.stack([r, g, b], -1)
# img = (img * 255).astype(np.uint8)
# im = Image.fromarray(img)
#
# if crop:
# im = crop_head(im)
#
# im.save(os.path.join(output_path, row["green"] + ".png"))
print("Done in", (time() - start) // 60, "minutes")
if __name__ == '__main__':
prepare_png_images("train", "adjacent-brain-cropped", 80, 40, crop=True)
# prepare_png_images("test_stage_1", "adjacent-brain-cropped", 80, 40, crop=True)