--- a +++ b/src/preprocessing2.py @@ -0,0 +1,152 @@ +import numpy as np +import pandas as pd +from PIL import Image, ImageFile +from scipy import ndimage +import pydicom +import os +from tqdm import tqdm +from time import time +from joblib import Parallel, delayed +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +data_path = "/home/lab/bac/kaggle_data/rsna/" + + +class CropHead(object): + def __init__(self, offset=5): + """ + Originally made as a image transform, but too slow to run on the fly :( + :param offset: Pixel offset to apply to the crop so that it isn't too tight + """ + self.offset = offset + + def __call__(self, img): + """ + Crops a CT image to so that as much black area is removed as possible + :param img: PIL image + :return: Cropped image + """ + try: + if type(img) != np.array: + img_array = np.array(img) + + labeled_blobs, number_of_blobs = ndimage.label(img_array) + blob_sizes = np.bincount(labeled_blobs.flatten()) + head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1 # The number of the head blob + head_blob = np.max(head_blob, axis=-1) + + mask = head_blob == 0 + rows = np.flatnonzero((~mask).sum(axis=1)) + cols = np.flatnonzero((~mask).sum(axis=0)) + + x_min = max([rows.min() - self.offset, 0]) + x_max = min([rows.max() + self.offset + 1, img_array.shape[0]]) + y_min = max([cols.min() - self.offset, 0]) + y_max = min([cols.max() + self.offset + 1, img_array.shape[1]]) + + return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max])) + except ValueError: + return img + + def __repr__(self): + return self.__class__.__name__ + '(offset={})'.format(self.offset) + + +def window_img(dcm, width=None, level=None, norm=True): + try: + pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept + except ValueError: + return np.zeros((512, 512)) + + # Pad the image if it isn't square + if pixels.shape[0] != pixels.shape[1]: + (a, b) = pixels.shape + if a > b: + padding = ((0, 0), ((a - b) // 2, (a - b) // 2)) + else: + padding = (((b - a) // 2, (b - a) // 2), (0, 0)) + pixels = np.pad(pixels, padding, mode='constant', constant_values=0) + + if not width: + width = dcm.WindowWidth + if type(width) != pydicom.valuerep.DSfloat: + width = width[0] + if not level: + level = dcm.WindowCenter + if type(level) != pydicom.valuerep.DSfloat: + level = level[0] + lower = level - (width / 2) + upper = level + (width / 2) + img = np.clip(pixels, lower, upper) + + if norm: + return (img - lower) / (upper - lower) + else: + return img + + +def dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path): + r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm")) + g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm")) + b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm")) + r = window_img(r_dcm, width, level) + g = window_img(g_dcm, width, level) + b = window_img(b_dcm, width, level) + img = np.stack([r, g, b], -1) + img = (img * 255).astype(np.uint8) + im = Image.fromarray(img) + + if crop: + im = crop_head(im) + + im.save(os.path.join(output_path, row["green"] + ".png")) + + +def prepare_png_images(dataset, folder_name, width=None, level=None, crop=True): + start = time() + + triplet_dfs = { + "train": os.path.join(data_path, "train_triplets.csv"), + "test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv") + } + + image_dirs = { + "train": os.path.join(data_path, "stage_1_train_images"), + "test_stage_1": os.path.join(data_path, "stage_1_test_images") + } + + output_path = os.path.join(data_path, "png", dataset, f"{folder_name}") + + if not os.path.exists(output_path): + os.makedirs(output_path) + + triplets = pd.read_csv(triplet_dfs[dataset]) + crop_head = CropHead() + + Parallel(n_jobs=4)(delayed(dcm_to_png)(row, image_dirs, dataset, width, level, crop, crop_head, output_path) for _, row in tqdm( + tqdm(triplets.iterrows(), total=len(triplets), desc=dataset) + )) + + # for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset): + # r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm")) + # g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm")) + # b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm")) + # r = window_img(r_dcm, width, level) + # g = window_img(g_dcm, width, level) + # b = window_img(b_dcm, width, level) + # img = np.stack([r, g, b], -1) + # img = (img * 255).astype(np.uint8) + # im = Image.fromarray(img) + # + # if crop: + # im = crop_head(im) + # + # im.save(os.path.join(output_path, row["green"] + ".png")) + + print("Done in", (time() - start) // 60, "minutes") + + +if __name__ == '__main__': + prepare_png_images("train", "adjacent-brain-cropped", 80, 40, crop=True) + # prepare_png_images("test_stage_1", "adjacent-brain-cropped", 80, 40, crop=True)