Kaggle-RSNA / Git / [95f789] /src/preprocessing2.py

Models:
DavidFeaster/
Kaggle-RSNA
Downloads: 1
[95f789]: / src / preprocessing2.py
History
Download this file
153 lines (121 with data), 5.3 kB

import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from scipy import ndimage
import pydicom
import os
from tqdm import tqdm
from time import time
from joblib import Parallel, delayed
ImageFile.LOAD_TRUNCATED_IMAGES = True


data_path = "/home/lab/bac/kaggle_data/rsna/"


class CropHead(object):
    def __init__(self, offset=5):
        """
        Originally made as a image transform, but too slow to run on the fly :(
        :param offset: Pixel offset to apply to the crop so that it isn't too tight
        """
        self.offset = offset

    def __call__(self, img):
        """
        Crops a CT image to so that as much black area is removed as possible
        :param img: PIL image
        :return: Cropped image
        """
        try:
            if type(img) != np.array:
                img_array = np.array(img)

            labeled_blobs, number_of_blobs = ndimage.label(img_array)
            blob_sizes = np.bincount(labeled_blobs.flatten())
            head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1  # The number of the head blob
            head_blob = np.max(head_blob, axis=-1)

            mask = head_blob == 0
            rows = np.flatnonzero((~mask).sum(axis=1))
            cols = np.flatnonzero((~mask).sum(axis=0))

            x_min = max([rows.min() - self.offset, 0])
            x_max = min([rows.max() + self.offset + 1, img_array.shape[0]])
            y_min = max([cols.min() - self.offset, 0])
            y_max = min([cols.max() + self.offset + 1, img_array.shape[1]])

            return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max]))
        except ValueError:
            return img

    def __repr__(self):
        return self.__class__.__name__ + '(offset={})'.format(self.offset)


def window_img(dcm, width=None, level=None, norm=True):
    try:
        pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept
    except ValueError:
        return np.zeros((512, 512))

    # Pad the image if it isn't square
    if pixels.shape[0] != pixels.shape[1]:
        (a, b) = pixels.shape
        if a > b:
            padding = ((0, 0), ((a - b) // 2, (a - b) // 2))
        else:
            padding = (((b - a) // 2, (b - a) // 2), (0, 0))
        pixels = np.pad(pixels, padding, mode='constant', constant_values=0)

    if not width:
        width = dcm.WindowWidth
        if type(width) != pydicom.valuerep.DSfloat:
            width = width[0]
    if not level:
        level = dcm.WindowCenter
        if type(level) != pydicom.valuerep.DSfloat:
            level = level[0]
    lower = level - (width / 2)
    upper = level + (width / 2)
    img = np.clip(pixels, lower, upper)

    if norm:
        return (img - lower) / (upper - lower)
    else:
        return img


def dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path):
    r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
    g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
    b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
    r = window_img(r_dcm, width, level)
    g = window_img(g_dcm, width, level)
    b = window_img(b_dcm, width, level)
    img = np.stack([r, g, b], -1)
    img = (img * 255).astype(np.uint8)
    im = Image.fromarray(img)

    if crop:
        im = crop_head(im)

    im.save(os.path.join(output_path, row["green"] + ".png"))


def prepare_png_images(dataset, folder_name, width=None, level=None, crop=True):
    start = time()

    triplet_dfs = {
        "train": os.path.join(data_path, "train_triplets.csv"),
        "test_stage_1": os.path.join(data_path, "stage_1_test_triplets.csv")
    }

    image_dirs = {
        "train": os.path.join(data_path, "stage_1_train_images"),
        "test_stage_1": os.path.join(data_path, "stage_1_test_images")
    }

    output_path = os.path.join(data_path, "png", dataset, f"{folder_name}")

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    triplets = pd.read_csv(triplet_dfs[dataset])
    crop_head = CropHead()

    Parallel(n_jobs=4)(delayed(dcm_to_png)(row, image_dirs, dataset, width, level, crop, crop_head, output_path) for _, row in tqdm(
        tqdm(triplets.iterrows(), total=len(triplets), desc=dataset)
    ))

    # for _, row in tqdm(triplets.iterrows(), total=len(triplets), desc=dataset):
    #     r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["red"] + ".dcm"))
    #     g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["green"] + ".dcm"))
    #     b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row["blue"] + ".dcm"))
    #     r = window_img(r_dcm, width, level)
    #     g = window_img(g_dcm, width, level)
    #     b = window_img(b_dcm, width, level)
    #     img = np.stack([r, g, b], -1)
    #     img = (img * 255).astype(np.uint8)
    #     im = Image.fromarray(img)
    #
    #     if crop:
    #         im = crop_head(im)
    #
    #     im.save(os.path.join(output_path, row["green"] + ".png"))

    print("Done in", (time() - start) // 60, "minutes")


if __name__ == '__main__':
    prepare_png_images("train", "adjacent-brain-cropped", 80, 40, crop=True)
    # prepare_png_images("test_stage_1", "adjacent-brain-cropped", 80, 40, crop=True)