{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['stage_2_test_triplets.csv',\n",
       " 'train_triplets.csv',\n",
       " 'png',\n",
       " 'stage_2_test',\n",
       " 'stage_2_train',\n",
       " 'stage_2_train.csv',\n",
       " 'test_metadata.parquet.gzip',\n",
       " 'stage_2_sample_submission.csv',\n",
       " 'train_metadata.parquet.gzip']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from PIL import Image, ImageFile\n",
    "from scipy import ndimage\n",
    "\n",
    "import os\n",
    "from time import time\n",
    "from joblib import Parallel, delayed\n",
    "ImageFile.LOAD_TRUNCATED_IMAGES = True\n",
    "\n",
    "import numpy as np\n",
    "import pydicom\n",
    "from scipy import ndimage\n",
    "from PIL import Image, ImageFile\n",
    "import matplotlib.pylab as plt\n",
    "from tqdm import tqdm_notebook, tqdm\n",
    "%matplotlib inline\n",
    "\n",
    "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'\n",
    "TRAIN_DIR = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/stage_2_train'\n",
    "TEST_DIR = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/stage_2_test'\n",
    "os.listdir(base_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_dicom(dcm, width=None, level=None, norm=True):\n",
    "    \"\"\"\n",
    "    Converts a DICOM object to a 16-bit Numpy array (in Hounsfield units)\n",
    "    :param dcm: DICOM Object\n",
    "    :return: Numpy array in int16\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        # https://www.kaggle.com/jhoward/cleaning-the-data-for-rapid-prototyping-fastai\n",
    "        if dcm.BitsStored == 12 and dcm.PixelRepresentation == 0 and dcm.RescaleIntercept > -100:\n",
    "            x = dcm.pixel_array + 1000\n",
    "            px_mode = 4096\n",
    "            x[x >= px_mode] = x[x >= px_mode] - px_mode\n",
    "            dcm.PixelData = x.tobytes()\n",
    "            dcm.RescaleIntercept = -1000\n",
    "\n",
    "        pixels = dcm.pixel_array.astype(np.float32) * dcm.RescaleSlope + dcm.RescaleIntercept\n",
    "    except ValueError as e:\n",
    "        print(\"ValueError with\", dcm.SOPInstanceUID, e)\n",
    "        return np.zeros((512, 512))\n",
    "\n",
    "    # Pad the image if it isn't square\n",
    "    if pixels.shape[0] != pixels.shape[1]:\n",
    "        (a, b) = pixels.shape\n",
    "        if a > b:\n",
    "            padding = ((0, 0), ((a - b) // 2, (a - b) // 2))\n",
    "        else:\n",
    "            padding = (((b - a) // 2, (b - a) // 2), (0, 0))\n",
    "        pixels = np.pad(pixels, padding, mode='constant', constant_values=0)\n",
    "        \n",
    "    if not width:\n",
    "        width = dcm.WindowWidth\n",
    "        if type(width) != pydicom.valuerep.DSfloat:\n",
    "            width = width[0]\n",
    "    if not level:\n",
    "        level = dcm.WindowCenter\n",
    "        if type(level) != pydicom.valuerep.DSfloat:\n",
    "            level = level[0]\n",
    "    lower = level - (width / 2)\n",
    "    upper = level + (width / 2)\n",
    "    img = np.clip(pixels, lower, upper)\n",
    "\n",
    "    if norm:\n",
    "        return (img - lower) / (upper - lower)\n",
    "    else:\n",
    "        return img"
   ]
  },
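  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The windowing step in `prepare_dicom` can be sanity-checked without any DICOM files. The cell below is only an illustration (not part of the preprocessing pipeline): it applies the same clip-and-normalise maths to a small synthetic array of Hounsfield units, assuming the brain window (width 80, level 40) used further down."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative check of the window/level maths in prepare_dicom,\n",
    "# using synthetic Hounsfield-unit values rather than a real DICOM slice\n",
    "demo_hu = np.array([[-1000.0, 0.0, 40.0], [80.0, 100.0, 3000.0]], dtype=np.float32)\n",
    "\n",
    "width, level = 80, 40        # brain window, as used later in this notebook\n",
    "lower = level - (width / 2)  # -> 0\n",
    "upper = level + (width / 2)  # -> 80\n",
    "\n",
    "windowed = np.clip(demo_hu, lower, upper)\n",
    "normalised = (windowed - lower) / (upper - lower)\n",
    "normalised                   # every value mapped into [0, 1]"
   ]
  },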
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CropHead(object):\n",
    "    def __init__(self, offset=10):\n",
    "        \"\"\"\n",
    "        Crops the head by labelling the objects in an image and keeping the second largest object (the largest object\n",
    "        is the background). This method removes most of the headrest\n",
    "\n",
    "        Originally made as a image transform for use with PyTorch, but too slow to run on the fly :(\n",
    "        :param offset: Pixel offset to apply to the crop so that it isn't too tight\n",
    "        \"\"\"\n",
    "        self.offset = offset\n",
    "\n",
    "    def crop_extents(self, img):\n",
    "        try:\n",
    "            if type(img) != np.array:\n",
    "                img_array = np.array(img)\n",
    "            else:\n",
    "                img_array = img\n",
    "\n",
    "            labeled_blobs, number_of_blobs = ndimage.label(img_array)\n",
    "            blob_sizes = np.bincount(labeled_blobs.flatten())\n",
    "            head_blob = labeled_blobs == np.argmax(blob_sizes[1:]) + 1  # The number of the head blob\n",
    "            head_blob = np.max(head_blob, axis=-1)\n",
    "\n",
    "            mask = head_blob == 0\n",
    "            rows = np.flatnonzero((~mask).sum(axis=1))\n",
    "            cols = np.flatnonzero((~mask).sum(axis=0))\n",
    "\n",
    "            x_min = max([rows.min() - self.offset, 0])\n",
    "            x_max = min([rows.max() + self.offset + 1, img_array.shape[0]])\n",
    "            y_min = max([cols.min() - self.offset, 0])\n",
    "            y_max = min([cols.max() + self.offset + 1, img_array.shape[1]])\n",
    "\n",
    "            return x_min, x_max, y_min, y_max\n",
    "        except ValueError:\n",
    "            return 0, 0, -1, -1\n",
    "\n",
    "    def __call__(self, img):\n",
    "        \"\"\"\n",
    "        Crops a CT image to so that as much black area is removed as possible\n",
    "        :param img: PIL image\n",
    "        :return: Cropped image\n",
    "        \"\"\"\n",
    "\n",
    "        x_min, x_max, y_min, y_max = self.crop_extents(img)\n",
    "\n",
    "        try:\n",
    "            if type(img) != np.array:\n",
    "                img_array = np.array(img)\n",
    "            else:\n",
    "                img_array = img\n",
    "\n",
    "            return Image.fromarray(np.uint8(img_array[x_min:x_max, y_min:y_max]))\n",
    "        except ValueError:\n",
    "            return img\n",
    "\n",
    "    def __repr__(self):\n",
    "        return self.__class__.__name__ + '(offset={})'.format(self.offset)"
   ]
  },
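  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick, self-contained check of `CropHead` on a synthetic RGB image, with a bright rectangle standing in for the head. This is purely illustrative and uses none of the competition data; it just shows how `crop_extents` keeps the largest non-background blob and pads the crop by `offset`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic RGB \"scan\": a bright rectangle on a black background\n",
    "demo = np.zeros((128, 128, 3), dtype=np.uint8)\n",
    "demo[40:90, 30:100, :] = 255\n",
    "\n",
    "cropper = CropHead(offset=10)\n",
    "print(\"crop extents:\", cropper.crop_extents(demo))  # -> (30, 100, 20, 110)\n",
    "\n",
    "cropped = cropper(Image.fromarray(demo))\n",
    "print(\"original size:\", demo.shape[:2], \"-> cropped size:\", cropped.size)"
   ]
  },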
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "crop_head = CropHead()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path):\n",
    "    r_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row[\"red\"] + \".dcm\"))\n",
    "    g_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row[\"green\"] + \".dcm\"))\n",
    "    b_dcm = pydicom.dcmread(os.path.join(image_dirs[dataset], row[\"blue\"] + \".dcm\"))\n",
    "    r = prepare_dicom(r_dcm, width, level)\n",
    "    g = prepare_dicom(g_dcm, width, level)\n",
    "    b = prepare_dicom(b_dcm, width, level)\n",
    "    img = np.stack([r, g, b], -1)\n",
    "    img = (img * 255).astype(np.uint8)\n",
    "    im = Image.fromarray(img)\n",
    "\n",
    "    if crop:\n",
    "        im = crop_head(im)\n",
    "\n",
    "    im.save(os.path.join(output_path, row[\"green\"] + \".png\"))"
   ]
  },
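  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`dcm_to_png` needs three DICOM files on disk, but the channel-stacking step can be illustrated on its own. In the sketch below the random arrays merely stand in for `prepare_dicom` outputs on three adjacent slices; the point is how they become the red, green and blue channels of a single 8-bit image, which is then saved under the name of the middle (green) slice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stand-ins for prepare_dicom output on three adjacent slices,\n",
    "# each a windowed image scaled to [0, 1]\n",
    "r = np.random.rand(64, 64).astype(np.float32)\n",
    "g = np.random.rand(64, 64).astype(np.float32)\n",
    "b = np.random.rand(64, 64).astype(np.float32)\n",
    "\n",
    "rgb = np.stack([r, g, b], -1)       # (64, 64, 3), floats in [0, 1]\n",
    "rgb = (rgb * 255).astype(np.uint8)  # 8-bit, ready for PNG\n",
    "Image.fromarray(rgb)                # the RGB image that would be saved as <green>.png"
   ]
  },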
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_png_images(dataset, folder_name, width=None, level=None, crop=True):\n",
    "    start = time()\n",
    "\n",
    "    triplet_dfs = {\n",
    "        \"train\": os.path.join(base_url, \"train_triplets.csv\"),\n",
    "        \"test_stage_2\": os.path.join(base_url, \"stage_2_test_triplets.csv\")\n",
    "    }\n",
    "\n",
    "    image_dirs = {\n",
    "        \"train\": os.path.join(base_url, \"stage_2_train\"),\n",
    "        \"test_stage_2\": os.path.join(base_url, \"stage_2_test\")\n",
    "    }\n",
    "\n",
    "    output_path = os.path.join(base_url, \"png\", dataset, f\"{folder_name}\")\n",
    "\n",
    "    if not os.path.exists(output_path):\n",
    "        os.makedirs(output_path)\n",
    "\n",
    "    triplets = pd.read_csv(triplet_dfs[dataset])\n",
    "    crop_head = CropHead()\n",
    "    for _, row in tqdm(triplets.iterrows()):\n",
    "        dcm_to_png(row, image_dirs, dataset, width, level, crop, crop_head, output_path)\n",
    "\n",
    "\n",
    "    print(\"Done in\", (time() - start) // 60, \"minutes\")\n"
   ]
  },
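  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`Parallel` and `delayed` are imported at the top but the loop in `prepare_png_images` runs serially. The sketch below is one possible joblib-parallelised variant under the same CSV and directory layout; `n_jobs=4` and the default process backend are assumptions rather than tuned values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_png_images_parallel(dataset, folder_name, width=None, level=None, crop=True, n_jobs=4):\n",
    "    \"\"\"Sketch of a joblib-parallelised variant of prepare_png_images\"\"\"\n",
    "    start = time()\n",
    "\n",
    "    triplet_dfs = {\n",
    "        \"train\": os.path.join(base_url, \"train_triplets.csv\"),\n",
    "        \"test_stage_2\": os.path.join(base_url, \"stage_2_test_triplets.csv\")\n",
    "    }\n",
    "\n",
    "    image_dirs = {\n",
    "        \"train\": os.path.join(base_url, \"stage_2_train\"),\n",
    "        \"test_stage_2\": os.path.join(base_url, \"stage_2_test\")\n",
    "    }\n",
    "\n",
    "    output_path = os.path.join(base_url, \"png\", dataset, folder_name)\n",
    "    os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "    triplets = pd.read_csv(triplet_dfs[dataset])\n",
    "    crop_head = CropHead()\n",
    "\n",
    "    # One dcm_to_png call per triplet row, dispatched across worker processes\n",
    "    Parallel(n_jobs=n_jobs)(\n",
    "        delayed(dcm_to_png)(row, image_dirs, dataset, width, level, crop, crop_head, output_path)\n",
    "        for _, row in triplets.iterrows()\n",
    "    )\n",
    "\n",
    "    print(\"Done in\", (time() - start) // 60, \"minutes\")"
   ]
  },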
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prepare_png_images(\"train\", \"adjacent-brain-cropped\", 80, 40, crop=True)\n",
    "# prepare_png_images(\"test_stage_1\", \"adjacent-brain-cropped\", 80, 40, crop=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}