{ "cells": [ { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import pydicom\n", "import matplotlib.image as mpimg\n", "from tqdm import tqdm_notebook\n", "import cv2\n", "import os\n", "import re\n", "from scipy import ndimage\n", "from skimage import morphology\n", "import PIL" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# Root directory of the competition data (a filesystem path, despite the name).\n", "# The trailing slash is kept so existing string concatenations keep working.\n", "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Derive the split directories from base_url so the dataset root is defined in one place.\n", "# os.path.join does not double the separator, so the values match the previous literals.\n", "TRAIN_DIR = os.path.join(base_url, 'stage_2_train')\n", "TEST_DIR = os.path.join(base_url, 'stage_2_test')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "752803\r\n" ] } ], "source": [ "! ls {TRAIN_DIR} | wc -l" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "121232\r\n" ] } ], "source": [ "! ls {TEST_DIR} | wc -l" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ID_000012eaf.dcm\n", "ID_000039fa0.dcm\n", "ID_00005679d.dcm\n", "ID_00008ce3c.dcm\n", "ID_0000950d7.dcm\n", "ls: write error: Broken pipe\n" ] } ], "source": [ "! ls {TRAIN_DIR} | head -n 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare the labels & metadata\n", "The metadata was extracted beforehand using pydicom. This takes a while so I saved the results in these parquet files so they don't need to be generated each time." ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Diagnosis | \n", "any | \n", "epidural | \n", "intraparenchymal | \n", "intraventricular | \n", "subarachnoid | \n", "subdural | \n", "
---|---|---|---|---|---|---|
ImageID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
ID_000012eaf | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ID_000039fa0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ID_00005679d | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ID_00008ce3c | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ID_0000950d7 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
Image | \n", "ID_d45673798 | \n", "ID_74cfe18f9 | \n", "ID_d7e2f42ee | \n", "ID_e6c5352ea | \n", "ID_688b50fa0 | \n", "ID_ef18adb45 | \n", "ID_94544b40d | \n", "ID_2a019f628 | \n", "ID_9bffe2b90 | \n", "ID_7ccdde5eb | \n", "... | \n", "ID_72b376d48 | \n", "ID_bf64dc996 | \n", "ID_7355aedc3 | \n", "ID_89ce8ad00 | \n", "ID_3a25fd051 | \n", "ID_529052515 | \n", "ID_f540aa7fb | \n", "ID_5f0c548d7 | \n", "ID_2bc16380c | \n", "ID_c161feeeb | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
BitsAllocated | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "... | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "16 | \n", "
BitsStored | \n", "16 | \n", "16 | \n", "12 | \n", "12 | \n", "16 | \n", "12 | \n", "16 | \n", "16 | \n", "12 | \n", "16 | \n", "... | \n", "16 | \n", "16 | \n", "12 | \n", "12 | \n", "12 | \n", "16 | \n", "16 | \n", "16 | \n", "12 | \n", "16 | \n", "
Columns | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "... | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "
HighBit | \n", "15 | \n", "15 | \n", "11 | \n", "11 | \n", "15 | \n", "11 | \n", "15 | \n", "15 | \n", "11 | \n", "15 | \n", "... | \n", "15 | \n", "15 | \n", "11 | \n", "11 | \n", "11 | \n", "15 | \n", "15 | \n", "15 | \n", "11 | \n", "15 | \n", "
ImageOrientationPatient_0 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
ImageOrientationPatient_1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ImageOrientationPatient_2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ImageOrientationPatient_3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
ImageOrientationPatient_4 | \n", "0.945519 | \n", "1 | \n", "1 | \n", "1 | \n", "0.993572 | \n", "1 | \n", "0.927184 | \n", "0.906308 | \n", "0.981627 | \n", "1 | \n", "... | \n", "1 | \n", "0.927184 | \n", "1 | \n", "1 | \n", "1 | \n", "0.927184 | \n", "0.992546 | \n", "0.992546 | \n", "1 | \n", "0.95882 | \n", "
ImageOrientationPatient_5 | \n", "-0.325568 | \n", "0 | \n", "0 | \n", "0 | \n", "-0.113203 | \n", "0 | \n", "-0.374607 | \n", "-0.422618 | \n", "-0.190809 | \n", "0 | \n", "... | \n", "0 | \n", "-0.374607 | \n", "0 | \n", "0 | \n", "0 | \n", "-0.374607 | \n", "-0.121869 | \n", "-0.121869 | \n", "0 | \n", "-0.284015 | \n", "
ImagePositionPatient_0 | \n", "-125 | \n", "-126.409 | \n", "-117 | \n", "-120 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "... | \n", "-126.409 | \n", "-125 | \n", "-125 | \n", "-113 | \n", "-114.5 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "-125 | \n", "
ImagePositionPatient_1 | \n", "-123.39 | \n", "-126.409 | \n", "-3 | \n", "4 | \n", "-114.296 | \n", "-18 | \n", "-95.198 | \n", "-114.888 | \n", "25.315 | \n", "-88.753 | \n", "... | \n", "-126.409 | \n", "-107.598 | \n", "-7 | \n", "5 | \n", "-1.5 | \n", "-64.598 | \n", "-119.368 | \n", "-124.568 | \n", "-8 | \n", "-119.852 | \n", "
ImagePositionPatient_2 | \n", "72.533 | \n", "72.5 | \n", "-123.5 | \n", "151.3 | \n", "68.7595 | \n", "123.8 | \n", "21.3178 | \n", "43.925 | \n", "190.972 | \n", "-6.642 | \n", "... | \n", "17.5 | \n", "157.373 | \n", "154.7 | \n", "148.8 | \n", "726.3 | \n", "119.517 | \n", "108.062 | \n", "4.56384 | \n", "127.9 | \n", "140.066 | \n", "
Modality | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "... | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "CT | \n", "
PatientID | \n", "ID_815113f2 | \n", "ID_2cec0d8a | \n", "ID_85f4970c | \n", "ID_7dcb798f | \n", "ID_37d50406 | \n", "ID_7c193c9e | \n", "ID_520fd258 | \n", "ID_9481b02b | \n", "ID_d96101fa | \n", "ID_2f572f12 | \n", "... | \n", "ID_96f03221 | \n", "ID_9e730fee | \n", "ID_d55f931c | \n", "ID_c1ee6b75 | \n", "ID_e9fe4085 | \n", "ID_3c45cca3 | \n", "ID_1a9dfcf1 | \n", "ID_f52de020 | \n", "ID_cb69cfb2 | \n", "ID_ec0c2ac1 | \n", "
PhotometricInterpretation | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "... | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "MONOCHROME2 | \n", "
PixelRepresentation | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "
PixelSpacing_0 | \n", "0.488281 | \n", "0.494751 | \n", "0.488281 | \n", "0.46875 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "... | \n", "0.494751 | \n", "0.488281 | \n", "0.488281 | \n", "0.441406 | \n", "0.447266 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "
PixelSpacing_1 | \n", "0.488281 | \n", "0.494751 | \n", "0.488281 | \n", "0.46875 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "... | \n", "0.494751 | \n", "0.488281 | \n", "0.488281 | \n", "0.441406 | \n", "0.447266 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "0.488281 | \n", "
RescaleIntercept | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "... | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "-1024 | \n", "
RescaleSlope | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
Rows | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "... | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "512 | \n", "
SOPInstanceUID | \n", "ID_d45673798 | \n", "ID_74cfe18f9 | \n", "ID_d7e2f42ee | \n", "ID_e6c5352ea | \n", "ID_688b50fa0 | \n", "ID_ef18adb45 | \n", "ID_94544b40d | \n", "ID_2a019f628 | \n", "ID_9bffe2b90 | \n", "ID_7ccdde5eb | \n", "... | \n", "ID_72b376d48 | \n", "ID_bf64dc996 | \n", "ID_7355aedc3 | \n", "ID_89ce8ad00 | \n", "ID_3a25fd051 | \n", "ID_529052515 | \n", "ID_f540aa7fb | \n", "ID_5f0c548d7 | \n", "ID_2bc16380c | \n", "ID_c161feeeb | \n", "
SamplesPerPixel | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
SeriesInstanceUID | \n", "ID_8149e17d30 | \n", "ID_6efe302471 | \n", "ID_3839240e5d | \n", "ID_2fe1c5668f | \n", "ID_c150ebbdbc | \n", "ID_22d6dfed97 | \n", "ID_f69325ded8 | \n", "ID_d56043f46d | \n", "ID_e21f790abb | \n", "ID_874ba6cb8c | \n", "... | \n", "ID_cae12af5f5 | \n", "ID_45066b940f | \n", "ID_db90430b69 | \n", "ID_983b1918c0 | \n", "ID_79b3575178 | \n", "ID_443e596744 | \n", "ID_78a079be31 | \n", "ID_9ae9e18e3f | \n", "ID_cd5661831b | \n", "ID_bb8d7f36b6 | \n", "
StudyID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | ... | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
StudyInstanceUID | \n", "ID_341bf815cd | \n", "ID_8e108d767c | \n", "ID_95184e7272 | \n", "ID_0b499345fa | \n", "ID_4be47d5353 | \n", "ID_10f12f8722 | \n", "ID_66a7835a03 | \n", "ID_4fb3cbd983 | \n", "ID_74e913165e | \n", "ID_4598fe5069 | \n", "... | \n", "ID_51270c92ce | \n", "ID_1b4156b6a4 | \n", "ID_12f15437fd | \n", "ID_b2e6ee660a | \n", "ID_ea15b88236 | \n", "ID_7b72f23129 | \n", "ID_acc44cec79 | \n", "ID_f41f22fedd | \n", "ID_f986b69079 | \n", "ID_384ec0d10a | \n", "
WindowCenter | \n", "30 | \n", "35 | \n", "36 | \n", "36 | \n", "30 | \n", "36 | \n", "30 | \n", "30 | \n", "40 | \n", "50 | \n", "... | \n", "35 | \n", "30 | \n", "36 | \n", "36 | \n", "36 | \n", "30 | \n", "30 | \n", "30 | \n", "36 | \n", "30 | \n", "
WindowWidth | \n", "80 | \n", "135 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "100 | \n", "... | \n", "135 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "80 | \n", "
29 rows × 752803 columns
\n", "\n", " | BitsAllocated | \n", "BitsStored | \n", "Columns | \n", "Dataset | \n", "HighBit | \n", "ImageOrientationPatient_0 | \n", "ImageOrientationPatient_1 | \n", "ImageOrientationPatient_2 | \n", "ImageOrientationPatient_3 | \n", "ImageOrientationPatient_4 | \n", "... | \n", "StudyID | \n", "StudyInstanceUID | \n", "WindowCenter | \n", "WindowWidth | \n", "any | \n", "epidural | \n", "intraparenchymal | \n", "intraventricular | \n", "subarachnoid | \n", "subdural | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Image | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
ID_24250ffbc | \n", "16 | \n", "12 | \n", "512 | \n", "train | \n", "11 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.920505 | \n", "... | \n", "\n", " | ID_6222a3935b | \n", "40.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_6e8c8d650 | \n", "16 | \n", "12 | \n", "512 | \n", "train | \n", "11 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.920505 | \n", "... | \n", "\n", " | ID_6222a3935b | \n", "40.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_ac042708d | \n", "16 | \n", "12 | \n", "512 | \n", "train | \n", "11 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.920505 | \n", "... | \n", "\n", " | ID_6222a3935b | \n", "40.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_d1e2a17a9 | \n", "16 | \n", "12 | \n", "512 | \n", "train | \n", "11 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_a5fb903898 | \n", "40.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_e1a1b45a5 | \n", "16 | \n", "12 | \n", "512 | \n", "train | \n", "11 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.920505 | \n", "... | \n", "\n", " | ID_6222a3935b | \n", "40.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
5 rows × 36 columns
\n", "\n", " | BitsAllocated | \n", "BitsStored | \n", "Columns | \n", "Dataset | \n", "HighBit | \n", "ImageOrientationPatient_0 | \n", "ImageOrientationPatient_1 | \n", "ImageOrientationPatient_2 | \n", "ImageOrientationPatient_3 | \n", "ImageOrientationPatient_4 | \n", "... | \n", "StudyID | \n", "StudyInstanceUID | \n", "WindowCenter | \n", "WindowWidth | \n", "any | \n", "epidural | \n", "intraparenchymal | \n", "intraventricular | \n", "subarachnoid | \n", "subdural | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Image | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
ID_865122213 | \n", "16 | \n", "16 | \n", "512 | \n", "train | \n", "15 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_13eda5126c | \n", "30.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_51a161001 | \n", "16 | \n", "16 | \n", "512 | \n", "train | \n", "15 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_13eda5126c | \n", "30.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_9f9e9b705 | \n", "16 | \n", "16 | \n", "512 | \n", "train | \n", "15 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_13eda5126c | \n", "30.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_d985de5b7 | \n", "16 | \n", "16 | \n", "512 | \n", "train | \n", "15 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_13eda5126c | \n", "30.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
ID_2837d7a95 | \n", "16 | \n", "16 | \n", "512 | \n", "train | \n", "15 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.927184 | \n", "... | \n", "\n", " | ID_13eda5126c | \n", "30.0 | \n", "80.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
5 rows × 36 columns
\n", "