{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "# All imports live in this one cell: de-duplicated, standard library first,\n",
    "# then third-party packages.\n",
    "import datetime\n",
    "import os\n",
    "import pickle\n",
    "import random\n",
    "import sys\n",
    "import zipfile\n",
    "# NOTE: the name `glob` is bound to the function glob.glob, not to the module.\n",
    "from glob import glob\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import cv2\n",
    "import pydicom\n",
    "from pydicom.filebase import DicomBytesIO\n",
    "from tqdm import tqdm\n",
    "from joblib import delayed, Parallel\n",
    "from PIL import Image\n",
    "import click\n",
    "from scipy import ndimage\n",
    "from skimage import exposure\n",
    "\n",
    "import keras\n",
    "import tensorflow as tf\n",
    "from keras.models import Model, Sequential, model_from_json\n",
    "from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, GlobalAveragePooling2D\n",
    "from keras.applications.inception_v3 import InceptionV3\n",
    "from keras.preprocessing import image\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.image as mpimg\n",
    "import albumentations as A\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset locations. Every path is derived from base_url, so pointing the\n",
    "# notebook at another copy of the dataset means editing a single line.\n",
    "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'\n",
    "os.listdir(base_url)  # raises right away if the dataset directory is missing\n",
    "\n",
    "TRAIN_DIR = os.path.join(base_url, 'stage_2_train/')\n",
    "TEST_DIR = os.path.join(base_url, 'stage_2_test/')\n",
    "train_url = os.path.join(base_url, 'png/train/adjacent-brain-cropped/')\n",
    "dcm_url = TRAIN_DIR  # same directory as TRAIN_DIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Diagnosisanyepiduralintraparenchymalintraventricularsubarachnoidsubdural
ImageID
ID_000012eaf000000
ID_000039fa0000000
ID_00005679d000000
ID_00008ce3c000000
ID_0000950d7000000
\n", "
" ],
      "text/plain": [
       "Diagnosis any epidural intraparenchymal intraventricular subarachnoid \\\n",
       "ImageID \n",
       "ID_000012eaf 0 0 0 0 0 \n",
       "ID_000039fa0 0 0 0 0 0 \n",
       "ID_00005679d 0 0 0 0 0 \n",
       "ID_00008ce3c 0 0 0 0 0 \n",
       "ID_0000950d7 0 0 0 0 0 \n",
       "\n",
       "Diagnosis subdural \n",
       "ImageID \n",
       "ID_000012eaf 0 \n",
       "ID_000039fa0 0 \n",
       "ID_00005679d 0 \n",
       "ID_00008ce3c 0 \n",
       "ID_0000950d7 0 "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Long-format labels: one row per (image, subtype) pair. The first 12 characters\n",
    "# of ID are the image id; everything from position 13 on is the subtype name.\n",
    "labels_path = os.path.join(base_url, 'stage_2_train.csv')\n",
    "train_df = (\n",
    "    pd.read_csv(labels_path)\n",
    "    .drop_duplicates()\n",
    "    .assign(\n",
    "        ImageID=lambda d: d['ID'].str.slice(stop=12),\n",
    "        Diagnosis=lambda d: d['ID'].str.slice(start=13),\n",
    "    )\n",
    ")\n",
    "\n",
    "# Wide format: one row per image, one label column per subtype.\n",
    "train_labels = train_df.pivot(index='ImageID', columns='Diagnosis', values='Label')\n",
    "train_labels.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Image ids whose rows are removed from the combined metadata.\n",
    "BAD_IMAGE_IDS = ['ID_6431af929']\n",
    "\n",
    "# Tag each frame with its origin; only the training frame gets label columns.\n",
    "train_metadata = (\n",
    "    pd.read_parquet(os.path.join(base_url, 'train_metadata.parquet.gzip'))\n",
    "    .assign(Dataset='train')\n",
    "    .join(train_labels)\n",
    ")\n",
    "test_metadata = (\n",
    "    pd.read_parquet(os.path.join(base_url, 'test_metadata.parquet.gzip'))\n",
    "    .assign(Dataset='test')\n",
    ")\n",
    "\n",
    "# One frame for train + test (columns sorted alphabetically by sort=True),\n",
    "# ordered by patient and then by the third ImagePositionPatient component.\n",
    "# Built as a single chain so re-running the cell always starts from the raw files.\n",
    "metadata = (\n",
    "    pd.concat([train_metadata, test_metadata], sort=True)\n",
    "    .sort_values(['PatientID', 'ImagePositionPatient_2'])\n",
    "    .drop(index=BAD_IMAGE_IDS)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BitsAllocatedBitsStoredColumnsDatasetHighBitImageOrientationPatient_0ImageOrientationPatient_1ImageOrientationPatient_2ImageOrientationPatient_3ImageOrientationPatient_4...StudyIDStudyInstanceUIDWindowCenterWindowWidthanyepiduralintraparenchymalintraventricularsubarachnoidsubdural
Image
ID_45785016b1616512train151.00.00.00.00.993572...ID_66929e09d430.080.00.00.00.00.00.00.0
ID_37f32aed21616512train151.00.00.00.00.993572...ID_66929e09d430.080.00.00.00.00.00.00.0
ID_1b9de29221616512train151.00.00.00.00.993572...ID_66929e09d430.080.00.00.00.00.00.00.0
ID_d61a6a7b91616512train151.00.00.00.00.993572...ID_66929e09d430.080.00.00.00.00.00.00.0
ID_406c821121616512train151.00.00.00.00.993572...ID_66929e09d430.080.00.00.00.00.00.00.0
\n", "

5 rows × 36 columns

\n", "
" ], "text/plain": [ " BitsAllocated BitsStored Columns Dataset HighBit \\\n", "Image \n", "ID_45785016b 16 16 512 train 15 \n", "ID_37f32aed2 16 16 512 train 15 \n", "ID_1b9de2922 16 16 512 train 15 \n", "ID_d61a6a7b9 16 16 512 train 15 \n", "ID_406c82112 16 16 512 train 15 \n", "\n", " ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n", "Image \n", "ID_45785016b 1.0 0.0 \n", "ID_37f32aed2 1.0 0.0 \n", "ID_1b9de2922 1.0 0.0 \n", "ID_d61a6a7b9 1.0 0.0 \n", "ID_406c82112 1.0 0.0 \n", "\n", " ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n", "Image \n", "ID_45785016b 0.0 0.0 \n", "ID_37f32aed2 0.0 0.0 \n", "ID_1b9de2922 0.0 0.0 \n", "ID_d61a6a7b9 0.0 0.0 \n", "ID_406c82112 0.0 0.0 \n", "\n", " ImageOrientationPatient_4 ... StudyID StudyInstanceUID \\\n", "Image ... \n", "ID_45785016b 0.993572 ... ID_66929e09d4 \n", "ID_37f32aed2 0.993572 ... ID_66929e09d4 \n", "ID_1b9de2922 0.993572 ... ID_66929e09d4 \n", "ID_d61a6a7b9 0.993572 ... ID_66929e09d4 \n", "ID_406c82112 0.993572 ... 
ID_66929e09d4 \n",
       "\n",
       " WindowCenter WindowWidth any epidural intraparenchymal \\\n",
       "Image \n",
       "ID_45785016b 30.0 80.0 0.0 0.0 0.0 \n",
       "ID_37f32aed2 30.0 80.0 0.0 0.0 0.0 \n",
       "ID_1b9de2922 30.0 80.0 0.0 0.0 0.0 \n",
       "ID_d61a6a7b9 30.0 80.0 0.0 0.0 0.0 \n",
       "ID_406c82112 30.0 80.0 0.0 0.0 0.0 \n",
       "\n",
       " intraventricular subarachnoid subdural \n",
       "Image \n",
       "ID_45785016b 0.0 0.0 0.0 \n",
       "ID_37f32aed2 0.0 0.0 0.0 \n",
       "ID_1b9de2922 0.0 0.0 0.0 \n",
       "ID_d61a6a7b9 0.0 0.0 0.0 \n",
       "ID_406c82112 0.0 0.0 0.0 \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the six label columns.\n",
    "LABEL_COLS = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']\n",
    "\n",
    "# Test rows: drop the label columns by name (not by position, which would depend\n",
    "# on the column order of `metadata`) together with the Dataset marker.\n",
    "test_df = metadata[metadata['Dataset'] == 'test'].drop(columns=LABEL_COLS + ['Dataset'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training rows keep their label columns; only the Dataset marker is dropped.\n",
    "train_df = metadata[metadata['Dataset'] == 'train'].drop(columns=['Dataset'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(752802, 6)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_y = train_df[LABEL_COLS]\n",
    "train_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(752802, 35)"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18938"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.PatientID.nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dividing the dataset into K folds by PatientID\n",
    "\n",
    "The unique patient IDs are split into folds, so all images of one patient stay in the same fold."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-built table `df` from the working directory.\n",
    "# NOTE(review): df.pkl is not created anywhere in this notebook - record where it\n",
    "# comes from. pickle.load can execute arbitrary code, so only load a file you\n",
    "# produced yourself.\n",
    "with open('df.pkl', 'rb') as f:\n",
    "    df = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDLabelfilepathPatientIDStudyIDSeriesID
Subtypeanyepiduralintraparenchymalintraventricularsubarachnoidsubdural
752798ID_ffff82e46000000/home/ubuntu/kaggle/rsna-intracranial-hemorrha...ID_a85c9d08ID_eca4bf46acID_3ef9b97743
752799ID_ffff922b9100100/home/ubuntu/kaggle/rsna-intracranial-hemorrha...ID_5964c5e5ID_b47ca0ad05ID_6d2a9b2810
752800ID_ffffb670a100010/home/ubuntu/kaggle/rsna-intracranial-hemorrha...ID_4f7414e4ID_ffb2e70ba3ID_87b33b4a10
752801ID_ffffcbff8000000/home/ubuntu/kaggle/rsna-intracranial-hemorrha...ID_a5382712ID_ff0ba45814ID_bd174db91c
752802ID_fffff9393000000/home/ubuntu/kaggle/rsna-intracranial-hemorrha...ID_41db05dfID_7c887292d5ID_dff8d8efd5
\n", "
" ], "text/plain": [ " ID Label \\\n", "Subtype any epidural intraparenchymal intraventricular \n", "752798 ID_ffff82e46 0 0 0 0 \n", "752799 ID_ffff922b9 1 0 0 1 \n", "752800 ID_ffffb670a 1 0 0 0 \n", "752801 ID_ffffcbff8 0 0 0 0 \n", "752802 ID_fffff9393 0 0 0 0 \n", "\n", " \\\n", "Subtype subarachnoid subdural \n", "752798 0 0 \n", "752799 0 0 \n", "752800 1 0 \n", "752801 0 0 \n", "752802 0 0 \n", "\n", " filepath PatientID \\\n", "Subtype \n", "752798 /home/ubuntu/kaggle/rsna-intracranial-hemorrha... ID_a85c9d08 \n", "752799 /home/ubuntu/kaggle/rsna-intracranial-hemorrha... ID_5964c5e5 \n", "752800 /home/ubuntu/kaggle/rsna-intracranial-hemorrha... ID_4f7414e4 \n", "752801 /home/ubuntu/kaggle/rsna-intracranial-hemorrha... ID_a5382712 \n", "752802 /home/ubuntu/kaggle/rsna-intracranial-hemorrha... ID_41db05df \n", "\n", " StudyID SeriesID \n", "Subtype \n", "752798 ID_eca4bf46ac ID_3ef9b97743 \n", "752799 ID_b47ca0ad05 ID_6d2a9b2810 \n", "752800 ID_ffb2e70ba3 ID_87b33b4a10 \n", "752801 ID_ff0ba45814 ID_bd174db91c \n", "752802 ID_7c887292d5 ID_dff8d8efd5 " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Distribution of the number of CT images per PatientID" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAATdklEQVR4nO3de4xcZ3nH8e9DDLmZxg4JK9d26yAsmhA3t1VimgqtE5o4CcJRlUhGETjIrf8JaqhSgdOKhksiggoEkADVwinhIpY0hMZygNRyvG2plJvJxXZMZEOs4CSNoXZMnUCE6dM/5l2Ybma9F+/OHPv9fqTRznnPe+Y858zMb86+c2YmMhNJUh1e0+sCJEndY+hLUkUMfUmqiKEvSRUx9CWpIjN6XcChnHLKKblgwYJJL//SSy9x4oknTl1BU6SpdUFza2tqXdDc2ppaFzS3tqbWBROrbfPmzT/PzFM7zszMxl7OO++8PBybNm06rOWnS1PrymxubU2tK7O5tTW1rszm1tbUujInVhvwSI6Sqw7vSFJFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRRr9NQy1WLD63o7tu269osuVSDraeaQvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqyLhDPyKOiYhHI2J9mT4tIh6MiB0R8a2IeF1pP7ZM7yzzF7Tdxo2l/amIuHSqN0aSdGgTOdK/HtjeNv1J4LbMXAjsA1aW9pXAvsx8M3Bb6UdEnAEsB94KLAW+GBHHHF75kqSJGFfoR8Q84Argy2U6gIuAu0qXO4Ary/VlZZoy/+LSfxkwmJmvZObTwE7g/KnYCEnS+ERmjt0p4i7gE8Drgb8BrgUeKEfzRMR84HuZeWZEbAWWZubuMu/HwAXAR8oyXy/ta8syd41Y1ypgFUBfX995g4ODk964AwcOMHPmzEkvP11G1rXl2f0d+y2ae1K3SvqtI2WfNUlTa2tqXdDc2ppaF0ystiVLlmzOzP5O82aMtXBEvBPYk5mbI2JguLlD1xxj3qGW+V1D5hpgDUB/f38ODAyM7DJuQ0NDHM7y02VkXdeuvrdjv13XDHRsn05Hyj5rkqbW1tS6oLm1NbUumLraxgx94ELgXRFxOXAc8HvAZ4FZETEjMw8C84DnSv/dwHxgd0TMAE4C9ra1D2tfRpLUBWOO6WfmjZk5LzMX0Hoj9v7MvAbYBFxVuq0A7inX15Vpyvz7szWGtA5YXs7uOQ1YCDw0ZVsiSRrTeI70R/MhYDAibgYeBdaW9rXA1yJiJ60j/OUAmbktIu4EngQOAtdl5m8OY/2SpAmaUOhn5hAwVK7/hA5n32Tmr4CrR1n+FuCWiRYpSZoafiJXkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUZM/Qj4riIeCgiHo+IbRHx0dJ+WkQ8GBE7IuJbEfG60n5smd5Z5i9ou60bS/tTEXHpdG2UJKmz8RzpvwJclJlnAWcDSyNiMfBJ4LbMXAjsA1aW/iuBfZn5ZuC20o+IOANYDrwVWAp8MSKOmcqNkSQd2pihny0HyuRryyWBi4C7SvsdwJXl+rIyTZl/cUREaR/MzFcy82lgJ3D+lGyFJGlcIjPH7tQ6It8MvBn4AvAPwAPlaJ6ImA98LzPPjIitwNLM3F3m/Ri4APhIWebrpX1
tWeauEetaBawC6OvrO29wcHDSG3fgwAFmzpw56eWny8i6tjy7v2O/RXNP6lZJv3Wk7LMmaWptTa0LmltbU+uCidW2ZMmSzZnZ32nejPHcQGb+Bjg7ImYB3wFO79St/I1R5o3WPnJda4A1AP39/TkwMDCeEjsaGhricJafLiPrunb1vR377bpmoGP7dDpS9lmTNLW2ptYFza2tqXXB1NU2obN3MvNFYAhYDMyKiOEXjXnAc+X6bmA+QJl/ErC3vb3DMpKkLhjP2TunliN8IuJ44B3AdmATcFXptgK4p1xfV6Yp8+/P1hjSOmB5ObvnNGAh8NBUbYgkaWzjGd6ZA9xRxvVfA9yZmesj4klgMCJuBh4F1pb+a4GvRcROWkf4ywEyc1tE3Ak8CRwErivDRpKkLhkz9DPzCeCcDu0/ocPZN5n5K+DqUW7rFuCWiZcpSZoKfiJXkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFZnR6wI0dRasvnfUebtuvaKLlUhqKkN/GowWvk0M3pG13rDoINeuvreRtUo6fA7vSFJFDH1JqoihL0kVMfQlqSKGviRVZMzQj4j5EbEpIrZHxLaIuL60nxwRGyJiR/k7u7RHRHw+InZGxBMRcW7bba0o/XdExIrp2yxJUifjOdI/CNyQmacDi4HrIuIMYDWwMTMXAhvLNMBlwMJyWQV8CVovEsBNwAXA+cBNwy8UkqTuGDP0M/P5zPxhuf4/wHZgLrAMuKN0uwO4slxfBnw1Wx4AZkXEHOBSYENm7s3MfcAGYOmUbo0k6ZAmNKYfEQuAc4AHgb7MfB5aLwzAG0u3ucBP2xbbXdpGa5ckdUlk5vg6RswE/g24JTPvjogXM3NW2/x9mTk7Iu4FPpGZPyjtG4EPAhcBx2bmzaX9w8DLmfnpEetZRWtYiL6+vvMGBwcnvXEHDhxg5syZk15+srY8u79j+6K5JwGvrmus/oe73kPd1shl+o6HF3458XVPt17dl+PR1NqaWhc0t7am1gUTq23JkiWbM7O/07xxfQ1DRLwW+Dbwjcy8uzS/EBFzMvP5Mnyzp7TvBua3LT4PeK60D4xoHxq5rsxcA6wB6O/vz4GBgZFdxm1oaIjDWX6yrh3taxiuGQBeXddY/Q93vYe6rZHL3LDoIJ/eMmPC655uvbovx6OptTW1LmhubU2tC6autvGcvRPAWmB7Zn6mbdY6YPgMnBXAPW3t7y1n8SwG9pfhn/uASyJidnkD95LSJknqkvEc6V8IvAfYEhGPlba/BW4F7oyIlcAzwNVl3neBy4GdwMvA+wAyc29EfBx4uPT7WGbunZKtkCSNy5ihX8bmY5TZF3fon8B1o9zW7cDtEylQkjR1/ESuJFXE0Jekihj6klQRQ1+SKuLPJTbYkfSzi5KODB7pS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkX85awuGv4lrBsWHeTaUX4VS5Kmk0f6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFRkz9CPi9ojYExFb29pOjogNEbGj/J1d2iMiPh8ROyPiiYg4t22ZFaX/johYMT2bI0k6lPEc6X8FWDqibTWwMTMXAhvLNMBlwMJyWQV8CVovEsBNwAXA+cBNwy8UkqTuGTP0M/Pfgb0jmpcBd5TrdwBXtrV/NVseAGZFxBzgUmBDZu7NzH3ABl79QiJJmmaRmWN3ilgArM/MM8v0i5k5q23+vsycHRHrgVsz8welfSPwIWAAOC4zby7
tHwZ+mZmf6rCuVbT+S6Cvr++8wcHBSW/cgQMHmDlz5qSXn6wtz+4/5Py+4+GFX07+9hfNPWnC6x3vMsO1jda/V3p1X45HU2tral3Q3NqaWhdMrLYlS5Zszsz+TvOm+pezokNbHqL91Y2Za4A1AP39/TkwMDDpYoaGhjic5SdrrF/FumHRQT69ZfK7ftc1AxNe73iXGa5ttP690qv7cjyaWltT64Lm1tbUumDqapvs2TsvlGEbyt89pX03ML+t3zzguUO0S5K6aLKhvw4YPgNnBXBPW/t7y1k8i4H9mfk8cB9wSUTMLm/gXlLaJEldNOYYQ0R8k9aY/CkRsZvWWTi3AndGxErgGeDq0v27wOXATuBl4H0Ambk3Ij4OPFz6fSwzR745LEmaZmOGfma+e5RZF3fom8B1o9zO7cDtE6pOkjSl/ESuJFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRab6l7PUBQvG+GUuSRqNR/qSVBFDX5Iq4vBOJaZ7SGi029916xXTul5JE+ORviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcRP5KojP2ErHZ080pekihj6klQRh3fUEw4fSb1h6Etd4gudmsDhHUmqiKEvSRVxeEeapOHhmhsWHeTatqGb6R6umegwkcNKamfot/HJMfX8xa7Jm+591w2jbcNXlp7Y5Uo0zNBXVQ4VpEf6C8XIbRv5H4gEPQj9iFgKfA44BvhyZt7a7RokTc7R/J9VLboa+hFxDPAF4M+A3cDDEbEuM5/sZh0T5QP9d3o1XHMkDQdMdB81cRinadswVc/BXj6Xm5Ij3T7SPx/YmZk/AYiIQWAZ0OjQH00Tn6xHqy3P7p/QUMVU3jdH6/3cxO3qVU2jvSl/KFMV1t1+MYjMnJYb7riyiKuApZn5F2X6PcAFmfn+tj6rgFVl8i3AU4exylOAnx/G8tOlqXVBc2tral3Q3NqaWhc0t7am1gUTq+0PM/PUTjO6faQfHdr+36tOZq4B1kzJyiIeycz+qbitqdTUuqC5tTW1LmhubU2tC5pbW1PrgqmrrdsfztoNzG+bngc81+UaJKla3Q79h4GFEXFaRLwOWA6s63INklStrg7vZObBiHg/cB+tUzZvz8xt07jKKRkmmgZNrQuaW1tT64Lm1tbUuqC5tTW1LpiqYe9uvpErSeotv3BNkipi6EtSRY6K0I+I+RGxKSK2R8S2iLi+tJ8cERsiYkf5O7sHtR0XEQ9FxOOlto+W9tMi4sFS27fKG9tdFxHHRMSjEbG+YXXtiogtEfFYRDxS2ppwf86KiLsi4kfl8fa2htT1lrKvhi+/iIgPNKS2vy6P/a0R8c3ynGjK4+z6Ute2iPhAaev6PouI2yNiT0RsbWvrWEe0fD4idkbEExFx7kTWdVSEPnAQuCEzTwcWA9dFxBnAamBjZi4ENpbpbnsFuCgzzwLOBpZGxGLgk8BtpbZ9wMoe1AZwPbC9bbopdQEsycyz285NbsL9+Tng+5n5R8BZtPZdz+vKzKfKvjobOA94GfhOr2uLiLnAXwH9mXkmrRM4ltOAx1lEnAn8Ja1vCjgLeGdELKQ3++wrwNIRbaPVcRmwsFxWAV+a0Joy86i7APfQ+n6fp4A5pW0O8FSP6zoB+CFwAa1P1s0o7W8D7utBPfPKg+kiYD2tD8/1vK6y7l3AKSPaenp/Ar8HPE05AaIpdXWo8xLgP5tQGzAX+ClwMq2zBdcDlzbhcQZcTetLH4enPwx8sFf7DFgAbB3rcQX8I/DuTv3GczlajvR/KyIWAOcADwJ9mfk8QPn7xh7VdExEPAbsATYAPwZezMyDpctuWk+ObvssrQf5/5bpNzSkLmh9UvtfI2Jz+WoO6P39+SbgZ8A/lSGxL0fEiQ2oa6TlwDfL9Z7WlpnPAp8CngGeB/YDm2nG42wr8PaIeENEnAB
cTuvDo025P0erY/iFdNiE9t9RFfoRMRP4NvCBzPxFr+sZlpm/yda/3fNo/St5eqdu3awpIt4J7MnMze3NHbr26pzeCzPzXFr/yl4XEW/vUR3tZgDnAl/KzHOAl+jNENOoytj4u4B/7nUtAGUcehlwGvD7wIm07tORuv44y8zttIaZNgDfBx6nNVTcdIf1PD1qQj8iXksr8L+RmXeX5hciYk6ZP4fWkXbPZOaLwBCt9x1mRcTwh+N68XUUFwLviohdwCCtIZ7PNqAuADLzufJ3D62x6fPp/f25G9idmQ+W6btovQj0uq52lwE/zMwXynSva3sH8HRm/iwzfw3cDfwJzXmcrc3MczPz7cBeYAe932fDRqvjsL7O5qgI/YgIYC2wPTM/0zZrHbCiXF9Ba6y/27WdGhGzyvXjaT0JtgObgKt6VVtm3piZ8zJzAa3hgPsz85pe1wUQESdGxOuHr9Mao95Kj+/PzPwv4KcR8ZbSdDGtrwXv+eOszbv53dAO9L62Z4DFEXFCeZ4O77OeP84AIuKN5e8fAH9Oa9/1ep8NG62OdcB7y1k8i4H9w8NA49LtN0+m6Q2QP6X1780TwGPlcjmtMeqNtF69NwIn96C2PwYeLbVtBf6+tL8JeAjYSetf8WN7uP8GgPVNqavU8Hi5bAP+rrQ34f48G3ik3J//AsxuQl2lthOA/wZOamvreW3AR4Eflcf/14Bjm/A4K7X9B60XoceBi3u1z2i92DwP/JrWkfzK0eqgNbzzBVrvDW6hdWbUuNfl1zBIUkWOiuEdSdL4GPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIv8HcUE1AMKbz9IAAAAASUVORK5CYII=\n", "text/plain": [ "
" ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Rows of df per patient; only patients with fewer than 100 rows are plotted.\n",
    "images_per_patient = df['PatientID'].value_counts()\n",
    "images_per_patient.loc[images_per_patient < 100].hist(bins=50);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18938\n"
     ]
    }
   ],
   "source": [
    "# Distinct patient ids present in df, and how many there are.\n",
    "patient_id_train = {pid for pid in df['PatientID'].unique()}\n",
    "print(len(patient_id_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageSOPInstanceUIDPatientIDanyepiduralintraparenchymalintraventricularsubarachnoidsubdural
0ID_45785016bID_45785016bID_0002cd410.00.00.00.00.00.0
1ID_37f32aed2ID_37f32aed2ID_0002cd410.00.00.00.00.00.0
2ID_1b9de2922ID_1b9de2922ID_0002cd410.00.00.00.00.00.0
3ID_d61a6a7b9ID_d61a6a7b9ID_0002cd410.00.00.00.00.00.0
4ID_406c82112ID_406c82112ID_0002cd410.00.00.00.00.00.0
\n", "
" ],
      "text/plain": [
       " Image SOPInstanceUID PatientID any epidural intraparenchymal \\\n",
       "0 ID_45785016b ID_45785016b ID_0002cd41 0.0 0.0 0.0 \n",
       "1 ID_37f32aed2 ID_37f32aed2 ID_0002cd41 0.0 0.0 0.0 \n",
       "2 ID_1b9de2922 ID_1b9de2922 ID_0002cd41 0.0 0.0 0.0 \n",
       "3 ID_d61a6a7b9 ID_d61a6a7b9 ID_0002cd41 0.0 0.0 0.0 \n",
       "4 ID_406c82112 ID_406c82112 ID_0002cd41 0.0 0.0 0.0 \n",
       "\n",
       " intraventricular subarachnoid subdural \n",
       "0 0.0 0.0 0.0 \n",
       "1 0.0 0.0 0.0 \n",
       "2 0.0 0.0 0.0 \n",
       "3 0.0 0.0 0.0 \n",
       "4 0.0 0.0 0.0 "
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Slim training table: identifiers plus the six label columns. reset_index()\n",
    "# turns the index into a regular column and returns a new frame, so nothing\n",
    "# derived from train_df is modified in place.\n",
    "train = train_df[\n",
    "    ['SOPInstanceUID', 'PatientID',\n",
    "     'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']\n",
    "].reset_index()\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(752802, 9)\n",
      "(752802, 9)\n"
     ]
    }
   ],
   "source": [
    "# Remove excluded images; the two printed shapes show how many rows were dropped.\n",
    "IGNORE_IDS = ['ID_6431af929']\n",
    "print(train.shape)\n",
    "train = train[~train['SOPInstanceUID'].isin(IGNORE_IDS)]\n",
    "print(train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(18938,)\n"
     ]
    }
   ],
   "source": [
    "patient_id = train[\"PatientID\"].unique() \n",
    "print(patient_id.shape) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(151865, 10)\n",
      "(148063, 10)\n",
      "(151306, 10)\n",
      "(150597, 10)\n",
      "(150971, 10)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "\n",
    "seed = 2020\n",
    "n_splits = 5\n",
    "\n",
    "# The folds are drawn over the unique patient ids, not over individual rows, so\n",
    "# every row of a given patient lands on the same side of each train/valid split.\n",
    "kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)\n",
    "\n",
    "for fold, (train_index, valid_index) in enumerate(kf.split(patient_id)):\n",
    "    df_train = train[train['PatientID'].isin(patient_id[train_index])]\n",
    "    df_valid = train[train['PatientID'].isin(patient_id[valid_index])]\n",
    "\n",
    "    # Each row must end up in exactly one of the two sets.\n",
    "    assert len(df_train) + len(df_valid) == len(train)\n",
    "\n",
    "    # index=False: the row index is not written to the CSV files.\n",
    "    df_train.to_csv('train_{}.csv'.format(fold), index=False)\n",
    "    df_valid.to_csv('valid_{}.csv'.format(fold), index=False)\n",
    "\n",
    "    print(df_valid.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageSOPInstanceUIDPatientIDanyepiduralintraparenchymalintraventricularsubarachnoidsubduralset
36ID_138d275c8ID_138d275c8ID_00054f3f0.00.00.00.00.00.00
37ID_447fa09d9ID_447fa09d9ID_00054f3f0.00.00.00.00.00.00
38ID_0f1298f68ID_0f1298f68ID_00054f3f0.00.00.00.00.00.00
39ID_c24918b79ID_c24918b79ID_00054f3f0.00.00.00.00.00.00
40ID_c0005a263ID_c0005a263ID_00054f3f0.00.00.00.00.00.00
.................................
752797ID_72e823e2cID_72e823e2cID_fffc2bd60.00.00.00.00.00.00
752798ID_4184c4f03ID_4184c4f03ID_fffc2bd60.00.00.00.00.00.00
752799ID_a8aca4f40ID_a8aca4f40ID_fffc2bd60.00.00.00.00.00.00
752800ID_716b72762ID_716b72762ID_fffc2bd60.00.00.00.00.00.00
752801ID_deb85caf0ID_deb85caf0ID_fffc2bd60.00.00.00.00.00.00
\n", "

601831 rows × 10 columns

\n", "
" ], "text/plain": [ " Image SOPInstanceUID PatientID any epidural \\\n", "36 ID_138d275c8 ID_138d275c8 ID_00054f3f 0.0 0.0 \n", "37 ID_447fa09d9 ID_447fa09d9 ID_00054f3f 0.0 0.0 \n", "38 ID_0f1298f68 ID_0f1298f68 ID_00054f3f 0.0 0.0 \n", "39 ID_c24918b79 ID_c24918b79 ID_00054f3f 0.0 0.0 \n", "40 ID_c0005a263 ID_c0005a263 ID_00054f3f 0.0 0.0 \n", "... ... ... ... ... ... \n", "752797 ID_72e823e2c ID_72e823e2c ID_fffc2bd6 0.0 0.0 \n", "752798 ID_4184c4f03 ID_4184c4f03 ID_fffc2bd6 0.0 0.0 \n", "752799 ID_a8aca4f40 ID_a8aca4f40 ID_fffc2bd6 0.0 0.0 \n", "752800 ID_716b72762 ID_716b72762 ID_fffc2bd6 0.0 0.0 \n", "752801 ID_deb85caf0 ID_deb85caf0 ID_fffc2bd6 0.0 0.0 \n", "\n", " intraparenchymal intraventricular subarachnoid subdural set \n", "36 0.0 0.0 0.0 0.0 0 \n", "37 0.0 0.0 0.0 0.0 0 \n", "38 0.0 0.0 0.0 0.0 0 \n", "39 0.0 0.0 0.0 0.0 0 \n", "40 0.0 0.0 0.0 0.0 0 \n", "... ... ... ... ... ... \n", "752797 0.0 0.0 0.0 0.0 0 \n", "752798 0.0 0.0 0.0 0.0 0 \n", "752799 0.0 0.0 0.0 0.0 0 \n", "752800 0.0 0.0 0.0 0.0 0 \n", "752801 0.0 0.0 0.0 0.0 0 \n", "\n", "[601831 rows x 10 columns]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }