Switch to side-by-side view

--- a
+++ b/Notebook/Week 3/k-fold.ipynb
@@ -0,0 +1,1272 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import pickle\n",
+    "import random\n",
+    "import glob\n",
+    "from glob import glob\n",
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import cv2\n",
+    "import pydicom\n",
+    "from tqdm import tqdm\n",
+    "from joblib import delayed, Parallel\n",
+    "import zipfile\n",
+    "from pydicom.filebase import DicomBytesIO\n",
+    "import sys\n",
+    "from PIL import Image\n",
+    "import cv2\n",
+    "import pickle\n",
+    "\n",
+    "\n",
+    "import click\n",
+    "\n",
+    "\n",
+    "\n",
+    "from joblib import delayed, Parallel\n",
+    "import random\n",
+    "\n",
+    "\n",
+    "from scipy import ndimage\n",
+    "import pydicom\n",
+    "from skimage import exposure\n",
+    "\n",
+    "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'\n",
+    "TRAIN_DIR = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/stage_2_train/'\n",
+    "TEST_DIR = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/stage_2_test/'\n",
+    "os.listdir(base_url)\n",
+    "\n",
+    "import keras\n",
+    "from keras.models import model_from_json\n",
+    "import tensorflow as tf\n",
+    "from keras.models import Sequential, Model\n",
+    "from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, GlobalAveragePooling2D\n",
+    "from keras.applications.inception_v3 import InceptionV3\n",
+    "\n",
+    "# importing pyplot and image from matplotlib \n",
+    "import matplotlib.pyplot as plt \n",
+    "import matplotlib.image as mpimg \n",
+    "\n",
+    "\n",
+    "from keras.preprocessing import image\n",
+    "import albumentations as A\n",
+    "\n",
+    "\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dataset root and the two sub-directories used below. The sub-paths are derived\n",
+    "# from base_url so the root is spelled out once; all three keep a trailing slash.\n",
+    "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'\n",
+    "train_url = f'{base_url}png/train/adjacent-brain-cropped/'\n",
+    "dcm_url = f'{base_url}stage_2_train/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>Diagnosis</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ImageID</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>ID_000012eaf</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_000039fa0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_00005679d</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_00008ce3c</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_0000950d7</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Diagnosis     any  epidural  intraparenchymal  intraventricular  subarachnoid  \\\n",
+       "ImageID                                                                         \n",
+       "ID_000012eaf    0         0                 0                 0             0   \n",
+       "ID_000039fa0    0         0                 0                 0             0   \n",
+       "ID_00005679d    0         0                 0                 0             0   \n",
+       "ID_00008ce3c    0         0                 0                 0             0   \n",
+       "ID_0000950d7    0         0                 0                 0             0   \n",
+       "\n",
+       "Diagnosis     subdural  \n",
+       "ImageID                 \n",
+       "ID_000012eaf         0  \n",
+       "ID_000039fa0         0  \n",
+       "ID_00005679d         0  \n",
+       "ID_00008ce3c         0  \n",
+       "ID_0000950d7         0  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Long-format labels as read from the CSV, with exact duplicate rows removed.\n",
+    "train_df = pd.read_csv(f'{base_url}/stage_2_train.csv').drop_duplicates()\n",
+    "# Split each ID at a fixed offset: the first 12 characters become ImageID, and\n",
+    "# everything from index 13 on becomes Diagnosis (the character at index 12 is skipped).\n",
+    "train_df = train_df.assign(\n",
+    "    ImageID=train_df['ID'].str[:12],\n",
+    "    Diagnosis=train_df['ID'].str[13:],\n",
+    ")\n",
+    "# Wide format: one row per ImageID, one column per Diagnosis value.\n",
+    "train_labels = train_df.pivot(index='ImageID', columns='Diagnosis', values='Label')\n",
+    "train_labels.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-image metadata for both splits; the Dataset column records which split a row came from.\n",
+    "# The training rows additionally receive the label columns of train_labels (joined on the index).\n",
+    "train_metadata = (\n",
+    "    pd.read_parquet(f'{base_url}/train_metadata.parquet.gzip')\n",
+    "    .assign(Dataset='train')\n",
+    "    .join(train_labels)\n",
+    ")\n",
+    "test_metadata = pd.read_parquet(f'{base_url}/test_metadata.parquet.gzip').assign(Dataset='test')\n",
+    "\n",
+    "# Stack both splits (sort=True orders the columns alphabetically) and order the rows by\n",
+    "# ImagePositionPatient_2 within each PatientID. A single two-key sort is sufficient: it\n",
+    "# re-orders every row, so a separate sort on ImagePositionPatient_2 beforehand is not needed.\n",
+    "# NOTE(review): ID_6431af929 is dropped by index label with no explanation in this notebook --\n",
+    "# presumably a known-bad image; confirm and document the reason.\n",
+    "metadata = (\n",
+    "    pd.concat([train_metadata, test_metadata], sort=True)\n",
+    "    .sort_values(['PatientID', 'ImagePositionPatient_2'])\n",
+    "    .drop(['ID_6431af929'])\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>BitsAllocated</th>\n",
+       "      <th>BitsStored</th>\n",
+       "      <th>Columns</th>\n",
+       "      <th>Dataset</th>\n",
+       "      <th>HighBit</th>\n",
+       "      <th>ImageOrientationPatient_0</th>\n",
+       "      <th>ImageOrientationPatient_1</th>\n",
+       "      <th>ImageOrientationPatient_2</th>\n",
+       "      <th>ImageOrientationPatient_3</th>\n",
+       "      <th>ImageOrientationPatient_4</th>\n",
+       "      <th>...</th>\n",
+       "      <th>StudyID</th>\n",
+       "      <th>StudyInstanceUID</th>\n",
+       "      <th>WindowCenter</th>\n",
+       "      <th>WindowWidth</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Image</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>ID_45785016b</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>train</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.993572</td>\n",
+       "      <td>...</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_66929e09d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_37f32aed2</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>train</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.993572</td>\n",
+       "      <td>...</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_66929e09d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_1b9de2922</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>train</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.993572</td>\n",
+       "      <td>...</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_66929e09d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_d61a6a7b9</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>train</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.993572</td>\n",
+       "      <td>...</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_66929e09d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_406c82112</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>train</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.993572</td>\n",
+       "      <td>...</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_66929e09d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 36 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              BitsAllocated  BitsStored  Columns Dataset  HighBit  \\\n",
+       "Image                                                               \n",
+       "ID_45785016b             16          16      512   train       15   \n",
+       "ID_37f32aed2             16          16      512   train       15   \n",
+       "ID_1b9de2922             16          16      512   train       15   \n",
+       "ID_d61a6a7b9             16          16      512   train       15   \n",
+       "ID_406c82112             16          16      512   train       15   \n",
+       "\n",
+       "              ImageOrientationPatient_0  ImageOrientationPatient_1  \\\n",
+       "Image                                                                \n",
+       "ID_45785016b                        1.0                        0.0   \n",
+       "ID_37f32aed2                        1.0                        0.0   \n",
+       "ID_1b9de2922                        1.0                        0.0   \n",
+       "ID_d61a6a7b9                        1.0                        0.0   \n",
+       "ID_406c82112                        1.0                        0.0   \n",
+       "\n",
+       "              ImageOrientationPatient_2  ImageOrientationPatient_3  \\\n",
+       "Image                                                                \n",
+       "ID_45785016b                        0.0                        0.0   \n",
+       "ID_37f32aed2                        0.0                        0.0   \n",
+       "ID_1b9de2922                        0.0                        0.0   \n",
+       "ID_d61a6a7b9                        0.0                        0.0   \n",
+       "ID_406c82112                        0.0                        0.0   \n",
+       "\n",
+       "              ImageOrientationPatient_4  ...  StudyID  StudyInstanceUID  \\\n",
+       "Image                                    ...                              \n",
+       "ID_45785016b                   0.993572  ...              ID_66929e09d4   \n",
+       "ID_37f32aed2                   0.993572  ...              ID_66929e09d4   \n",
+       "ID_1b9de2922                   0.993572  ...              ID_66929e09d4   \n",
+       "ID_d61a6a7b9                   0.993572  ...              ID_66929e09d4   \n",
+       "ID_406c82112                   0.993572  ...              ID_66929e09d4   \n",
+       "\n",
+       "              WindowCenter  WindowWidth  any epidural intraparenchymal  \\\n",
+       "Image                                                                    \n",
+       "ID_45785016b          30.0         80.0  0.0      0.0              0.0   \n",
+       "ID_37f32aed2          30.0         80.0  0.0      0.0              0.0   \n",
+       "ID_1b9de2922          30.0         80.0  0.0      0.0              0.0   \n",
+       "ID_d61a6a7b9          30.0         80.0  0.0      0.0              0.0   \n",
+       "ID_406c82112          30.0         80.0  0.0      0.0              0.0   \n",
+       "\n",
+       "              intraventricular  subarachnoid  subdural  \n",
+       "Image                                                   \n",
+       "ID_45785016b               0.0           0.0       0.0  \n",
+       "ID_37f32aed2               0.0           0.0       0.0  \n",
+       "ID_1b9de2922               0.0           0.0       0.0  \n",
+       "ID_d61a6a7b9               0.0           0.0       0.0  \n",
+       "ID_406c82112               0.0           0.0       0.0  \n",
+       "\n",
+       "[5 rows x 36 columns]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "metadata.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test rows without the split marker and without the six label columns.\n",
+    "# The labels are dropped by name: slicing them off by position (`iloc[:, :-6]`) is only\n",
+    "# correct while the alphabetical column order happens to place exactly these six last.\n",
+    "test_df = metadata[metadata['Dataset'] == 'test'].drop(\n",
+    "    columns=['Dataset', 'any', 'epidural', 'intraparenchymal',\n",
+    "             'intraventricular', 'subarachnoid', 'subdural']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Training rows of the combined metadata, minus the split marker column.\n",
+    "# Note: this rebinds train_df, which previously held the long-format label table.\n",
+    "train_df = metadata.loc[metadata['Dataset'] == 'train'].drop(columns=['Dataset'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(752802, 6)"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Label matrix: the six diagnosis columns of train_df, in this fixed order.\n",
+    "train_y = train_df.loc[:, ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]\n",
+    "train_y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(752802, 35)"
+      ]
+     },
+     "execution_count": 126,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "18938"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_df.PatientID.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dividing the dataset into K folds with an equal number of PatientIDs per fold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the pickled DataFrame from the current working directory.\n",
+    "# NOTE(review): df.pkl is not written by any cell shown above, so it must already exist\n",
+    "# before this cell runs. Unpickling can execute arbitrary code -- only load trusted files.\n",
+    "df = pd.read_pickle('df.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th colspan=\"6\" halign=\"left\">Label</th>\n",
+       "      <th>filepath</th>\n",
+       "      <th>PatientID</th>\n",
+       "      <th>StudyID</th>\n",
+       "      <th>SeriesID</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Subtype</th>\n",
+       "      <th></th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>752798</th>\n",
+       "      <td>ID_ffff82e46</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>/home/ubuntu/kaggle/rsna-intracranial-hemorrha...</td>\n",
+       "      <td>ID_a85c9d08</td>\n",
+       "      <td>ID_eca4bf46ac</td>\n",
+       "      <td>ID_3ef9b97743</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752799</th>\n",
+       "      <td>ID_ffff922b9</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>/home/ubuntu/kaggle/rsna-intracranial-hemorrha...</td>\n",
+       "      <td>ID_5964c5e5</td>\n",
+       "      <td>ID_b47ca0ad05</td>\n",
+       "      <td>ID_6d2a9b2810</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752800</th>\n",
+       "      <td>ID_ffffb670a</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>/home/ubuntu/kaggle/rsna-intracranial-hemorrha...</td>\n",
+       "      <td>ID_4f7414e4</td>\n",
+       "      <td>ID_ffb2e70ba3</td>\n",
+       "      <td>ID_87b33b4a10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752801</th>\n",
+       "      <td>ID_ffffcbff8</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>/home/ubuntu/kaggle/rsna-intracranial-hemorrha...</td>\n",
+       "      <td>ID_a5382712</td>\n",
+       "      <td>ID_ff0ba45814</td>\n",
+       "      <td>ID_bd174db91c</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752802</th>\n",
+       "      <td>ID_fffff9393</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>/home/ubuntu/kaggle/rsna-intracranial-hemorrha...</td>\n",
+       "      <td>ID_41db05df</td>\n",
+       "      <td>ID_7c887292d5</td>\n",
+       "      <td>ID_dff8d8efd5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   ID Label                                             \\\n",
+       "Subtype                 any epidural intraparenchymal intraventricular   \n",
+       "752798   ID_ffff82e46     0        0                0                0   \n",
+       "752799   ID_ffff922b9     1        0                0                1   \n",
+       "752800   ID_ffffb670a     1        0                0                0   \n",
+       "752801   ID_ffffcbff8     0        0                0                0   \n",
+       "752802   ID_fffff9393     0        0                0                0   \n",
+       "\n",
+       "                               \\\n",
+       "Subtype subarachnoid subdural   \n",
+       "752798             0        0   \n",
+       "752799             0        0   \n",
+       "752800             1        0   \n",
+       "752801             0        0   \n",
+       "752802             0        0   \n",
+       "\n",
+       "                                                  filepath    PatientID  \\\n",
+       "Subtype                                                                   \n",
+       "752798   /home/ubuntu/kaggle/rsna-intracranial-hemorrha...  ID_a85c9d08   \n",
+       "752799   /home/ubuntu/kaggle/rsna-intracranial-hemorrha...  ID_5964c5e5   \n",
+       "752800   /home/ubuntu/kaggle/rsna-intracranial-hemorrha...  ID_4f7414e4   \n",
+       "752801   /home/ubuntu/kaggle/rsna-intracranial-hemorrha...  ID_a5382712   \n",
+       "752802   /home/ubuntu/kaggle/rsna-intracranial-hemorrha...  ID_41db05df   \n",
+       "\n",
+       "               StudyID       SeriesID  \n",
+       "Subtype                                \n",
+       "752798   ID_eca4bf46ac  ID_3ef9b97743  \n",
+       "752799   ID_b47ca0ad05  ID_6d2a9b2810  \n",
+       "752800   ID_ffb2e70ba3  ID_87b33b4a10  \n",
+       "752801   ID_ff0ba45814  ID_bd174db91c  \n",
+       "752802   ID_7c887292d5  ID_dff8d8efd5  "
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Distribution of the number of CT scans for each PatientID"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAATdklEQVR4nO3de4xcZ3nH8e9DDLmZxg4JK9d26yAsmhA3t1VimgqtE5o4CcJRlUhGETjIrf8JaqhSgdOKhksiggoEkADVwinhIpY0hMZygNRyvG2plJvJxXZMZEOs4CSNoXZMnUCE6dM/5l2Ybma9F+/OHPv9fqTRznnPe+Y858zMb86+c2YmMhNJUh1e0+sCJEndY+hLUkUMfUmqiKEvSRUx9CWpIjN6XcChnHLKKblgwYJJL//SSy9x4oknTl1BU6SpdUFza2tqXdDc2ppaFzS3tqbWBROrbfPmzT/PzFM7zszMxl7OO++8PBybNm06rOWnS1PrymxubU2tK7O5tTW1rszm1tbUujInVhvwSI6Sqw7vSFJFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRRr9NQy1WLD63o7tu269osuVSDraeaQvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqyLhDPyKOiYhHI2J9mT4tIh6MiB0R8a2IeF1pP7ZM7yzzF7Tdxo2l/amIuHSqN0aSdGgTOdK/HtjeNv1J4LbMXAjsA1aW9pXAvsx8M3Bb6UdEnAEsB94KLAW+GBHHHF75kqSJGFfoR8Q84Argy2U6gIuAu0qXO4Ary/VlZZoy/+LSfxkwmJmvZObTwE7g/KnYCEnS+ERmjt0p4i7gE8Drgb8BrgUeKEfzRMR84HuZeWZEbAWWZubuMu/HwAXAR8oyXy/ta8syd41Y1ypgFUBfX995g4ODk964AwcOMHPmzEkvP11G1rXl2f0d+y2ae1K3SvqtI2WfNUlTa2tqXdDc2ppaF0ystiVLlmzOzP5O82aMtXBEvBPYk5mbI2JguLlD1xxj3qGW+V1D5hpgDUB/f38ODAyM7DJuQ0NDHM7y02VkXdeuvrdjv13XDHRsn05Hyj5rkqbW1tS6oLm1NbUumLraxgx94ELgXRFxOXAc8HvAZ4FZETEjMw8C84DnSv/dwHxgd0TMAE4C9ra1D2tfRpLUBWOO6WfmjZk5LzMX0Hoj9v7MvAbYBFxVuq0A7inX15Vpyvz7szWGtA5YXs7uOQ1YCDw0ZVsiSRrTeI70R/MhYDAibgYeBdaW9rXA1yJiJ60j/OUAmbktIu4EngQOAtdl5m8OY/2SpAmaUOhn5hAwVK7/hA5n32Tmr4CrR1n+FuCWiRYpSZoafiJXkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUZM/Qj4riIeCgiHo+IbRHx0dJ+WkQ8GBE7IuJbEfG60n5smd5Z5i9ou60bS/tTEXHpdG2UJKmz8RzpvwJclJlnAWcDSyNiMfBJ4LbMXAjsA1aW/iuBfZn5ZuC20o+IOANYDrwVWAp8MSKOmcqNkSQd2pihny0HyuRryyWBi4C7SvsdwJXl+rIyTZl/cUREaR/MzFcy82lgJ3D+lGyFJGlcIjPH7tQ6It8MvBn4AvAPwAPlaJ6ImA98LzPPjIitwNL
M3F3m/Ri4APhIWebrpX1tWeauEetaBawC6OvrO29wcHDSG3fgwAFmzpw56eWny8i6tjy7v2O/RXNP6lZJv3Wk7LMmaWptTa0LmltbU+uCidW2ZMmSzZnZ32nejPHcQGb+Bjg7ImYB3wFO79St/I1R5o3WPnJda4A1AP39/TkwMDCeEjsaGhricJafLiPrunb1vR377bpmoGP7dDpS9lmTNLW2ptYFza2tqXXB1NU2obN3MvNFYAhYDMyKiOEXjXnAc+X6bmA+QJl/ErC3vb3DMpKkLhjP2TunliN8IuJ44B3AdmATcFXptgK4p1xfV6Yp8+/P1hjSOmB5ObvnNGAh8NBUbYgkaWzjGd6ZA9xRxvVfA9yZmesj4klgMCJuBh4F1pb+a4GvRcROWkf4ywEyc1tE3Ak8CRwErivDRpKkLhkz9DPzCeCcDu0/ocPZN5n5K+DqUW7rFuCWiZcpSZoKfiJXkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFZnR6wI0dRasvnfUebtuvaKLlUhqKkN/GowWvk0M3pG13rDoINeuvreRtUo6fA7vSFJFDH1JqoihL0kVMfQlqSKGviRVZMzQj4j5EbEpIrZHxLaIuL60nxwRGyJiR/k7u7RHRHw+InZGxBMRcW7bba0o/XdExIrp2yxJUifjOdI/CNyQmacDi4HrIuIMYDWwMTMXAhvLNMBlwMJyWQV8CVovEsBNwAXA+cBNwy8UkqTuGDP0M/P5zPxhuf4/wHZgLrAMuKN0uwO4slxfBnw1Wx4AZkXEHOBSYENm7s3MfcAGYOmUbo0k6ZAmNKYfEQuAc4AHgb7MfB5aLwzAG0u3ucBP2xbbXdpGa5ckdUlk5vg6RswE/g24JTPvjogXM3NW2/x9mTk7Iu4FPpGZPyjtG4EPAhcBx2bmzaX9w8DLmfnpEetZRWtYiL6+vvMGBwcnvXEHDhxg5syZk15+srY8u79j+6K5JwGvrmus/oe73kPd1shl+o6HF3458XVPt17dl+PR1NqaWhc0t7am1gUTq23JkiWbM7O/07xxfQ1DRLwW+Dbwjcy8uzS/EBFzMvP5Mnyzp7TvBua3LT4PeK60D4xoHxq5rsxcA6wB6O/vz4GBgZFdxm1oaIjDWX6yrh3taxiuGQBeXddY/Q93vYe6rZHL3LDoIJ/eMmPC655uvbovx6OptTW1LmhubU2tC6autvGcvRPAWmB7Zn6mbdY6YPgMnBXAPW3t7y1n8SwG9pfhn/uASyJidnkD95LSJknqkvEc6V8IvAfYEhGPlba/BW4F7oyIlcAzwNVl3neBy4GdwMvA+wAyc29EfBx4uPT7WGbunZKtkCSNy5ihX8bmY5TZF3fon8B1o9zW7cDtEylQkjR1/ESuJFXE0Jekihj6klQRQ1+SKuLPJTbYkfSzi5KODB7pS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkX85awuGv4lrBsWHeTaUX4VS5Kmk0f6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFRkz9CPi9ojYExFb29pOjogNEbGj/J1d2iMiPh8ROyPiiYg4t22ZFaX/johYMT2bI0k6lPEc6X8FWDqibTWwMTMXAhvLNMBlwMJyWQV8CVovEsBNwAXA+cBNwy8UkqTuGTP0M/Pfgb0jmpcBd5TrdwBXtrV/NVseAGZFxBzgUmBDZu7NzH3ABl79QiJJmmaRmWN3ilgArM/MM8v0i5k5q23+vsycHRHrgVs
z8welfSPwIWAAOC4zby7tHwZ+mZmf6rCuVbT+S6Cvr++8wcHBSW/cgQMHmDlz5qSXn6wtz+4/5Py+4+GFX07+9hfNPWnC6x3vMsO1jda/V3p1X45HU2tral3Q3NqaWhdMrLYlS5Zszsz+TvOm+pezokNbHqL91Y2Za4A1AP39/TkwMDDpYoaGhjic5SdrrF/FumHRQT69ZfK7ftc1AxNe73iXGa5ttP690qv7cjyaWltT64Lm1tbUumDqapvs2TsvlGEbyt89pX03ML+t3zzguUO0S5K6aLKhvw4YPgNnBXBPW/t7y1k8i4H9mfk8cB9wSUTMLm/gXlLaJEldNOYYQ0R8k9aY/CkRsZvWWTi3AndGxErgGeDq0v27wOXATuBl4H0Ambk3Ij4OPFz6fSwzR745LEmaZmOGfma+e5RZF3fom8B1o9zO7cDtE6pOkjSl/ESuJFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRab6l7PUBQvG+GUuSRqNR/qSVBFDX5Iq4vBOJaZ7SGi029916xXTul5JE+ORviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcRP5KojP2ErHZ080pekihj6klQRh3fUEw4fSb1h6Etd4gudmsDhHUmqiKEvSRVxeEeapOHhmhsWHeTatqGb6R6umegwkcNKamfot/HJMfX8xa7Jm+591w2jbcNXlp7Y5Uo0zNBXVQ4VpEf6C8XIbRv5H4gEPQj9iFgKfA44BvhyZt7a7RokTc7R/J9VLboa+hFxDPAF4M+A3cDDEbEuM5/sZh0T5QP9d3o1XHMkDQdMdB81cRinadswVc/BXj6Xm5Ij3T7SPx/YmZk/AYiIQWAZ0OjQH00Tn6xHqy3P7p/QUMVU3jdH6/3cxO3qVU2jvSl/KFMV1t1+MYjMnJYb7riyiKuApZn5F2X6PcAFmfn+tj6rgFVl8i3AU4exylOAnx/G8tOlqXVBc2tral3Q3NqaWhc0t7am1gUTq+0PM/PUTjO6faQfHdr+36tOZq4B1kzJyiIeycz+qbitqdTUuqC5tTW1LmhubU2tC5pbW1PrgqmrrdsfztoNzG+bngc81+UaJKla3Q79h4GFEXFaRLwOWA6s63INklStrg7vZObBiHg/cB+tUzZvz8xt07jKKRkmmgZNrQuaW1tT64Lm1tbUuqC5tTW1LpiqYe9uvpErSeotv3BNkipi6EtSRY6K0I+I+RGxKSK2R8S2iLi+tJ8cERsiYkf5O7sHtR0XEQ9FxOOlto+W9tMi4sFS27fKG9tdFxHHRMSjEbG+YXXtiogtEfFYRDxS2ppwf86KiLsi4kfl8fa2htT1lrKvhi+/iIgPNKS2vy6P/a0R8c3ynGjK4+z6Ute2iPhAaev6PouI2yNiT0RsbWvrWEe0fD4idkbEExFx7kTWdVSEPnAQuCEzTwcWA9dFxBnAamBjZi4ENpbpbnsFuCgzzwLOBpZGxGLgk8BtpbZ9wMoe1AZwPbC9bbopdQEsycyz285NbsL9+Tng+5n5R8BZtPZdz+vKzKfKvjobOA94GfhOr2uLiLnAXwH9mXkmrRM4ltOAx1lEnAn8Ja1vCjgLeGdELKQ3++wrwNIRbaPVcRmwsFxWAV+a0Joy86i7APfQ+n6fp4A5pW0O8FSP6zoB+CFwAa1P1s0o7W8D7utBPfPKg+kiYD2tD8/1vK6y7l3AKSPaenp/Ar8HPE05AaIpdXWo8xLgP5tQGzAX+ClwMq2zBdcDlzbhcQZcTetLH4enPwx8sFf7DFgAbB3rcQX8I/DuTv3GczlajvR/KyIWAOcADwJ9mfk8QPn7xh7VdExEPAbsATYAPwZezMyDpctuWk+ObvssrQf5/5bpNzSkLmh9UvtfI2Jz+WoO6P39+SbgZ8A/lSGxL0fEiQ2oa6TlwDfL9Z7WlpnPAp8CngGeB/Y
Dm2nG42wr8PaIeENEnABcTuvDo025P0erY/iFdNiE9t9RFfoRMRP4NvCBzPxFr+sZlpm/yda/3fNo/St5eqdu3awpIt4J7MnMze3NHbr26pzeCzPzXFr/yl4XEW/vUR3tZgDnAl/KzHOAl+jNENOoytj4u4B/7nUtAGUcehlwGvD7wIm07tORuv44y8zttIaZNgDfBx6nNVTcdIf1PD1qQj8iXksr8L+RmXeX5hciYk6ZP4fWkXbPZOaLwBCt9x1mRcTwh+N68XUUFwLviohdwCCtIZ7PNqAuADLzufJ3D62x6fPp/f25G9idmQ+W6btovQj0uq52lwE/zMwXynSva3sH8HRm/iwzfw3cDfwJzXmcrc3MczPz7cBeYAe932fDRqvjsL7O5qgI/YgIYC2wPTM/0zZrHbCiXF9Ba6y/27WdGhGzyvXjaT0JtgObgKt6VVtm3piZ8zJzAa3hgPsz85pe1wUQESdGxOuHr9Mao95Kj+/PzPwv4KcR8ZbSdDGtrwXv+eOszbv53dAO9L62Z4DFEXFCeZ4O77OeP84AIuKN5e8fAH9Oa9/1ep8NG62OdcB7y1k8i4H9w8NA49LtN0+m6Q2QP6X1780TwGPlcjmtMeqNtF69NwIn96C2PwYeLbVtBf6+tL8JeAjYSetf8WN7uP8GgPVNqavU8Hi5bAP+rrQ34f48G3ik3J//AsxuQl2lthOA/wZOamvreW3AR4Eflcf/14Bjm/A4K7X9B60XoceBi3u1z2i92DwP/JrWkfzK0eqgNbzzBVrvDW6hdWbUuNfl1zBIUkWOiuEdSdL4GPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIv8HcUE1AMKbz9IAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Count rows per PatientID and plot the distribution for patients with fewer than 100 rows.\n",
+    "rows_per_patient = df[\"PatientID\"].value_counts()\n",
+    "ax = rows_per_patient[rows_per_patient < 100].hist(bins=50)\n",
+    "# Label the figure so it can be read on its own when the notebook is skimmed.\n",
+    "ax.set(title=\"Rows per patient (patients with < 100 rows)\",\n",
+    "       xlabel=\"rows per PatientID\", ylabel=\"number of patients\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "18938\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Distinct PatientID values in df, held as a set; print how many there are.\n",
+    "patient_id_train = {*df[\"PatientID\"].unique()}\n",
+    "print(len(patient_id_train))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Image</th>\n",
+       "      <th>SOPInstanceUID</th>\n",
+       "      <th>PatientID</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID_45785016b</td>\n",
+       "      <td>ID_45785016b</td>\n",
+       "      <td>ID_0002cd41</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ID_37f32aed2</td>\n",
+       "      <td>ID_37f32aed2</td>\n",
+       "      <td>ID_0002cd41</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ID_1b9de2922</td>\n",
+       "      <td>ID_1b9de2922</td>\n",
+       "      <td>ID_0002cd41</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ID_d61a6a7b9</td>\n",
+       "      <td>ID_d61a6a7b9</td>\n",
+       "      <td>ID_0002cd41</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID_406c82112</td>\n",
+       "      <td>ID_406c82112</td>\n",
+       "      <td>ID_0002cd41</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          Image SOPInstanceUID    PatientID  any  epidural  intraparenchymal  \\\n",
+       "0  ID_45785016b   ID_45785016b  ID_0002cd41  0.0       0.0               0.0   \n",
+       "1  ID_37f32aed2   ID_37f32aed2  ID_0002cd41  0.0       0.0               0.0   \n",
+       "2  ID_1b9de2922   ID_1b9de2922  ID_0002cd41  0.0       0.0               0.0   \n",
+       "3  ID_d61a6a7b9   ID_d61a6a7b9  ID_0002cd41  0.0       0.0               0.0   \n",
+       "4  ID_406c82112   ID_406c82112  ID_0002cd41  0.0       0.0               0.0   \n",
+       "\n",
+       "   intraventricular  subarachnoid  subdural  \n",
+       "0               0.0           0.0       0.0  \n",
+       "1               0.0           0.0       0.0  \n",
+       "2               0.0           0.0       0.0  \n",
+       "3               0.0           0.0       0.0  \n",
+       "4               0.0           0.0       0.0  "
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep the identifier and label columns; reset_index() turns the existing index\n",
+    "# into an ordinary column (it appears as `Image` in the preview).\n",
+    "selected_columns = ['SOPInstanceUID', 'PatientID', 'any', 'epidural', 'intraparenchymal',\n",
+    "                    'intraventricular', 'subarachnoid', 'subdural']\n",
+    "train = train_df[selected_columns].reset_index()\n",
+    "train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(752802, 9)\n",
+      "(752802, 9)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Image IDs to exclude from training (presumably unreadable/corrupt files - TODO confirm).\n",
+    "# NOTE(review): the recorded output shows the same shape before and after,\n",
+    "# i.e. this ID matched no SOPInstanceUID when the cell was last run - verify.\n",
+    "IGNORE_IDS = ['ID_6431af929']\n",
+    "print(train.shape)\n",
+    "keep_mask = ~train['SOPInstanceUID'].isin(IGNORE_IDS)\n",
+    "train = train.loc[keep_mask]\n",
+    "print(train.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(18938,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Distinct patient IDs in order of first appearance; the fold split indexes into this array.\n",
+    "patient_id = pd.unique(train[\"PatientID\"])\n",
+    "print(patient_id.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(151865, 10)\n",
+      "(148063, 10)\n",
+      "(151306, 10)\n",
+      "(150597, 10)\n",
+      "(150971, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import KFold\n",
+    "\n",
+    "seed = 2020\n",
+    "n_splits = 5\n",
+    "# Folds are drawn over the unique patient IDs (not over rows), so every row of a\n",
+    "# given patient lands on the same side of the split. This is a plain shuffled\n",
+    "# KFold; it is not stratified by label.\n",
+    "kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)\n",
+    "\n",
+    "for fold, (train_index, valid_index) in enumerate(kf.split(patient_id)):\n",
+    "    df_train = train[train[\"PatientID\"].isin(patient_id[train_index])]\n",
+    "    df_valid = train[train[\"PatientID\"].isin(patient_id[valid_index])]\n",
+    "\n",
+    "    # Sanity checks: no patient is shared between the two subsets, and together\n",
+    "    # they account for every row of `train` (fails if `patient_id` is stale).\n",
+    "    assert set(df_train[\"PatientID\"]).isdisjoint(df_valid[\"PatientID\"])\n",
+    "    assert len(df_train) + len(df_valid) == len(train)\n",
+    "\n",
+    "    # index=False: do not write the row index as an extra CSV column.\n",
+    "    df_train.to_csv(\"train_{}.csv\".format(fold), index=False)\n",
+    "    df_valid.to_csv(\"valid_{}.csv\".format(fold), index=False)\n",
+    "\n",
+    "    print(df_valid.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Image</th>\n",
+       "      <th>SOPInstanceUID</th>\n",
+       "      <th>PatientID</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "      <th>set</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>ID_138d275c8</td>\n",
+       "      <td>ID_138d275c8</td>\n",
+       "      <td>ID_00054f3f</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>ID_447fa09d9</td>\n",
+       "      <td>ID_447fa09d9</td>\n",
+       "      <td>ID_00054f3f</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>ID_0f1298f68</td>\n",
+       "      <td>ID_0f1298f68</td>\n",
+       "      <td>ID_00054f3f</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>ID_c24918b79</td>\n",
+       "      <td>ID_c24918b79</td>\n",
+       "      <td>ID_00054f3f</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>ID_c0005a263</td>\n",
+       "      <td>ID_c0005a263</td>\n",
+       "      <td>ID_00054f3f</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752797</th>\n",
+       "      <td>ID_72e823e2c</td>\n",
+       "      <td>ID_72e823e2c</td>\n",
+       "      <td>ID_fffc2bd6</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752798</th>\n",
+       "      <td>ID_4184c4f03</td>\n",
+       "      <td>ID_4184c4f03</td>\n",
+       "      <td>ID_fffc2bd6</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752799</th>\n",
+       "      <td>ID_a8aca4f40</td>\n",
+       "      <td>ID_a8aca4f40</td>\n",
+       "      <td>ID_fffc2bd6</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752800</th>\n",
+       "      <td>ID_716b72762</td>\n",
+       "      <td>ID_716b72762</td>\n",
+       "      <td>ID_fffc2bd6</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>752801</th>\n",
+       "      <td>ID_deb85caf0</td>\n",
+       "      <td>ID_deb85caf0</td>\n",
+       "      <td>ID_fffc2bd6</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>601831 rows × 10 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               Image SOPInstanceUID    PatientID  any  epidural  \\\n",
+       "36      ID_138d275c8   ID_138d275c8  ID_00054f3f  0.0       0.0   \n",
+       "37      ID_447fa09d9   ID_447fa09d9  ID_00054f3f  0.0       0.0   \n",
+       "38      ID_0f1298f68   ID_0f1298f68  ID_00054f3f  0.0       0.0   \n",
+       "39      ID_c24918b79   ID_c24918b79  ID_00054f3f  0.0       0.0   \n",
+       "40      ID_c0005a263   ID_c0005a263  ID_00054f3f  0.0       0.0   \n",
+       "...              ...            ...          ...  ...       ...   \n",
+       "752797  ID_72e823e2c   ID_72e823e2c  ID_fffc2bd6  0.0       0.0   \n",
+       "752798  ID_4184c4f03   ID_4184c4f03  ID_fffc2bd6  0.0       0.0   \n",
+       "752799  ID_a8aca4f40   ID_a8aca4f40  ID_fffc2bd6  0.0       0.0   \n",
+       "752800  ID_716b72762   ID_716b72762  ID_fffc2bd6  0.0       0.0   \n",
+       "752801  ID_deb85caf0   ID_deb85caf0  ID_fffc2bd6  0.0       0.0   \n",
+       "\n",
+       "        intraparenchymal  intraventricular  subarachnoid  subdural  set  \n",
+       "36                   0.0               0.0           0.0       0.0    0  \n",
+       "37                   0.0               0.0           0.0       0.0    0  \n",
+       "38                   0.0               0.0           0.0       0.0    0  \n",
+       "39                   0.0               0.0           0.0       0.0    0  \n",
+       "40                   0.0               0.0           0.0       0.0    0  \n",
+       "...                  ...               ...           ...       ...  ...  \n",
+       "752797               0.0               0.0           0.0       0.0    0  \n",
+       "752798               0.0               0.0           0.0       0.0    0  \n",
+       "752799               0.0               0.0           0.0       0.0    0  \n",
+       "752800               0.0               0.0           0.0       0.0    0  \n",
+       "752801               0.0               0.0           0.0       0.0    0  \n",
+       "\n",
+       "[601831 rows x 10 columns]"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# df_train still holds the training subset from the last iteration of the fold loop.\n",
+    "df_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}