[fc9ccf]: / 2-preprocess-pickle.ipynb

Download this file

1277 lines (1276 with data), 41.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from collections import defaultdict\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "stage = \"stage_2\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(f\"data/{stage}_train_dicom_diags.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>BitsAllocated</th>\n",
       "      <th>BitsStored</th>\n",
       "      <th>Columns</th>\n",
       "      <th>HighBit</th>\n",
       "      <th>ImageOrientationPatient_0</th>\n",
       "      <th>ImageOrientationPatient_1</th>\n",
       "      <th>ImageOrientationPatient_2</th>\n",
       "      <th>ImageOrientationPatient_3</th>\n",
       "      <th>ImageOrientationPatient_4</th>\n",
       "      <th>...</th>\n",
       "      <th>WindowWidth</th>\n",
       "      <th>WindowWidth_0</th>\n",
       "      <th>WindowWidth_1</th>\n",
       "      <th>fid</th>\n",
       "      <th>any</th>\n",
       "      <th>epidural</th>\n",
       "      <th>intraparenchymal</th>\n",
       "      <th>intraventricular</th>\n",
       "      <th>subarachnoid</th>\n",
       "      <th>subdural</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.927184</td>\n",
       "      <td>...</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_000012eaf</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.968148</td>\n",
       "      <td>...</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_000039fa0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_00005679d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.994522</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>ID_00008ce3c</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>135.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_0000950d7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 41 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  BitsAllocated  BitsStored  Columns  HighBit  \\\n",
       "0           0             16          16      512       15   \n",
       "1           1             16          16      512       15   \n",
       "2           2             16          16      512       15   \n",
       "3           3             16          12      512       11   \n",
       "4           4             16          16      512       15   \n",
       "\n",
       "   ImageOrientationPatient_0  ImageOrientationPatient_1  \\\n",
       "0                        1.0                        0.0   \n",
       "1                        1.0                        0.0   \n",
       "2                        1.0                        0.0   \n",
       "3                        1.0                        0.0   \n",
       "4                        1.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_2  ImageOrientationPatient_3  \\\n",
       "0                        0.0                        0.0   \n",
       "1                        0.0                        0.0   \n",
       "2                        0.0                        0.0   \n",
       "3                        0.0                        0.0   \n",
       "4                        0.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_4  ...  WindowWidth  WindowWidth_0  WindowWidth_1  \\\n",
       "0                   0.927184  ...         80.0            NaN            NaN   \n",
       "1                   0.968148  ...         80.0            NaN            NaN   \n",
       "2                   1.000000  ...        100.0            NaN            NaN   \n",
       "3                   0.994522  ...          NaN           80.0           80.0   \n",
       "4                   1.000000  ...        135.0            NaN            NaN   \n",
       "\n",
       "            fid any epidural intraparenchymal  intraventricular  subarachnoid  \\\n",
       "0  ID_000012eaf   0        0                0                 0             0   \n",
       "1  ID_000039fa0   0        0                0                 0             0   \n",
       "2  ID_00005679d   0        0                0                 0             0   \n",
       "3  ID_00008ce3c   0        0                0                 0             0   \n",
       "4  ID_0000950d7   0        0                0                 0             0   \n",
       "\n",
       "   subdural  \n",
       "0         0  \n",
       "1         0  \n",
       "2         0  \n",
       "3         0  \n",
       "4         0  \n",
       "\n",
       "[5 rows x 41 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21744"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Order all slices by z position first; groupby keeps that row order inside each series.\n",
    "gs = (\n",
    "    df\n",
    "    .sort_values(by='ImagePositionPatient_2')\n",
    "    .groupby(by='SeriesInstanceUID')\n",
    ")\n",
    "len(gs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ImagePositionPatient_2</th>\n",
       "      <th>fid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>577964</th>\n",
       "      <td>193.542489</td>\n",
       "      <td>ID_c45659d3d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229790</th>\n",
       "      <td>198.214051</td>\n",
       "      <td>ID_4e0bdd2ba</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22395</th>\n",
       "      <td>202.885613</td>\n",
       "      <td>ID_079945c27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>746126</th>\n",
       "      <td>207.557174</td>\n",
       "      <td>ID_fdbfb2c17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>253266</th>\n",
       "      <td>212.228736</td>\n",
       "      <td>ID_55f7bbbf2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        ImagePositionPatient_2           fid\n",
       "577964              193.542489  ID_c45659d3d\n",
       "229790              198.214051  ID_4e0bdd2ba\n",
       "22395               202.885613  ID_079945c27\n",
       "746126              207.557174  ID_fdbfb2c17\n",
       "253266              212.228736  ID_55f7bbbf2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the slices of one series should now appear in increasing z position\n",
    "gs.get_group('ID_fa19cd5ea9')[['ImagePositionPatient_2', 'fid']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = gs.get_group('ID_fa19cd5ea9')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "subg = g[['SeriesInstanceUID', 'fid', 'any', 'epidural', \n",
    "          'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SeriesInstanceUID</th>\n",
       "      <th>fid</th>\n",
       "      <th>any</th>\n",
       "      <th>epidural</th>\n",
       "      <th>intraparenchymal</th>\n",
       "      <th>intraventricular</th>\n",
       "      <th>subarachnoid</th>\n",
       "      <th>subdural</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>577964</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_c45659d3d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229790</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_4e0bdd2ba</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22395</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_079945c27</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>746126</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_fdbfb2c17</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>253266</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_55f7bbbf2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>549211</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_ba7080372</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>592856</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_c964e4096</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183149</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_3e31d57d0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>306771</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_680b2194c</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>540358</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_b76b13444</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>645217</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_db48a633d</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270974</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_5bf2ca43f</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>672814</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_e4b636907</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350834</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_7714ead69</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>749886</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_ff012ee5b</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>523978</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_b1cea5abb</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>464942</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_9dad2eb09</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>229881</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_4e14d0fe8</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>186237</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_3f422852d</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599624</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_cbbb50e6d</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>347055</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_75cbdae68</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>359450</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_7a02fdbea</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127205</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_2b3671dd9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148587</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_3274f5977</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>413641</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_8c5fc9e44</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>688538</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_ea2861e9a</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>318670</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_6c19c9f7b</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>630472</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_d6435f3bf</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202656</th>\n",
       "      <td>ID_fa19cd5ea9</td>\n",
       "      <td>ID_44d57858e</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       SeriesInstanceUID           fid  any  epidural  intraparenchymal  \\\n",
       "577964     ID_fa19cd5ea9  ID_c45659d3d    0         0                 0   \n",
       "229790     ID_fa19cd5ea9  ID_4e0bdd2ba    0         0                 0   \n",
       "22395      ID_fa19cd5ea9  ID_079945c27    1         0                 0   \n",
       "746126     ID_fa19cd5ea9  ID_fdbfb2c17    1         0                 0   \n",
       "253266     ID_fa19cd5ea9  ID_55f7bbbf2    1         0                 0   \n",
       "549211     ID_fa19cd5ea9  ID_ba7080372    1         0                 0   \n",
       "592856     ID_fa19cd5ea9  ID_c964e4096    1         0                 0   \n",
       "183149     ID_fa19cd5ea9  ID_3e31d57d0    1         0                 0   \n",
       "306771     ID_fa19cd5ea9  ID_680b2194c    1         0                 0   \n",
       "540358     ID_fa19cd5ea9  ID_b76b13444    1         0                 0   \n",
       "645217     ID_fa19cd5ea9  ID_db48a633d    1         0                 0   \n",
       "270974     ID_fa19cd5ea9  ID_5bf2ca43f    1         0                 0   \n",
       "672814     ID_fa19cd5ea9  ID_e4b636907    1         0                 0   \n",
       "350834     ID_fa19cd5ea9  ID_7714ead69    1         0                 0   \n",
       "749886     ID_fa19cd5ea9  ID_ff012ee5b    1         0                 0   \n",
       "523978     ID_fa19cd5ea9  ID_b1cea5abb    1         0                 0   \n",
       "464942     ID_fa19cd5ea9  ID_9dad2eb09    1         0                 0   \n",
       "229881     ID_fa19cd5ea9  ID_4e14d0fe8    1         0                 0   \n",
       "186237     ID_fa19cd5ea9  ID_3f422852d    1         0                 0   \n",
       "599624     ID_fa19cd5ea9  ID_cbbb50e6d    1         0                 0   \n",
       "347055     ID_fa19cd5ea9  ID_75cbdae68    1         0                 0   \n",
       "359450     ID_fa19cd5ea9  ID_7a02fdbea    1         0                 0   \n",
       "127205     ID_fa19cd5ea9  ID_2b3671dd9    1         0                 0   \n",
       "148587     ID_fa19cd5ea9  ID_3274f5977    0         0                 0   \n",
       "413641     ID_fa19cd5ea9  ID_8c5fc9e44    0         0                 0   \n",
       "688538     ID_fa19cd5ea9  ID_ea2861e9a    0         0                 0   \n",
       "318670     ID_fa19cd5ea9  ID_6c19c9f7b    0         0                 0   \n",
       "630472     ID_fa19cd5ea9  ID_d6435f3bf    0         0                 0   \n",
       "202656     ID_fa19cd5ea9  ID_44d57858e    0         0                 0   \n",
       "\n",
       "        intraventricular  subarachnoid  subdural  \n",
       "577964                 0             0         0  \n",
       "229790                 0             0         0  \n",
       "22395                  0             0         1  \n",
       "746126                 0             0         1  \n",
       "253266                 0             0         1  \n",
       "549211                 0             0         1  \n",
       "592856                 0             0         1  \n",
       "183149                 0             0         1  \n",
       "306771                 0             0         1  \n",
       "540358                 0             0         1  \n",
       "645217                 0             0         1  \n",
       "270974                 0             0         1  \n",
       "672814                 0             0         1  \n",
       "350834                 0             0         1  \n",
       "749886                 0             0         1  \n",
       "523978                 0             0         1  \n",
       "464942                 0             0         1  \n",
       "229881                 0             0         1  \n",
       "186237                 0             0         1  \n",
       "599624                 0             0         1  \n",
       "347055                 0             0         1  \n",
       "359450                 0             0         1  \n",
       "127205                 0             0         1  \n",
       "148587                 0             0         0  \n",
       "413641                 0             0         0  \n",
       "688538                 0             0         0  \n",
       "318670                 0             0         0  \n",
       "630472                 0             0         0  \n",
       "202656                 0             0         0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional step: on a btrfs snapshot of the images, rename each file by series id and z-order position.\n",
    "def rename_train_group(subg):\n",
    "    \"\"\"Rename the .dcm file of every row in `subg` to a name that encodes its metadata.\n",
    "\n",
    "    The new file stem is\n",
    "    <SeriesInstanceUID>_<position>_<row count>_<any>_<epidural>_<intraparenchymal>_\n",
    "    <intraventricular>_<subarachnoid>_<subdural>_<fid>, where position is the 0-based\n",
    "    row position within `subg` and both position and row count are zero-padded to\n",
    "    three digits. Files are renamed in place inside the train image directory of the\n",
    "    current `stage`.\n",
    "    \"\"\"\n",
    "    total = len(subg)\n",
    "    for ix, (_, row) in enumerate(subg.iterrows()):\n",
    "        cur_fn = row['fid']\n",
    "        new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{row['any']}_{row['epidural']}_{row['intraparenchymal']}_{row['intraventricular']}_{row['subarachnoid']}_{row['subdural']}_{cur_fn}\"\n",
    "        old_path = Path(f'data/unzip_renamed/{stage}_train_images/{cur_fn}.dcm')\n",
    "        old_path.rename(f'data/unzip_renamed/{stage}_train_images/{new_fn}.dcm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def index_group(subg, study_ix_to_fn, fn_to_study_ix):\n",
    "    \"\"\"Record the row order of `subg` in both lookup tables.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    subg : pandas.DataFrame\n",
    "        Rows to index, already in the desired order. Must provide the\n",
    "        'SOPInstanceUID' and 'SeriesInstanceUID' columns.\n",
    "    study_ix_to_fn : mapping\n",
    "        Updated in place: each SOPInstanceUID is appended to the list stored under\n",
    "        its SeriesInstanceUID. Looking up a new key must yield a list, e.g. a\n",
    "        defaultdict(list).\n",
    "    fn_to_study_ix : dict\n",
    "        Updated in place: SOPInstanceUID -> (SeriesInstanceUID, position), where\n",
    "        position is the 0-based row position within `subg`.\n",
    "    \"\"\"\n",
    "    # Walk the two needed columns directly; DataFrame.iterrows() would build a\n",
    "    # full Series for every row, which is slow over the whole train set.\n",
    "    pairs = zip(subg['SOPInstanceUID'], subg['SeriesInstanceUID'])\n",
    "    for ix, (fn, study) in enumerate(pairs):\n",
    "        study_ix_to_fn[study].append(fn)\n",
    "        fn_to_study_ix[fn] = (study, ix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Diagnosis columns, in the order they are reported for each file.\n",
    "labels = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']\n",
    "\n",
    "def label_group(subg, fn_to_labels):\n",
    "    \"\"\"Map every file in `subg` to the names of the labels that are set for it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    subg : pandas.DataFrame\n",
    "        Must provide 'SOPInstanceUID' plus one column per entry of `labels`.\n",
    "    fn_to_labels : dict\n",
    "        Updated in place: SOPInstanceUID -> list of label names whose column\n",
    "        equals 1 in that row, in the order of `labels` (empty list when none is set).\n",
    "    \"\"\"\n",
    "    # Pull the label columns out once instead of building a Series per row with\n",
    "    # DataFrame.iterrows() and doing six lookups on it.\n",
    "    flag_rows = subg[labels].to_numpy()\n",
    "    for fn, flags in zip(subg['SOPInstanceUID'], flag_rows):\n",
    "        fn_to_labels[fn] = [label for label, flag in zip(labels, flags) if flag == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup tables for the train split, filled one series at a time below.\n",
    "train_study_ix_to_fn = defaultdict(list)  # SeriesInstanceUID -> SOPInstanceUIDs in group order\n",
    "train_fn_to_study_ix = {}                 # SOPInstanceUID -> (SeriesInstanceUID, position)\n",
    "train_fn_to_labels = {}                   # SOPInstanceUID -> names of the labels that are set\n",
    "\n",
    "for name, subg in gs:\n",
    "    # The on-disk rename via rename_train_group(subg) is deliberately not run here.\n",
    "    label_group(subg, train_fn_to_labels)\n",
    "    index_group(subg, train_study_ix_to_fn, train_fn_to_study_ix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the earlier note here read 'Do not pickle yet, merge with test', but the\n",
    "# train-only index tables are written below; confirm whether these files are still wanted.\n",
    "# Context managers make sure each file is flushed and closed as soon as its dump finishes.\n",
    "with open(f\"data/{stage}_train_study_ix_to_fn.pickle\", \"wb\") as fh:\n",
    "    pickle.dump(train_study_ix_to_fn, fh)\n",
    "with open(f\"data/{stage}_train_fn_to_study_ix.pickle\", \"wb\") as fh:\n",
    "    pickle.dump(train_fn_to_study_ix, fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['any', 'subdural']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_fn_to_labels['ID_079945c27']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the label table; the with-block closes the file instead of leaving the handle open.\n",
    "with open(f\"data/{stage}_train_fn_to_labels.pickle\", 'wb') as fh:\n",
    "    pickle.dump(train_fn_to_labels, fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(f\"data/{stage}_test_dicom.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>BitsAllocated</th>\n",
       "      <th>BitsStored</th>\n",
       "      <th>Columns</th>\n",
       "      <th>HighBit</th>\n",
       "      <th>ImageOrientationPatient_0</th>\n",
       "      <th>ImageOrientationPatient_1</th>\n",
       "      <th>ImageOrientationPatient_2</th>\n",
       "      <th>ImageOrientationPatient_3</th>\n",
       "      <th>ImageOrientationPatient_4</th>\n",
       "      <th>...</th>\n",
       "      <th>SamplesPerPixel</th>\n",
       "      <th>SeriesInstanceUID</th>\n",
       "      <th>StudyID</th>\n",
       "      <th>StudyInstanceUID</th>\n",
       "      <th>WindowCenter</th>\n",
       "      <th>WindowCenter_0</th>\n",
       "      <th>WindowCenter_1</th>\n",
       "      <th>WindowWidth</th>\n",
       "      <th>WindowWidth_0</th>\n",
       "      <th>WindowWidth_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.981627</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_4d28912ba6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_1f6d1e8aeb</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.987688</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_acabdeee86</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_4a8d7ec19f</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.927184</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_d00cee7f0c</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_a6ca244172</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.986286</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_a52a0112d5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_fa950a03af</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_f552d3b922</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ID_965d8b3d8e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  BitsAllocated  BitsStored  Columns  HighBit  \\\n",
       "0           0             16          12      512       11   \n",
       "1           1             16          16      512       15   \n",
       "2           2             16          16      512       15   \n",
       "3           3             16          16      512       15   \n",
       "4           4             16          12      512       11   \n",
       "\n",
       "   ImageOrientationPatient_0  ImageOrientationPatient_1  \\\n",
       "0                        1.0                        0.0   \n",
       "1                        1.0                        0.0   \n",
       "2                        1.0                        0.0   \n",
       "3                        1.0                        0.0   \n",
       "4                        1.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_2  ImageOrientationPatient_3  \\\n",
       "0                        0.0                        0.0   \n",
       "1                        0.0                        0.0   \n",
       "2                        0.0                        0.0   \n",
       "3                        0.0                        0.0   \n",
       "4                        0.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_4  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
       "0                   0.981627  ...                1      ID_4d28912ba6   \n",
       "1                   0.987688  ...                1      ID_acabdeee86   \n",
       "2                   0.927184  ...                1      ID_d00cee7f0c   \n",
       "3                   0.986286  ...                1      ID_a52a0112d5   \n",
       "4                   1.000000  ...                1      ID_f552d3b922   \n",
       "\n",
       "   StudyID  StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1  \\\n",
       "0      NaN     ID_1f6d1e8aeb          NaN           40.0           40.0   \n",
       "1      NaN     ID_4a8d7ec19f         30.0            NaN            NaN   \n",
       "2      NaN     ID_a6ca244172         30.0            NaN            NaN   \n",
       "3      NaN     ID_fa950a03af         30.0            NaN            NaN   \n",
       "4      NaN     ID_965d8b3d8e          NaN           36.0           36.0   \n",
       "\n",
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
       "0          NaN           80.0           80.0  \n",
       "1         80.0            NaN            NaN  \n",
       "2         80.0            NaN            NaN  \n",
       "3         80.0            NaN            NaN  \n",
       "4          NaN           80.0           80.0  \n",
       "\n",
       "[5 rows x 34 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3518"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort, then group by (order is preserver within groups)\n",
    "gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n",
    "len(gs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rename_test_group(subg):\n",
    "    ix = 0\n",
    "    total = len(subg)\n",
    "    for index, row in subg.iterrows():\n",
    "        cur_fn = row['SOPInstanceUID']\n",
    "        new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{cur_fn}\"\n",
    "        ix += 1\n",
    "        Path(f'data/unzip_renamed/{stage}_test_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_test_images/{new_fn}.dcm')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_study_ix_to_fn = defaultdict(list)\n",
    "test_fn_to_study_ix = {}\n",
    "\n",
    "for name, subg in gs:\n",
    "    #rename_test_group(subg)\n",
    "    index_group(subg, test_study_ix_to_fn, test_fn_to_study_ix)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(test_study_ix_to_fn, open(f\"data/{stage}_test_study_ix_to_fn.pickle\", \"wb\" ))\n",
    "pickle.dump(test_fn_to_study_ix, open(f\"data/{stage}_test_fn_to_study_ix.pickle\", \"wb\" ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "study_ix_to_fn = { **train_study_ix_to_fn, **test_study_ix_to_fn }\n",
    "fn_to_study_ix = { **train_fn_to_study_ix, **test_fn_to_study_ix }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(study_ix_to_fn, open(f\"data/{stage}_study_ix_to_fn.pickle\", \"wb\" ))\n",
    "pickle.dump(fn_to_study_ix, open(f\"data/{stage}_fn_to_study_ix.pickle\", \"wb\" ))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}