Switch to side-by-side view

--- a
+++ b/0-preprocess-generate_csvs.ipynb
@@ -0,0 +1,1596 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm_notebook\n",
+    "import pydicom\n",
+    "import itertools\n",
+    "import numpy as np\n",
+    "from concurrent.futures import ProcessPoolExecutor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "tqdm().pandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read stage_X_train.csv and split ID into file id and label"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stage = \"stage_2\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -f: do not fail (or print an error) when the corrupted file was already removed on a previous run.\n",
+    "!rm -f {bad_dcm_fn}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>Label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID_12cadc6af_epidural</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ID_12cadc6af_intraventricular</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ID_12cadc6af_subarachnoid</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID_12cadc6af_subdural</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              ID  Label\n",
+       "0          ID_12cadc6af_epidural      0\n",
+       "1  ID_12cadc6af_intraparenchymal      0\n",
+       "2  ID_12cadc6af_intraventricular      0\n",
+       "3      ID_12cadc6af_subarachnoid      0\n",
+       "4          ID_12cadc6af_subdural      0"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# File id = first two underscore-separated tokens of ID (e.g. ID_12cadc6af_epidural -> ID_12cadc6af).\n",
+    "df_train['fid'] = ['_'.join(image_id.split('_')[:2]) for image_id in df_train.ID]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rename by key instead of assigning positionally, so re-running this cell after more\n",
+    "# columns exist cannot raise a length mismatch or mislabel columns.\n",
+    "df_train = df_train.rename(columns={'Label': 'probability'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Diagnosis label = last underscore-separated token of ID (e.g. ID_12cadc6af_epidural -> epidural).\n",
+    "df_train['label'] = [image_id.rsplit('_', 1)[-1] for image_id in df_train.ID]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>probability</th>\n",
+       "      <th>fid</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID_12cadc6af_epidural</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ID_12cadc6af</td>\n",
+       "      <td>epidural</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ID_12cadc6af</td>\n",
+       "      <td>intraparenchymal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ID_12cadc6af_intraventricular</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ID_12cadc6af</td>\n",
+       "      <td>intraventricular</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ID_12cadc6af_subarachnoid</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ID_12cadc6af</td>\n",
+       "      <td>subarachnoid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID_12cadc6af_subdural</td>\n",
+       "      <td>0</td>\n",
+       "      <td>ID_12cadc6af</td>\n",
+       "      <td>subdural</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              ID  probability           fid             label\n",
+       "0          ID_12cadc6af_epidural            0  ID_12cadc6af          epidural\n",
+       "1  ID_12cadc6af_intraparenchymal            0  ID_12cadc6af  intraparenchymal\n",
+       "2  ID_12cadc6af_intraventricular            0  ID_12cadc6af  intraventricular\n",
+       "3      ID_12cadc6af_subarachnoid            0  ID_12cadc6af      subarachnoid\n",
+       "4          ID_12cadc6af_subdural            0  ID_12cadc6af          subdural"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Remove duplicate rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4516842, 4)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train.drop_duplicates('ID', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4516818, 4)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Remove corrupted image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train = df_train[df_train.fid != 'ID_6431af929'] # ID_6431af929"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4516812, 4)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create pivot table with diagnostic labels as columns\n",
+    "Generates:\n",
+    "* `train_diags.csv` (previously named `train_pivot.csv`)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_diags = df_train.pivot(index='fid', columns='label', values='probability')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>label</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>fid</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>ID_000012eaf</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_000039fa0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_00005679d</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_00008ce3c</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID_0000950d7</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "label         any  epidural  intraparenchymal  intraventricular  subarachnoid  \\\n",
+       "fid                                                                             \n",
+       "ID_000012eaf    0         0                 0                 0             0   \n",
+       "ID_000039fa0    0         0                 0                 0             0   \n",
+       "ID_00005679d    0         0                 0                 0             0   \n",
+       "ID_00008ce3c    0         0                 0                 0             0   \n",
+       "ID_0000950d7    0         0                 0                 0             0   \n",
+       "\n",
+       "label         subdural  \n",
+       "fid                     \n",
+       "ID_000012eaf         0  \n",
+       "ID_000039fa0         0  \n",
+       "ID_00005679d         0  \n",
+       "ID_00008ce3c         0  \n",
+       "ID_0000950d7         0  "
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_diags.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(752802, 6)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_diags.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_diags.reset_index(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>label</th>\n",
+       "      <th>fid</th>\n",
+       "      <th>any</th>\n",
+       "      <th>epidural</th>\n",
+       "      <th>intraparenchymal</th>\n",
+       "      <th>intraventricular</th>\n",
+       "      <th>subarachnoid</th>\n",
+       "      <th>subdural</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID_000012eaf</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ID_000039fa0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ID_00005679d</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ID_00008ce3c</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID_0000950d7</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "label           fid  any  epidural  intraparenchymal  intraventricular  \\\n",
+       "0      ID_000012eaf    0         0                 0                 0   \n",
+       "1      ID_000039fa0    0         0                 0                 0   \n",
+       "2      ID_00005679d    0         0                 0                 0   \n",
+       "3      ID_00008ce3c    0         0                 0                 0   \n",
+       "4      ID_0000950d7    0         0                 0                 0   \n",
+       "\n",
+       "label  subarachnoid  subdural  \n",
+       "0                 0         0  \n",
+       "1                 0         0  \n",
+       "2                 0         0  \n",
+       "3                 0         0  \n",
+       "4                 0         0  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_diags.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(752802, 7)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_diags.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate fastai-ready CSV mapping image file (.png) -> labels\n",
+    "This is needed for early experiments that worked with the .png dataset.\n",
+    "\n",
+    "Generates:\n",
+    "* `train_labels_as_strings.csv`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "# Seed every file id with an empty list so images with no positive label still get an entry.\n",
+    "d = defaultdict(list, ((fid, []) for fid in df_train.fid.unique()))\n",
+    "\n",
+    "# Per file id, collect the name of every label whose probability is truthy (non-zero).\n",
+    "for row in df_train.itertuples(index=False):\n",
+    "    if row.probability:\n",
+    "        d[row.fid].append(row.label)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parallel lists: file ids and their space-separated positive labels ('' when there are none).\n",
+    "ks = list(d)\n",
+    "vs = [' '.join(labels) for labels in d.values()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(752802, 2)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fastai_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fastai_df['fn'] += '.png'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>fn</th>\n",
+       "      <th>labels</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID_12cadc6af.png</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ID_38fd7baa0.png</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ID_6c5d82413.png</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ID_aec8e68b3.png</td>\n",
+       "      <td>subarachnoid any</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ID_4d9209c7c.png</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 fn            labels\n",
+       "0  ID_12cadc6af.png                  \n",
+       "1  ID_38fd7baa0.png                  \n",
+       "2  ID_6c5d82413.png                  \n",
+       "3  ID_aec8e68b3.png  subarachnoid any\n",
+       "4  ID_4d9209c7c.png                  "
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fastai_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tabulate dicom data\n",
+    "Generates:\n",
+    "* `train_dicom.csv`\n",
+    "* `test_dicom.csv`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', \n",
+    "               'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', \n",
+    "               'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', \n",
+    "               'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', \n",
+    "               'StudyInstanceUID', 'WindowCenter', 'WindowWidth']\n",
+    "useless_cols = [ 'PixelData' ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f = train_dcm_list[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0008, 0018) SOP Instance UID                    UI: ID_000012eaf\n",
+       "(0008, 0060) Modality                            CS: 'CT'\n",
+       "(0010, 0020) Patient ID                          LO: 'ID_f15c0eee'\n",
+       "(0020, 000d) Study Instance UID                  UI: ID_30ea2b02d4\n",
+       "(0020, 000e) Series Instance UID                 UI: ID_0ab5820b2a\n",
+       "(0020, 0010) Study ID                            SH: ''\n",
+       "(0020, 0032) Image Position (Patient)            DS: ['-125.000000', '-115.897980', '77.970825']\n",
+       "(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']\n",
+       "(0028, 0002) Samples per Pixel                   US: 1\n",
+       "(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'\n",
+       "(0028, 0010) Rows                                US: 512\n",
+       "(0028, 0011) Columns                             US: 512\n",
+       "(0028, 0030) Pixel Spacing                       DS: ['0.488281', '0.488281']\n",
+       "(0028, 0100) Bits Allocated                      US: 16\n",
+       "(0028, 0101) Bits Stored                         US: 16\n",
+       "(0028, 0102) High Bit                            US: 15\n",
+       "(0028, 0103) Pixel Representation                US: 1\n",
+       "(0028, 1050) Window Center                       DS: \"30\"\n",
+       "(0028, 1051) Window Width                        DS: \"80\"\n",
+       "(0028, 1052) Rescale Intercept                   DS: \"-1024\"\n",
+       "(0028, 1053) Rescale Slope                       DS: \"1\"\n",
+       "(7fe0, 0010) Pixel Data                          OW: Array of 524288 elements"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dicom = pydicom.dcmread(str(f))\n",
+    "dicom"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ID_f15c0eee'"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dicom.data_element('PatientID').value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['-125.000000', '-115.897980', '77.970825']"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ipp = dicom.data_element('ImagePositionPatient').value\n",
+    "ipp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"-125.000000\""
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ipp[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "pydicom.multival.MultiValue"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(ipp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tabulate_dicom_data(file_list):\n",
+    "    \"\"\"Read the header tags of every DICOM file in `file_list` into flat dicts.\n",
+    "\n",
+    "    For each tag name in the notebook-level `cols_i_want`, a single-valued element is\n",
+    "    stored under the tag name itself, while a multi-valued element is expanded into\n",
+    "    `<tag>_0`, `<tag>_1`, ... (one key per component). The same tag can therefore\n",
+    "    produce different keys for different files (e.g. `WindowCenter` vs `WindowCenter_0`).\n",
+    "\n",
+    "    Returns a list with one dict per file, in the order of `file_list`.\n",
+    "    \"\"\"\n",
+    "    l = []\n",
+    "    for f in file_list:\n",
+    "        # Only header tags are tabulated, so skip reading the (large) pixel data.\n",
+    "        dicom = pydicom.dcmread(str(f), stop_before_pixels=True)\n",
+    "        d = {}\n",
+    "        for s in cols_i_want:\n",
+    "            v = dicom.data_element(s).value\n",
+    "            if isinstance(v, pydicom.multival.MultiValue):\n",
+    "                # Flatten multi-valued elements into one column per component.\n",
+    "                for i, component in enumerate(v):\n",
+    "                    d[s + '_' + str(i)] = component\n",
+    "            else:\n",
+    "                d[s] = v\n",
+    "        l.append(d)\n",
+    "\n",
+    "    return l\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the file list into 32 chunks and tabulate each chunk in its own worker process;\n",
+    "# the per-chunk lists are concatenated back in chunk order.\n",
+    "with ProcessPoolExecutor(max_workers=32) as executor:\n",
+    "    chunks = np.array_split(train_dcm_list, 32)\n",
+    "    l = list(itertools.chain.from_iterable(executor.map(tabulate_dicom_data, chunks)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train_dicom = pd.DataFrame(l)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>BitsAllocated</th>\n",
+       "      <th>BitsStored</th>\n",
+       "      <th>Columns</th>\n",
+       "      <th>HighBit</th>\n",
+       "      <th>ImageOrientationPatient_0</th>\n",
+       "      <th>ImageOrientationPatient_1</th>\n",
+       "      <th>ImageOrientationPatient_2</th>\n",
+       "      <th>ImageOrientationPatient_3</th>\n",
+       "      <th>ImageOrientationPatient_4</th>\n",
+       "      <th>ImageOrientationPatient_5</th>\n",
+       "      <th>...</th>\n",
+       "      <th>SamplesPerPixel</th>\n",
+       "      <th>SeriesInstanceUID</th>\n",
+       "      <th>StudyID</th>\n",
+       "      <th>StudyInstanceUID</th>\n",
+       "      <th>WindowCenter</th>\n",
+       "      <th>WindowCenter_0</th>\n",
+       "      <th>WindowCenter_1</th>\n",
+       "      <th>WindowWidth</th>\n",
+       "      <th>WindowWidth_0</th>\n",
+       "      <th>WindowWidth_1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.927184</td>\n",
+       "      <td>-0.374607</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_0ab5820b2a</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_30ea2b02d4</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.968148</td>\n",
+       "      <td>-0.250380</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_5f8484c3e0</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_134d398b61</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_203cd6ec46</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_b5c26cda09</td>\n",
+       "      <td>50.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>16</td>\n",
+       "      <td>12</td>\n",
+       "      <td>512</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.994522</td>\n",
+       "      <td>0.104528</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_3780d48b28</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_974735bf79</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>80.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_84296c3845</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_8881b1c4b1</td>\n",
+       "      <td>35.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>135.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 33 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
+       "0             16          16      512       15                        1.0   \n",
+       "1             16          16      512       15                        1.0   \n",
+       "2             16          16      512       15                        1.0   \n",
+       "3             16          12      512       11                        1.0   \n",
+       "4             16          16      512       15                        1.0   \n",
+       "\n",
+       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
+       "0                        0.0                        0.0   \n",
+       "1                        0.0                        0.0   \n",
+       "2                        0.0                        0.0   \n",
+       "3                        0.0                        0.0   \n",
+       "4                        0.0                        0.0   \n",
+       "\n",
+       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
+       "0                        0.0                   0.927184   \n",
+       "1                        0.0                   0.968148   \n",
+       "2                        0.0                   1.000000   \n",
+       "3                        0.0                   0.994522   \n",
+       "4                        0.0                   1.000000   \n",
+       "\n",
+       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
+       "0                  -0.374607  ...                1      ID_0ab5820b2a   \n",
+       "1                  -0.250380  ...                1      ID_5f8484c3e0   \n",
+       "2                   0.000000  ...                1      ID_203cd6ec46   \n",
+       "3                   0.104528  ...                1      ID_3780d48b28   \n",
+       "4                   0.000000  ...                1      ID_84296c3845   \n",
+       "\n",
+       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
+       "0             ID_30ea2b02d4         30.0            NaN             NaN   \n",
+       "1             ID_134d398b61         30.0            NaN             NaN   \n",
+       "2             ID_b5c26cda09         50.0            NaN             NaN   \n",
+       "3             ID_974735bf79          NaN           40.0            40.0   \n",
+       "4             ID_8881b1c4b1         35.0            NaN             NaN   \n",
+       "\n",
+       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
+       "0         80.0            NaN            NaN  \n",
+       "1         80.0            NaN            NaN  \n",
+       "2        100.0            NaN            NaN  \n",
+       "3          NaN           80.0           80.0  \n",
+       "4        135.0            NaN            NaN  \n",
+       "\n",
+       "[5 rows x 33 columns]"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train_dicom.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the tabulated train DICOM frame; the DataFrame index is written too (pandas default).\n",
+    "train_dicom_csv = f'data/{stage}_train_dicom.csv'\n",
+    "df_train_dicom.to_csv(train_dicom_csv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tabulate the test DICOM files in parallel: the path list is cut into 32 chunks for a pool of 32 workers.\n",
+    "test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))\n",
+    "test_dcm_chunks = np.array_split(test_dcm_list, 32)\n",
+    "with ProcessPoolExecutor(max_workers=32) as pool:\n",
+    "    rows_per_chunk = pool.map(tabulate_dicom_data, test_dcm_chunks)\n",
+    "    # Flatten the per-chunk results (map() yields them in chunk order) into the single list `l`.\n",
+    "    l = list(itertools.chain.from_iterable(rows_per_chunk))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>BitsAllocated</th>\n",
+       "      <th>BitsStored</th>\n",
+       "      <th>Columns</th>\n",
+       "      <th>HighBit</th>\n",
+       "      <th>ImageOrientationPatient_0</th>\n",
+       "      <th>ImageOrientationPatient_1</th>\n",
+       "      <th>ImageOrientationPatient_2</th>\n",
+       "      <th>ImageOrientationPatient_3</th>\n",
+       "      <th>ImageOrientationPatient_4</th>\n",
+       "      <th>ImageOrientationPatient_5</th>\n",
+       "      <th>...</th>\n",
+       "      <th>SamplesPerPixel</th>\n",
+       "      <th>SeriesInstanceUID</th>\n",
+       "      <th>StudyID</th>\n",
+       "      <th>StudyInstanceUID</th>\n",
+       "      <th>WindowCenter</th>\n",
+       "      <th>WindowCenter_0</th>\n",
+       "      <th>WindowCenter_1</th>\n",
+       "      <th>WindowWidth</th>\n",
+       "      <th>WindowWidth_0</th>\n",
+       "      <th>WindowWidth_1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>16</td>\n",
+       "      <td>12</td>\n",
+       "      <td>512</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.981627</td>\n",
+       "      <td>-0.190809</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_4d28912ba6</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_1f6d1e8aeb</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>80.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.987688</td>\n",
+       "      <td>-0.156434</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_acabdeee86</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_4a8d7ec19f</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.927184</td>\n",
+       "      <td>-0.374607</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_d00cee7f0c</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_a6ca244172</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>16</td>\n",
+       "      <td>16</td>\n",
+       "      <td>512</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.986286</td>\n",
+       "      <td>-0.165048</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_a52a0112d5</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_fa950a03af</td>\n",
+       "      <td>30.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>16</td>\n",
+       "      <td>12</td>\n",
+       "      <td>512</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ID_f552d3b922</td>\n",
+       "      <td></td>\n",
+       "      <td>ID_965d8b3d8e</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>36.0</td>\n",
+       "      <td>36.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>80.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 33 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
+       "0             16          12      512       11                        1.0   \n",
+       "1             16          16      512       15                        1.0   \n",
+       "2             16          16      512       15                        1.0   \n",
+       "3             16          16      512       15                        1.0   \n",
+       "4             16          12      512       11                        1.0   \n",
+       "\n",
+       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
+       "0                        0.0                        0.0   \n",
+       "1                        0.0                        0.0   \n",
+       "2                        0.0                        0.0   \n",
+       "3                        0.0                        0.0   \n",
+       "4                        0.0                        0.0   \n",
+       "\n",
+       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
+       "0                        0.0                   0.981627   \n",
+       "1                        0.0                   0.987688   \n",
+       "2                        0.0                   0.927184   \n",
+       "3                        0.0                   0.986286   \n",
+       "4                        0.0                   1.000000   \n",
+       "\n",
+       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
+       "0                  -0.190809  ...                1      ID_4d28912ba6   \n",
+       "1                  -0.156434  ...                1      ID_acabdeee86   \n",
+       "2                  -0.374607  ...                1      ID_d00cee7f0c   \n",
+       "3                  -0.165048  ...                1      ID_a52a0112d5   \n",
+       "4                   0.000000  ...                1      ID_f552d3b922   \n",
+       "\n",
+       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
+       "0             ID_1f6d1e8aeb          NaN           40.0            40.0   \n",
+       "1             ID_4a8d7ec19f         30.0            NaN             NaN   \n",
+       "2             ID_a6ca244172         30.0            NaN             NaN   \n",
+       "3             ID_fa950a03af         30.0            NaN             NaN   \n",
+       "4             ID_965d8b3d8e          NaN           36.0            36.0   \n",
+       "\n",
+       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
+       "0          NaN           80.0           80.0  \n",
+       "1         80.0            NaN            NaN  \n",
+       "2         80.0            NaN            NaN  \n",
+       "3         80.0            NaN            NaN  \n",
+       "4          NaN           80.0           80.0  \n",
+       "\n",
+       "[5 rows x 33 columns]"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Build the test DICOM frame from the collected records in `l`, then preview its first five rows.\n",
+    "df_test_dicom = pd.DataFrame(data=l)\n",
+    "df_test_dicom.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the tabulated test DICOM frame; the DataFrame index is written too (pandas default).\n",
+    "test_dicom_csv = f'data/{stage}_test_dicom.csv'\n",
+    "df_test_dicom.to_csv(test_dicom_csv)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Add labels to the train dicom csv\n",
+    "Generates:\n",
+    "* `data/{stage}_train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left join: keep every train DICOM row and attach the df_diags row whose `fid` equals its SOPInstanceUID.\n",
+    "df_train_dicom_diags = df_train_dicom.merge(\n",
+    "    df_diags,\n",
+    "    how='left',\n",
+    "    left_on=['SOPInstanceUID'],\n",
+    "    right_on=['fid'],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Row counts must agree: a duplicated `fid` would make the left join grow, a size mismatch means missing inputs.\n",
+    "assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)\n",
+    "# Equal lengths alone do not prove the join matched: an unmatched DICOM row is still one row, just with NaN labels.\n",
+    "# reset_index() makes `fid` reachable whether df_diags carries it as a column or as its index.\n",
+    "assert df_train_dicom['SOPInstanceUID'].isin(df_diags.reset_index()['fid']).all(), 'some train DICOMs have no labels in df_diags'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the label-augmented train DICOM frame; the DataFrame index is written too (pandas default).\n",
+    "train_dicom_diags_csv = f'data/{stage}_train_dicom_diags.csv'\n",
+    "df_train_dicom_diags.to_csv(train_dicom_diags_csv)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}