[fc9ccf]: / 0-preprocess-generate_csvs.ipynb

Download this file

1597 lines (1596 with data), 45.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm_notebook\n",
    "import pydicom\n",
    "import itertools\n",
    "import numpy as np\n",
    "from concurrent.futures import ProcessPoolExecutor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "tqdm().pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read stage_X_train and split id/label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "stage = \"stage_2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rm: cannot remove 'data/unzip/stage_2_train_images/ID_6431af929.dcm': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!rm {bad_dcm_fn}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID_12cadc6af_epidural</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ID_12cadc6af_intraventricular</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ID_12cadc6af_subarachnoid</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ID_12cadc6af_subdural</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              ID  Label\n",
       "0          ID_12cadc6af_epidural      0\n",
       "1  ID_12cadc6af_intraparenchymal      0\n",
       "2  ID_12cadc6af_intraventricular      0\n",
       "3      ID_12cadc6af_subarachnoid      0\n",
       "4          ID_12cadc6af_subdural      0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['fid'] = df_train.ID.apply(lambda x: '_'.join(x.split('_')[:2]) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.columns = ['ID', 'probability', 'fid']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['label'] = df_train.ID.apply(lambda x: x.split('_')[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>probability</th>\n",
       "      <th>fid</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID_12cadc6af_epidural</td>\n",
       "      <td>0</td>\n",
       "      <td>ID_12cadc6af</td>\n",
       "      <td>epidural</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
       "      <td>0</td>\n",
       "      <td>ID_12cadc6af</td>\n",
       "      <td>intraparenchymal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ID_12cadc6af_intraventricular</td>\n",
       "      <td>0</td>\n",
       "      <td>ID_12cadc6af</td>\n",
       "      <td>intraventricular</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ID_12cadc6af_subarachnoid</td>\n",
       "      <td>0</td>\n",
       "      <td>ID_12cadc6af</td>\n",
       "      <td>subarachnoid</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ID_12cadc6af_subdural</td>\n",
       "      <td>0</td>\n",
       "      <td>ID_12cadc6af</td>\n",
       "      <td>subdural</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              ID  probability           fid             label\n",
       "0          ID_12cadc6af_epidural            0  ID_12cadc6af          epidural\n",
       "1  ID_12cadc6af_intraparenchymal            0  ID_12cadc6af  intraparenchymal\n",
       "2  ID_12cadc6af_intraventricular            0  ID_12cadc6af  intraventricular\n",
       "3      ID_12cadc6af_subarachnoid            0  ID_12cadc6af      subarachnoid\n",
       "4          ID_12cadc6af_subdural            0  ID_12cadc6af          subdural"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove dupes "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4516842, 4)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.drop_duplicates('ID', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4516818, 4)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove corrupted image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = df_train[df_train.fid != 'ID_6431af929'] # ID_6431af929"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4516812, 4)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create pivot table with diagnostic labels as columns\n",
    "Generates:\n",
    "* `train_diags.csv` (previously named `train_pivot.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_diags = df_train.pivot(index='fid', columns='label', values='probability')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>label</th>\n",
       "      <th>any</th>\n",
       "      <th>epidural</th>\n",
       "      <th>intraparenchymal</th>\n",
       "      <th>intraventricular</th>\n",
       "      <th>subarachnoid</th>\n",
       "      <th>subdural</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ID_000012eaf</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID_000039fa0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID_00005679d</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID_00008ce3c</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ID_0000950d7</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "label         any  epidural  intraparenchymal  intraventricular  subarachnoid  \\\n",
       "fid                                                                             \n",
       "ID_000012eaf    0         0                 0                 0             0   \n",
       "ID_000039fa0    0         0                 0                 0             0   \n",
       "ID_00005679d    0         0                 0                 0             0   \n",
       "ID_00008ce3c    0         0                 0                 0             0   \n",
       "ID_0000950d7    0         0                 0                 0             0   \n",
       "\n",
       "label         subdural  \n",
       "fid                     \n",
       "ID_000012eaf         0  \n",
       "ID_000039fa0         0  \n",
       "ID_00005679d         0  \n",
       "ID_00008ce3c         0  \n",
       "ID_0000950d7         0  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diags.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(752802, 6)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diags.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_diags.reset_index(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>label</th>\n",
       "      <th>fid</th>\n",
       "      <th>any</th>\n",
       "      <th>epidural</th>\n",
       "      <th>intraparenchymal</th>\n",
       "      <th>intraventricular</th>\n",
       "      <th>subarachnoid</th>\n",
       "      <th>subdural</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID_000012eaf</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ID_000039fa0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ID_00005679d</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ID_00008ce3c</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ID_0000950d7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "label           fid  any  epidural  intraparenchymal  intraventricular  \\\n",
       "0      ID_000012eaf    0         0                 0                 0   \n",
       "1      ID_000039fa0    0         0                 0                 0   \n",
       "2      ID_00005679d    0         0                 0                 0   \n",
       "3      ID_00008ce3c    0         0                 0                 0   \n",
       "4      ID_0000950d7    0         0                 0                 0   \n",
       "\n",
       "label  subarachnoid  subdural  \n",
       "0                 0         0  \n",
       "1                 0         0  \n",
       "2                 0         0  \n",
       "3                 0         0  \n",
       "4                 0         0  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diags.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(752802, 7)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diags.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate fastai-ready csv image file (.png) -> labels\n",
    "This is needed for early experiments that worked with the .png dataset.\n",
    "\n",
    "Generates:\n",
    "* `train_labels_as_strings.csv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "d = defaultdict(list)\n",
    "for fid in df_train.fid.unique(): d[fid]\n",
    "\n",
    "for tup in df_train.itertuples():\n",
    "    if tup.probability: d[tup.fid].append(tup.label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "ks, vs = [], []\n",
    "\n",
    "for k, v in d.items():\n",
    "    ks.append(k), vs.append(' '.join(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(752802, 2)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fastai_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "fastai_df['fn'] += '.png'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fn</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID_12cadc6af.png</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ID_38fd7baa0.png</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ID_6c5d82413.png</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ID_aec8e68b3.png</td>\n",
       "      <td>subarachnoid any</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ID_4d9209c7c.png</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 fn            labels\n",
       "0  ID_12cadc6af.png                  \n",
       "1  ID_38fd7baa0.png                  \n",
       "2  ID_6c5d82413.png                  \n",
       "3  ID_aec8e68b3.png  subarachnoid any\n",
       "4  ID_4d9209c7c.png                  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fastai_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tabulate dicom data\n",
    "Generates:\n",
    "* `train_dicom.csv`\n",
    "* `test_dicom.csv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', \n",
    "               'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', \n",
    "               'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', \n",
    "               'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', \n",
    "               'StudyInstanceUID', 'WindowCenter', 'WindowWidth']\n",
    "useless_cols = [ 'PixelData' ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = train_dcm_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0008, 0018) SOP Instance UID                    UI: ID_000012eaf\n",
       "(0008, 0060) Modality                            CS: 'CT'\n",
       "(0010, 0020) Patient ID                          LO: 'ID_f15c0eee'\n",
       "(0020, 000d) Study Instance UID                  UI: ID_30ea2b02d4\n",
       "(0020, 000e) Series Instance UID                 UI: ID_0ab5820b2a\n",
       "(0020, 0010) Study ID                            SH: ''\n",
       "(0020, 0032) Image Position (Patient)            DS: ['-125.000000', '-115.897980', '77.970825']\n",
       "(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']\n",
       "(0028, 0002) Samples per Pixel                   US: 1\n",
       "(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'\n",
       "(0028, 0010) Rows                                US: 512\n",
       "(0028, 0011) Columns                             US: 512\n",
       "(0028, 0030) Pixel Spacing                       DS: ['0.488281', '0.488281']\n",
       "(0028, 0100) Bits Allocated                      US: 16\n",
       "(0028, 0101) Bits Stored                         US: 16\n",
       "(0028, 0102) High Bit                            US: 15\n",
       "(0028, 0103) Pixel Representation                US: 1\n",
       "(0028, 1050) Window Center                       DS: \"30\"\n",
       "(0028, 1051) Window Width                        DS: \"80\"\n",
       "(0028, 1052) Rescale Intercept                   DS: \"-1024\"\n",
       "(0028, 1053) Rescale Slope                       DS: \"1\"\n",
       "(7fe0, 0010) Pixel Data                          OW: Array of 524288 elements"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dicom = pydicom.dcmread(str(f))\n",
    "dicom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ID_f15c0eee'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dicom.data_element('PatientID').value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['-125.000000', '-115.897980', '77.970825']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ipp = dicom.data_element('ImagePositionPatient').value\n",
    "ipp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"-125.000000\""
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ipp[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pydicom.multival.MultiValue"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(ipp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tabulate_dicom_data(file_list):\n",
    "    l = []\n",
    "    for f in file_list:\n",
    "        dicom = pydicom.dcmread(str(f))\n",
    "        d = {}\n",
    "        for s in cols_i_want:\n",
    "            v = dicom.data_element(s).value\n",
    "            if isinstance(v, pydicom.multival.MultiValue):\n",
    "                for i in range(len(v)):\n",
    "                    d[s + '_' + str(i)] = v[i]\n",
    "            else:\n",
    "                d[s] = v\n",
    "        l.append(d)\n",
    "        \n",
    "    return l\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "with ProcessPoolExecutor(max_workers=32) as e:\n",
    "     l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(train_dcm_list, 32))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_dicom = pd.DataFrame(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BitsAllocated</th>\n",
       "      <th>BitsStored</th>\n",
       "      <th>Columns</th>\n",
       "      <th>HighBit</th>\n",
       "      <th>ImageOrientationPatient_0</th>\n",
       "      <th>ImageOrientationPatient_1</th>\n",
       "      <th>ImageOrientationPatient_2</th>\n",
       "      <th>ImageOrientationPatient_3</th>\n",
       "      <th>ImageOrientationPatient_4</th>\n",
       "      <th>ImageOrientationPatient_5</th>\n",
       "      <th>...</th>\n",
       "      <th>SamplesPerPixel</th>\n",
       "      <th>SeriesInstanceUID</th>\n",
       "      <th>StudyID</th>\n",
       "      <th>StudyInstanceUID</th>\n",
       "      <th>WindowCenter</th>\n",
       "      <th>WindowCenter_0</th>\n",
       "      <th>WindowCenter_1</th>\n",
       "      <th>WindowWidth</th>\n",
       "      <th>WindowWidth_0</th>\n",
       "      <th>WindowWidth_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.927184</td>\n",
       "      <td>-0.374607</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_0ab5820b2a</td>\n",
       "      <td></td>\n",
       "      <td>ID_30ea2b02d4</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.968148</td>\n",
       "      <td>-0.250380</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_5f8484c3e0</td>\n",
       "      <td></td>\n",
       "      <td>ID_134d398b61</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_203cd6ec46</td>\n",
       "      <td></td>\n",
       "      <td>ID_b5c26cda09</td>\n",
       "      <td>50.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.994522</td>\n",
       "      <td>0.104528</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_3780d48b28</td>\n",
       "      <td></td>\n",
       "      <td>ID_974735bf79</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_84296c3845</td>\n",
       "      <td></td>\n",
       "      <td>ID_8881b1c4b1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>135.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
       "0             16          16      512       15                        1.0   \n",
       "1             16          16      512       15                        1.0   \n",
       "2             16          16      512       15                        1.0   \n",
       "3             16          12      512       11                        1.0   \n",
       "4             16          16      512       15                        1.0   \n",
       "\n",
       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
       "0                        0.0                        0.0   \n",
       "1                        0.0                        0.0   \n",
       "2                        0.0                        0.0   \n",
       "3                        0.0                        0.0   \n",
       "4                        0.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
       "0                        0.0                   0.927184   \n",
       "1                        0.0                   0.968148   \n",
       "2                        0.0                   1.000000   \n",
       "3                        0.0                   0.994522   \n",
       "4                        0.0                   1.000000   \n",
       "\n",
       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
       "0                  -0.374607  ...                1      ID_0ab5820b2a   \n",
       "1                  -0.250380  ...                1      ID_5f8484c3e0   \n",
       "2                   0.000000  ...                1      ID_203cd6ec46   \n",
       "3                   0.104528  ...                1      ID_3780d48b28   \n",
       "4                   0.000000  ...                1      ID_84296c3845   \n",
       "\n",
       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
       "0             ID_30ea2b02d4         30.0            NaN             NaN   \n",
       "1             ID_134d398b61         30.0            NaN             NaN   \n",
       "2             ID_b5c26cda09         50.0            NaN             NaN   \n",
       "3             ID_974735bf79          NaN           40.0            40.0   \n",
       "4             ID_8881b1c4b1         35.0            NaN             NaN   \n",
       "\n",
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
       "0         80.0            NaN            NaN  \n",
       "1         80.0            NaN            NaN  \n",
       "2        100.0            NaN            NaN  \n",
       "3          NaN           80.0           80.0  \n",
       "4        135.0            NaN            NaN  \n",
       "\n",
       "[5 rows x 33 columns]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_dicom.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_dicom.to_csv(f'data/{stage}_train_dicom.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))\n",
    "with ProcessPoolExecutor(max_workers=32) as e:\n",
    "     l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(test_dcm_list, 32))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BitsAllocated</th>\n",
       "      <th>BitsStored</th>\n",
       "      <th>Columns</th>\n",
       "      <th>HighBit</th>\n",
       "      <th>ImageOrientationPatient_0</th>\n",
       "      <th>ImageOrientationPatient_1</th>\n",
       "      <th>ImageOrientationPatient_2</th>\n",
       "      <th>ImageOrientationPatient_3</th>\n",
       "      <th>ImageOrientationPatient_4</th>\n",
       "      <th>ImageOrientationPatient_5</th>\n",
       "      <th>...</th>\n",
       "      <th>SamplesPerPixel</th>\n",
       "      <th>SeriesInstanceUID</th>\n",
       "      <th>StudyID</th>\n",
       "      <th>StudyInstanceUID</th>\n",
       "      <th>WindowCenter</th>\n",
       "      <th>WindowCenter_0</th>\n",
       "      <th>WindowCenter_1</th>\n",
       "      <th>WindowWidth</th>\n",
       "      <th>WindowWidth_0</th>\n",
       "      <th>WindowWidth_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.981627</td>\n",
       "      <td>-0.190809</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_4d28912ba6</td>\n",
       "      <td></td>\n",
       "      <td>ID_1f6d1e8aeb</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.987688</td>\n",
       "      <td>-0.156434</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_acabdeee86</td>\n",
       "      <td></td>\n",
       "      <td>ID_4a8d7ec19f</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.927184</td>\n",
       "      <td>-0.374607</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_d00cee7f0c</td>\n",
       "      <td></td>\n",
       "      <td>ID_a6ca244172</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>512</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.986286</td>\n",
       "      <td>-0.165048</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_a52a0112d5</td>\n",
       "      <td></td>\n",
       "      <td>ID_fa950a03af</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>12</td>\n",
       "      <td>512</td>\n",
       "      <td>11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>ID_f552d3b922</td>\n",
       "      <td></td>\n",
       "      <td>ID_965d8b3d8e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>80.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
       "0             16          12      512       11                        1.0   \n",
       "1             16          16      512       15                        1.0   \n",
       "2             16          16      512       15                        1.0   \n",
       "3             16          16      512       15                        1.0   \n",
       "4             16          12      512       11                        1.0   \n",
       "\n",
       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
       "0                        0.0                        0.0   \n",
       "1                        0.0                        0.0   \n",
       "2                        0.0                        0.0   \n",
       "3                        0.0                        0.0   \n",
       "4                        0.0                        0.0   \n",
       "\n",
       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
       "0                        0.0                   0.981627   \n",
       "1                        0.0                   0.987688   \n",
       "2                        0.0                   0.927184   \n",
       "3                        0.0                   0.986286   \n",
       "4                        0.0                   1.000000   \n",
       "\n",
       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
       "0                  -0.190809  ...                1      ID_4d28912ba6   \n",
       "1                  -0.156434  ...                1      ID_acabdeee86   \n",
       "2                  -0.374607  ...                1      ID_d00cee7f0c   \n",
       "3                  -0.165048  ...                1      ID_a52a0112d5   \n",
       "4                   0.000000  ...                1      ID_f552d3b922   \n",
       "\n",
       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
       "0             ID_1f6d1e8aeb          NaN           40.0            40.0   \n",
       "1             ID_4a8d7ec19f         30.0            NaN             NaN   \n",
       "2             ID_a6ca244172         30.0            NaN             NaN   \n",
       "3             ID_fa950a03af         30.0            NaN             NaN   \n",
       "4             ID_965d8b3d8e          NaN           36.0            36.0   \n",
       "\n",
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
       "0          NaN           80.0           80.0  \n",
       "1         80.0            NaN            NaN  \n",
       "2         80.0            NaN            NaN  \n",
       "3         80.0            NaN            NaN  \n",
       "4          NaN           80.0           80.0  \n",
       "\n",
       "[5 rows x 33 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_dicom = pd.DataFrame(l)\n",
    "df_test_dicom.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_dicom.to_csv(f'data/{stage}_test_dicom.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Add labels to the train dicom csv\n",
    "Generates:\n",
    "* `train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_dicom_diags = pd.merge(df_train_dicom, df_diags,  how='left', left_on=['SOPInstanceUID'], right_on = ['fid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_dicom_diags.to_csv(f'data/{stage}_train_dicom_diags.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}