--- a +++ b/2-preprocess-pickle.ipynb @@ -0,0 +1,1276 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "from collections import defaultdict\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "stage = \"stage_2\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(f\"data/{stage}_train_dicom_diags.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>BitsAllocated</th>\n", + " <th>BitsStored</th>\n", + " <th>Columns</th>\n", + " <th>HighBit</th>\n", + " <th>ImageOrientationPatient_0</th>\n", + " <th>ImageOrientationPatient_1</th>\n", + " <th>ImageOrientationPatient_2</th>\n", + " <th>ImageOrientationPatient_3</th>\n", + " <th>ImageOrientationPatient_4</th>\n", + " <th>...</th>\n", + " <th>WindowWidth</th>\n", + " <th>WindowWidth_0</th>\n", + " <th>WindowWidth_1</th>\n", + " <th>fid</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.927184</td>\n", + " <td>...</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>ID_000012eaf</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.968148</td>\n", + " <td>...</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>ID_000039fa0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>...</td>\n", + " <td>100.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>ID_00005679d</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.994522</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " <td>ID_00008ce3c</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>...</td>\n", + " <td>135.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>ID_0000950d7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 41 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 BitsAllocated BitsStored Columns HighBit \\\n", + "0 0 16 16 512 15 \n", + "1 1 16 16 512 15 \n", + "2 2 16 16 512 15 \n", + "3 3 16 12 512 11 \n", + "4 4 16 16 512 15 \n", + "\n", + " ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n", + "0 1.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.0 \n", + "\n", + " ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " ImageOrientationPatient_4 ... WindowWidth WindowWidth_0 WindowWidth_1 \\\n", + "0 0.927184 ... 80.0 NaN NaN \n", + "1 0.968148 ... 80.0 NaN NaN \n", + "2 1.000000 ... 100.0 NaN NaN \n", + "3 0.994522 ... NaN 80.0 80.0 \n", + "4 1.000000 ... 135.0 NaN NaN \n", + "\n", + " fid any epidural intraparenchymal intraventricular subarachnoid \\\n", + "0 ID_000012eaf 0 0 0 0 0 \n", + "1 ID_000039fa0 0 0 0 0 0 \n", + "2 ID_00005679d 0 0 0 0 0 \n", + "3 ID_00008ce3c 0 0 0 0 0 \n", + "4 ID_0000950d7 0 0 0 0 0 \n", + "\n", + " subdural \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 41 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21744" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort, then group by (order is preserved within groups)\n", + "gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n", + "len(gs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ImagePositionPatient_2</th>\n", + " <th>fid</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>577964</th>\n", + " <td>193.542489</td>\n", + " <td>ID_c45659d3d</td>\n", + " </tr>\n", + " <tr>\n", + " <th>229790</th>\n", + " <td>198.214051</td>\n", + " <td>ID_4e0bdd2ba</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22395</th>\n", + " <td>202.885613</td>\n", + " <td>ID_079945c27</td>\n", + " </tr>\n", + " <tr>\n", + " <th>746126</th>\n", + " <td>207.557174</td>\n", + " <td>ID_fdbfb2c17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>253266</th>\n", + " <td>212.228736</td>\n", + " <td>ID_55f7bbbf2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ImagePositionPatient_2 fid\n", + "577964 193.542489 ID_c45659d3d\n", + "229790 198.214051 ID_4e0bdd2ba\n", + "22395 202.885613 ID_079945c27\n", + "746126 207.557174 ID_fdbfb2c17\n", + "253266 212.228736 ID_55f7bbbf2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# see if it worked\n", + "gs.get_group('ID_fa19cd5ea9')[['ImagePositionPatient_2', 'fid']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "g = gs.get_group('ID_fa19cd5ea9')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "subg = g[['SeriesInstanceUID', 'fid', 'any', 'epidural', \n", + " 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>SeriesInstanceUID</th>\n", + " <th>fid</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>577964</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_c45659d3d</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>229790</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_4e0bdd2ba</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22395</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_079945c27</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>746126</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_fdbfb2c17</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>253266</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_55f7bbbf2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>549211</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_ba7080372</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>592856</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_c964e4096</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>183149</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_3e31d57d0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>306771</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_680b2194c</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>540358</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_b76b13444</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>645217</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_db48a633d</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>270974</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_5bf2ca43f</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>672814</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_e4b636907</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>350834</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_7714ead69</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>749886</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_ff012ee5b</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>523978</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_b1cea5abb</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>464942</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_9dad2eb09</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>229881</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_4e14d0fe8</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>186237</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_3f422852d</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>599624</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_cbbb50e6d</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>347055</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_75cbdae68</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>359450</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_7a02fdbea</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>127205</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_2b3671dd9</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>148587</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_3274f5977</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>413641</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_8c5fc9e44</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>688538</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_ea2861e9a</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>318670</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_6c19c9f7b</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>630472</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_d6435f3bf</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>202656</th>\n", + " <td>ID_fa19cd5ea9</td>\n", + " <td>ID_44d57858e</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " SeriesInstanceUID fid any epidural intraparenchymal \\\n", + "577964 ID_fa19cd5ea9 ID_c45659d3d 0 0 0 \n", + "229790 ID_fa19cd5ea9 ID_4e0bdd2ba 0 0 0 \n", + "22395 ID_fa19cd5ea9 ID_079945c27 1 0 0 \n", + "746126 ID_fa19cd5ea9 ID_fdbfb2c17 1 0 0 \n", + "253266 ID_fa19cd5ea9 ID_55f7bbbf2 1 0 0 \n", + "549211 ID_fa19cd5ea9 ID_ba7080372 1 0 0 \n", + "592856 ID_fa19cd5ea9 ID_c964e4096 1 0 0 \n", + "183149 ID_fa19cd5ea9 ID_3e31d57d0 1 0 0 \n", + "306771 ID_fa19cd5ea9 ID_680b2194c 1 0 0 \n", + "540358 ID_fa19cd5ea9 ID_b76b13444 1 0 0 \n", + "645217 ID_fa19cd5ea9 ID_db48a633d 1 0 0 \n", + "270974 ID_fa19cd5ea9 ID_5bf2ca43f 1 0 0 \n", + "672814 ID_fa19cd5ea9 ID_e4b636907 1 0 0 \n", + "350834 ID_fa19cd5ea9 ID_7714ead69 1 0 0 \n", + "749886 ID_fa19cd5ea9 ID_ff012ee5b 1 0 0 \n", + "523978 ID_fa19cd5ea9 ID_b1cea5abb 1 0 0 \n", + "464942 ID_fa19cd5ea9 ID_9dad2eb09 1 0 0 \n", + "229881 ID_fa19cd5ea9 ID_4e14d0fe8 1 0 0 \n", + "186237 ID_fa19cd5ea9 ID_3f422852d 1 0 0 \n", + "599624 ID_fa19cd5ea9 ID_cbbb50e6d 1 0 0 \n", + "347055 ID_fa19cd5ea9 ID_75cbdae68 1 0 0 \n", + "359450 ID_fa19cd5ea9 ID_7a02fdbea 1 0 0 \n", + "127205 ID_fa19cd5ea9 ID_2b3671dd9 1 0 0 \n", + "148587 ID_fa19cd5ea9 ID_3274f5977 0 0 0 \n", + "413641 ID_fa19cd5ea9 ID_8c5fc9e44 0 0 0 \n", + "688538 ID_fa19cd5ea9 ID_ea2861e9a 0 0 0 \n", + "318670 ID_fa19cd5ea9 ID_6c19c9f7b 0 0 0 \n", + "630472 ID_fa19cd5ea9 ID_d6435f3bf 0 0 0 \n", + "202656 ID_fa19cd5ea9 ID_44d57858e 0 0 0 \n", + "\n", + " intraventricular subarachnoid subdural \n", + "577964 0 0 0 \n", + "229790 0 0 0 \n", + "22395 0 0 1 \n", + "746126 0 0 1 \n", + "253266 0 0 1 \n", + "549211 0 0 1 \n", + "592856 0 0 1 \n", + "183149 0 0 1 \n", + "306771 0 0 1 \n", + "540358 0 0 1 \n", + "645217 0 0 1 \n", + "270974 0 0 1 \n", + "672814 0 0 1 \n", + "350834 0 0 1 \n", + "749886 0 0 1 \n", + "523978 0 0 1 \n", + "464942 0 0 1 \n", + "229881 0 0 1 \n", + "186237 0 0 1 \n", + "599624 0 0 1 \n", + "347055 0 0 1 \n", + "359450 0 0 1 \n", + "127205 0 0 1 \n", + "148587 0 0 0 \n", + "413641 0 0 0 \n", + "688538 0 0 0 \n", + "318670 0 0 0 \n", + "630472 0 0 0 \n", + "202656 0 0 0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subg" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# You can use a btrfs snapshot and rename files by study_id and z-pos through the brain\n", + "def rename_train_group(subg):\n", + " ix = 0\n", + " total = len(subg)\n", + " for index, row in subg.iterrows():\n", + " cur_fn = row['fid']\n", + " new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{row['any']}_{row['epidural']}_{row['intraparenchymal']}_{row['intraventricular']}_{row['subarachnoid']}_{row['subdural']}_{cur_fn}\"\n", + " ix += 1\n", + " Path(f'data/unzip_renamed/{stage}_train_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_train_images/{new_fn}.dcm')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def index_group(subg, study_ix_to_fn, fn_to_study_ix):\n", + " ix = 0\n", + " for index, row in subg.iterrows():\n", + " fn = row['SOPInstanceUID']\n", + " study = row['SeriesInstanceUID']\n", + " study_ix_to_fn[study].append(fn)\n", + " fn_to_study_ix[fn] = (study, ix)\n", + " ix += 1 " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "labels = [ 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural' ]\n", + "\n", + "def label_group(subg, fn_to_labels):\n", + " for index, row in subg.iterrows():\n", + " fn = row['SOPInstanceUID']\n", + " fn_to_labels[fn] = [ label for label in labels if row[label] == 1 ]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_study_ix_to_fn = defaultdict(list)\n", + "train_fn_to_study_ix = {}\n", + "train_fn_to_labels = {}\n", + "\n", + "for name, subg in gs:\n", + " #rename_train_group(subg)\n", + " index_group(subg, train_study_ix_to_fn, train_fn_to_study_ix)\n", + " label_group(subg, train_fn_to_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not pickle yet, merge with test\n", + "pickle.dump(train_study_ix_to_fn, open(f\"data/{stage}_train_study_ix_to_fn.pickle\", \"wb\" ))\n", + "pickle.dump(train_fn_to_study_ix, open(f\"data/{stage}_train_fn_to_study_ix.pickle\", \"wb\" ))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['any', 'subdural']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_fn_to_labels['ID_079945c27']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "pickle.dump(train_fn_to_labels, open(f\"data/{stage}_train_fn_to_labels.pickle\", 'wb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(f\"data/{stage}_test_dicom.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>BitsAllocated</th>\n", + " <th>BitsStored</th>\n", + " <th>Columns</th>\n", + " <th>HighBit</th>\n", + " <th>ImageOrientationPatient_0</th>\n", + " <th>ImageOrientationPatient_1</th>\n", + " <th>ImageOrientationPatient_2</th>\n", + " <th>ImageOrientationPatient_3</th>\n", + " <th>ImageOrientationPatient_4</th>\n", + " <th>...</th>\n", + " <th>SamplesPerPixel</th>\n", + " <th>SeriesInstanceUID</th>\n", + " <th>StudyID</th>\n", + " <th>StudyInstanceUID</th>\n", + " <th>WindowCenter</th>\n", + " <th>WindowCenter_0</th>\n", + " <th>WindowCenter_1</th>\n", + " <th>WindowWidth</th>\n", + " <th>WindowWidth_0</th>\n", + " <th>WindowWidth_1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.981627</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_4d28912ba6</td>\n", + " <td>NaN</td>\n", + " <td>ID_1f6d1e8aeb</td>\n", + " <td>NaN</td>\n", + " <td>40.0</td>\n", + " <td>40.0</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.987688</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_acabdeee86</td>\n", + " <td>NaN</td>\n", + " <td>ID_4a8d7ec19f</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.927184</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_d00cee7f0c</td>\n", + " <td>NaN</td>\n", + " <td>ID_a6ca244172</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.986286</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_a52a0112d5</td>\n", + " <td>NaN</td>\n", + " <td>ID_fa950a03af</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_f552d3b922</td>\n", + " <td>NaN</td>\n", + " <td>ID_965d8b3d8e</td>\n", + " <td>NaN</td>\n", + " <td>36.0</td>\n", + " <td>36.0</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 34 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 BitsAllocated BitsStored Columns HighBit \\\n", + "0 0 16 12 512 11 \n", + "1 1 16 16 512 15 \n", + "2 2 16 16 512 15 \n", + "3 3 16 16 512 15 \n", + "4 4 16 12 512 11 \n", + "\n", + " ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n", + "0 1.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.0 \n", + "\n", + " ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " ImageOrientationPatient_4 ... SamplesPerPixel SeriesInstanceUID \\\n", + "0 0.981627 ... 1 ID_4d28912ba6 \n", + "1 0.987688 ... 1 ID_acabdeee86 \n", + "2 0.927184 ... 1 ID_d00cee7f0c \n", + "3 0.986286 ... 1 ID_a52a0112d5 \n", + "4 1.000000 ... 1 ID_f552d3b922 \n", + "\n", + " StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n", + "0 NaN ID_1f6d1e8aeb NaN 40.0 40.0 \n", + "1 NaN ID_4a8d7ec19f 30.0 NaN NaN \n", + "2 NaN ID_a6ca244172 30.0 NaN NaN \n", + "3 NaN ID_fa950a03af 30.0 NaN NaN \n", + "4 NaN ID_965d8b3d8e NaN 36.0 36.0 \n", + "\n", + " WindowWidth WindowWidth_0 WindowWidth_1 \n", + "0 NaN 80.0 80.0 \n", + "1 80.0 NaN NaN \n", + "2 80.0 NaN NaN \n", + "3 80.0 NaN NaN \n", + "4 NaN 80.0 80.0 \n", + "\n", + "[5 rows x 34 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3518" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sort, then group by (order is preserver within groups)\n", + "gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n", + "len(gs)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def rename_test_group(subg):\n", + " ix = 0\n", + " total = len(subg)\n", + " for index, row in subg.iterrows():\n", + " cur_fn = row['SOPInstanceUID']\n", + " new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{cur_fn}\"\n", + " ix += 1\n", + " Path(f'data/unzip_renamed/{stage}_test_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_test_images/{new_fn}.dcm')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "test_study_ix_to_fn = defaultdict(list)\n", + "test_fn_to_study_ix = {}\n", + "\n", + "for name, subg in gs:\n", + " #rename_test_group(subg)\n", + " index_group(subg, test_study_ix_to_fn, test_fn_to_study_ix)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "pickle.dump(test_study_ix_to_fn, open(f\"data/{stage}_test_study_ix_to_fn.pickle\", \"wb\" ))\n", + "pickle.dump(test_fn_to_study_ix, open(f\"data/{stage}_test_fn_to_study_ix.pickle\", \"wb\" ))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "study_ix_to_fn = { **train_study_ix_to_fn, **test_study_ix_to_fn }\n", + "fn_to_study_ix = { **train_fn_to_study_ix, **test_fn_to_study_ix }" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "pickle.dump(study_ix_to_fn, open(f\"data/{stage}_study_ix_to_fn.pickle\", \"wb\" ))\n", + "pickle.dump(fn_to_study_ix, open(f\"data/{stage}_fn_to_study_ix.pickle\", \"wb\" ))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}