1277 lines (1276 with data), 41.9 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from collections import defaultdict\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"stage = \"stage_2\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"data/{stage}_train_dicom_diags.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>BitsAllocated</th>\n",
" <th>BitsStored</th>\n",
" <th>Columns</th>\n",
" <th>HighBit</th>\n",
" <th>ImageOrientationPatient_0</th>\n",
" <th>ImageOrientationPatient_1</th>\n",
" <th>ImageOrientationPatient_2</th>\n",
" <th>ImageOrientationPatient_3</th>\n",
" <th>ImageOrientationPatient_4</th>\n",
" <th>...</th>\n",
" <th>WindowWidth</th>\n",
" <th>WindowWidth_0</th>\n",
" <th>WindowWidth_1</th>\n",
" <th>fid</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.927184</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ID_000012eaf</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.968148</td>\n",
" <td>...</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ID_000039fa0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>...</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ID_00005679d</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.994522</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" <td>ID_00008ce3c</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>...</td>\n",
" <td>135.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ID_0000950d7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 41 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 BitsAllocated BitsStored Columns HighBit \\\n",
"0 0 16 16 512 15 \n",
"1 1 16 16 512 15 \n",
"2 2 16 16 512 15 \n",
"3 3 16 12 512 11 \n",
"4 4 16 16 512 15 \n",
"\n",
" ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n",
"0 1.0 0.0 \n",
"1 1.0 0.0 \n",
"2 1.0 0.0 \n",
"3 1.0 0.0 \n",
"4 1.0 0.0 \n",
"\n",
" ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" ImageOrientationPatient_4 ... WindowWidth WindowWidth_0 WindowWidth_1 \\\n",
"0 0.927184 ... 80.0 NaN NaN \n",
"1 0.968148 ... 80.0 NaN NaN \n",
"2 1.000000 ... 100.0 NaN NaN \n",
"3 0.994522 ... NaN 80.0 80.0 \n",
"4 1.000000 ... 135.0 NaN NaN \n",
"\n",
" fid any epidural intraparenchymal intraventricular subarachnoid \\\n",
"0 ID_000012eaf 0 0 0 0 0 \n",
"1 ID_000039fa0 0 0 0 0 0 \n",
"2 ID_00005679d 0 0 0 0 0 \n",
"3 ID_00008ce3c 0 0 0 0 0 \n",
"4 ID_0000950d7 0 0 0 0 0 \n",
"\n",
" subdural \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 41 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"21744"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sort, then group by (order is preserved within groups)\n",
"gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n",
"len(gs)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ImagePositionPatient_2</th>\n",
" <th>fid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>577964</th>\n",
" <td>193.542489</td>\n",
" <td>ID_c45659d3d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229790</th>\n",
" <td>198.214051</td>\n",
" <td>ID_4e0bdd2ba</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22395</th>\n",
" <td>202.885613</td>\n",
" <td>ID_079945c27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>746126</th>\n",
" <td>207.557174</td>\n",
" <td>ID_fdbfb2c17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>253266</th>\n",
" <td>212.228736</td>\n",
" <td>ID_55f7bbbf2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ImagePositionPatient_2 fid\n",
"577964 193.542489 ID_c45659d3d\n",
"229790 198.214051 ID_4e0bdd2ba\n",
"22395 202.885613 ID_079945c27\n",
"746126 207.557174 ID_fdbfb2c17\n",
"253266 212.228736 ID_55f7bbbf2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# see if it worked\n",
"gs.get_group('ID_fa19cd5ea9')[['ImagePositionPatient_2', 'fid']].head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"g = gs.get_group('ID_fa19cd5ea9')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"subg = g[['SeriesInstanceUID', 'fid', 'any', 'epidural', \n",
" 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SeriesInstanceUID</th>\n",
" <th>fid</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>577964</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_c45659d3d</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229790</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_4e0bdd2ba</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22395</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_079945c27</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>746126</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_fdbfb2c17</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>253266</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_55f7bbbf2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>549211</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_ba7080372</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>592856</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_c964e4096</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183149</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_3e31d57d0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>306771</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_680b2194c</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>540358</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_b76b13444</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>645217</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_db48a633d</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270974</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_5bf2ca43f</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>672814</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_e4b636907</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>350834</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_7714ead69</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>749886</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_ff012ee5b</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>523978</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_b1cea5abb</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>464942</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_9dad2eb09</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229881</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_4e14d0fe8</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186237</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_3f422852d</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>599624</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_cbbb50e6d</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>347055</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_75cbdae68</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>359450</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_7a02fdbea</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127205</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_2b3671dd9</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148587</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_3274f5977</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>413641</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_8c5fc9e44</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>688538</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_ea2861e9a</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318670</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_6c19c9f7b</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>630472</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_d6435f3bf</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>202656</th>\n",
" <td>ID_fa19cd5ea9</td>\n",
" <td>ID_44d57858e</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SeriesInstanceUID fid any epidural intraparenchymal \\\n",
"577964 ID_fa19cd5ea9 ID_c45659d3d 0 0 0 \n",
"229790 ID_fa19cd5ea9 ID_4e0bdd2ba 0 0 0 \n",
"22395 ID_fa19cd5ea9 ID_079945c27 1 0 0 \n",
"746126 ID_fa19cd5ea9 ID_fdbfb2c17 1 0 0 \n",
"253266 ID_fa19cd5ea9 ID_55f7bbbf2 1 0 0 \n",
"549211 ID_fa19cd5ea9 ID_ba7080372 1 0 0 \n",
"592856 ID_fa19cd5ea9 ID_c964e4096 1 0 0 \n",
"183149 ID_fa19cd5ea9 ID_3e31d57d0 1 0 0 \n",
"306771 ID_fa19cd5ea9 ID_680b2194c 1 0 0 \n",
"540358 ID_fa19cd5ea9 ID_b76b13444 1 0 0 \n",
"645217 ID_fa19cd5ea9 ID_db48a633d 1 0 0 \n",
"270974 ID_fa19cd5ea9 ID_5bf2ca43f 1 0 0 \n",
"672814 ID_fa19cd5ea9 ID_e4b636907 1 0 0 \n",
"350834 ID_fa19cd5ea9 ID_7714ead69 1 0 0 \n",
"749886 ID_fa19cd5ea9 ID_ff012ee5b 1 0 0 \n",
"523978 ID_fa19cd5ea9 ID_b1cea5abb 1 0 0 \n",
"464942 ID_fa19cd5ea9 ID_9dad2eb09 1 0 0 \n",
"229881 ID_fa19cd5ea9 ID_4e14d0fe8 1 0 0 \n",
"186237 ID_fa19cd5ea9 ID_3f422852d 1 0 0 \n",
"599624 ID_fa19cd5ea9 ID_cbbb50e6d 1 0 0 \n",
"347055 ID_fa19cd5ea9 ID_75cbdae68 1 0 0 \n",
"359450 ID_fa19cd5ea9 ID_7a02fdbea 1 0 0 \n",
"127205 ID_fa19cd5ea9 ID_2b3671dd9 1 0 0 \n",
"148587 ID_fa19cd5ea9 ID_3274f5977 0 0 0 \n",
"413641 ID_fa19cd5ea9 ID_8c5fc9e44 0 0 0 \n",
"688538 ID_fa19cd5ea9 ID_ea2861e9a 0 0 0 \n",
"318670 ID_fa19cd5ea9 ID_6c19c9f7b 0 0 0 \n",
"630472 ID_fa19cd5ea9 ID_d6435f3bf 0 0 0 \n",
"202656 ID_fa19cd5ea9 ID_44d57858e 0 0 0 \n",
"\n",
" intraventricular subarachnoid subdural \n",
"577964 0 0 0 \n",
"229790 0 0 0 \n",
"22395 0 0 1 \n",
"746126 0 0 1 \n",
"253266 0 0 1 \n",
"549211 0 0 1 \n",
"592856 0 0 1 \n",
"183149 0 0 1 \n",
"306771 0 0 1 \n",
"540358 0 0 1 \n",
"645217 0 0 1 \n",
"270974 0 0 1 \n",
"672814 0 0 1 \n",
"350834 0 0 1 \n",
"749886 0 0 1 \n",
"523978 0 0 1 \n",
"464942 0 0 1 \n",
"229881 0 0 1 \n",
"186237 0 0 1 \n",
"599624 0 0 1 \n",
"347055 0 0 1 \n",
"359450 0 0 1 \n",
"127205 0 0 1 \n",
"148587 0 0 0 \n",
"413641 0 0 0 \n",
"688538 0 0 0 \n",
"318670 0 0 0 \n",
"630472 0 0 0 \n",
"202656 0 0 0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subg"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# You can use a btrfs snapshot and rename files by study_id and z-pos through the brain\n",
"def rename_train_group(subg):\n",
" ix = 0\n",
" total = len(subg)\n",
" for index, row in subg.iterrows():\n",
" cur_fn = row['fid']\n",
" new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{row['any']}_{row['epidural']}_{row['intraparenchymal']}_{row['intraventricular']}_{row['subarachnoid']}_{row['subdural']}_{cur_fn}\"\n",
" ix += 1\n",
" Path(f'data/unzip_renamed/{stage}_train_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_train_images/{new_fn}.dcm')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def index_group(subg, study_ix_to_fn, fn_to_study_ix):\n",
" ix = 0\n",
" for index, row in subg.iterrows():\n",
" fn = row['SOPInstanceUID']\n",
" study = row['SeriesInstanceUID']\n",
" study_ix_to_fn[study].append(fn)\n",
" fn_to_study_ix[fn] = (study, ix)\n",
" ix += 1 "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"labels = [ 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural' ]\n",
"\n",
"def label_group(subg, fn_to_labels):\n",
" for index, row in subg.iterrows():\n",
" fn = row['SOPInstanceUID']\n",
" fn_to_labels[fn] = [ label for label in labels if row[label] == 1 ]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"train_study_ix_to_fn = defaultdict(list)\n",
"train_fn_to_study_ix = {}\n",
"train_fn_to_labels = {}\n",
"\n",
"for name, subg in gs:\n",
" #rename_train_group(subg)\n",
" index_group(subg, train_study_ix_to_fn, train_fn_to_study_ix)\n",
" label_group(subg, train_fn_to_labels)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Do not pickle yet, merge with test\n",
"pickle.dump(train_study_ix_to_fn, open(f\"data/{stage}_train_study_ix_to_fn.pickle\", \"wb\" ))\n",
"pickle.dump(train_fn_to_study_ix, open(f\"data/{stage}_train_fn_to_study_ix.pickle\", \"wb\" ))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['any', 'subdural']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_fn_to_labels['ID_079945c27']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(train_fn_to_labels, open(f\"data/{stage}_train_fn_to_labels.pickle\", 'wb'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test dataset"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(f\"data/{stage}_test_dicom.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>BitsAllocated</th>\n",
" <th>BitsStored</th>\n",
" <th>Columns</th>\n",
" <th>HighBit</th>\n",
" <th>ImageOrientationPatient_0</th>\n",
" <th>ImageOrientationPatient_1</th>\n",
" <th>ImageOrientationPatient_2</th>\n",
" <th>ImageOrientationPatient_3</th>\n",
" <th>ImageOrientationPatient_4</th>\n",
" <th>...</th>\n",
" <th>SamplesPerPixel</th>\n",
" <th>SeriesInstanceUID</th>\n",
" <th>StudyID</th>\n",
" <th>StudyInstanceUID</th>\n",
" <th>WindowCenter</th>\n",
" <th>WindowCenter_0</th>\n",
" <th>WindowCenter_1</th>\n",
" <th>WindowWidth</th>\n",
" <th>WindowWidth_0</th>\n",
" <th>WindowWidth_1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.981627</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_4d28912ba6</td>\n",
" <td>NaN</td>\n",
" <td>ID_1f6d1e8aeb</td>\n",
" <td>NaN</td>\n",
" <td>40.0</td>\n",
" <td>40.0</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.987688</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_acabdeee86</td>\n",
" <td>NaN</td>\n",
" <td>ID_4a8d7ec19f</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.927184</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_d00cee7f0c</td>\n",
" <td>NaN</td>\n",
" <td>ID_a6ca244172</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.986286</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_a52a0112d5</td>\n",
" <td>NaN</td>\n",
" <td>ID_fa950a03af</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_f552d3b922</td>\n",
" <td>NaN</td>\n",
" <td>ID_965d8b3d8e</td>\n",
" <td>NaN</td>\n",
" <td>36.0</td>\n",
" <td>36.0</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 34 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 BitsAllocated BitsStored Columns HighBit \\\n",
"0 0 16 12 512 11 \n",
"1 1 16 16 512 15 \n",
"2 2 16 16 512 15 \n",
"3 3 16 16 512 15 \n",
"4 4 16 12 512 11 \n",
"\n",
" ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n",
"0 1.0 0.0 \n",
"1 1.0 0.0 \n",
"2 1.0 0.0 \n",
"3 1.0 0.0 \n",
"4 1.0 0.0 \n",
"\n",
" ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" ImageOrientationPatient_4 ... SamplesPerPixel SeriesInstanceUID \\\n",
"0 0.981627 ... 1 ID_4d28912ba6 \n",
"1 0.987688 ... 1 ID_acabdeee86 \n",
"2 0.927184 ... 1 ID_d00cee7f0c \n",
"3 0.986286 ... 1 ID_a52a0112d5 \n",
"4 1.000000 ... 1 ID_f552d3b922 \n",
"\n",
" StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n",
"0 NaN ID_1f6d1e8aeb NaN 40.0 40.0 \n",
"1 NaN ID_4a8d7ec19f 30.0 NaN NaN \n",
"2 NaN ID_a6ca244172 30.0 NaN NaN \n",
"3 NaN ID_fa950a03af 30.0 NaN NaN \n",
"4 NaN ID_965d8b3d8e NaN 36.0 36.0 \n",
"\n",
" WindowWidth WindowWidth_0 WindowWidth_1 \n",
"0 NaN 80.0 80.0 \n",
"1 80.0 NaN NaN \n",
"2 80.0 NaN NaN \n",
"3 80.0 NaN NaN \n",
"4 NaN 80.0 80.0 \n",
"\n",
"[5 rows x 34 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3518"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sort, then group by (order is preserver within groups)\n",
"gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n",
"len(gs)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def rename_test_group(subg):\n",
" ix = 0\n",
" total = len(subg)\n",
" for index, row in subg.iterrows():\n",
" cur_fn = row['SOPInstanceUID']\n",
" new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{cur_fn}\"\n",
" ix += 1\n",
" Path(f'data/unzip_renamed/{stage}_test_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_test_images/{new_fn}.dcm')\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"test_study_ix_to_fn = defaultdict(list)\n",
"test_fn_to_study_ix = {}\n",
"\n",
"for name, subg in gs:\n",
" #rename_test_group(subg)\n",
" index_group(subg, test_study_ix_to_fn, test_fn_to_study_ix)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(test_study_ix_to_fn, open(f\"data/{stage}_test_study_ix_to_fn.pickle\", \"wb\" ))\n",
"pickle.dump(test_fn_to_study_ix, open(f\"data/{stage}_test_fn_to_study_ix.pickle\", \"wb\" ))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"study_ix_to_fn = { **train_study_ix_to_fn, **test_study_ix_to_fn }\n",
"fn_to_study_ix = { **train_fn_to_study_ix, **test_fn_to_study_ix }"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(study_ix_to_fn, open(f\"data/{stage}_study_ix_to_fn.pickle\", \"wb\" ))\n",
"pickle.dump(fn_to_study_ix, open(f\"data/{stage}_fn_to_study_ix.pickle\", \"wb\" ))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}