1597 lines (1596 with data), 45.9 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from tqdm import tqdm_notebook\n",
"import pydicom\n",
"import itertools\n",
"import numpy as np\n",
"from concurrent.futures import ProcessPoolExecutor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"from tqdm import tqdm_notebook as tqdm\n",
"tqdm().pandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read stage_X_train and split id/label"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"stage = \"stage_2\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# -f keeps this cell re-runnable: no error when the file has already been removed.\n",
"!rm -f {bad_dcm_fn}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID_12cadc6af_epidural</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID_12cadc6af_intraparenchymal</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID_12cadc6af_intraventricular</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID_12cadc6af_subarachnoid</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID_12cadc6af_subdural</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID Label\n",
"0 ID_12cadc6af_epidural 0\n",
"1 ID_12cadc6af_intraparenchymal 0\n",
"2 ID_12cadc6af_intraventricular 0\n",
"3 ID_12cadc6af_subarachnoid 0\n",
"4 ID_12cadc6af_subdural 0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# File id = the first two underscore-separated parts of ID, e.g. 'ID_12cadc6af'.\n",
"df_train['fid'] = df_train.ID.str.split('_').str[:2].str.join('_')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Rename by name rather than by position, so the cell does not depend on the\n",
"# current column count/order and is a no-op when re-run.\n",
"df_train = df_train.rename(columns={'Label': 'probability'})"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Diagnosis name = the last underscore-separated part of ID, e.g. 'epidural'.\n",
"df_train['label'] = df_train.ID.str.split('_').str[-1]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>probability</th>\n",
" <th>fid</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID_12cadc6af_epidural</td>\n",
" <td>0</td>\n",
" <td>ID_12cadc6af</td>\n",
" <td>epidural</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID_12cadc6af_intraparenchymal</td>\n",
" <td>0</td>\n",
" <td>ID_12cadc6af</td>\n",
" <td>intraparenchymal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID_12cadc6af_intraventricular</td>\n",
" <td>0</td>\n",
" <td>ID_12cadc6af</td>\n",
" <td>intraventricular</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID_12cadc6af_subarachnoid</td>\n",
" <td>0</td>\n",
" <td>ID_12cadc6af</td>\n",
" <td>subarachnoid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID_12cadc6af_subdural</td>\n",
" <td>0</td>\n",
" <td>ID_12cadc6af</td>\n",
" <td>subdural</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID probability fid label\n",
"0 ID_12cadc6af_epidural 0 ID_12cadc6af epidural\n",
"1 ID_12cadc6af_intraparenchymal 0 ID_12cadc6af intraparenchymal\n",
"2 ID_12cadc6af_intraventricular 0 ID_12cadc6af intraventricular\n",
"3 ID_12cadc6af_subarachnoid 0 ID_12cadc6af subarachnoid\n",
"4 ID_12cadc6af_subdural 0 ID_12cadc6af subdural"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove duplicate rows"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4516842, 4)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df_train.drop_duplicates('ID', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4516818, 4)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove corrupted image"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Drop the label rows of the corrupted image; the id is taken from bad_dcm_fn\n",
"# so the two cannot drift apart.\n",
"df_train = df_train[df_train.fid != Path(bad_dcm_fn).stem]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4516812, 4)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create pivot table with diagnostic labels as columns\n",
"Generates:\n",
"* `train_diags.csv` (previously named `train_pivot.csv`)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"df_diags = df_train.pivot(index='fid', columns='label', values='probability')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>label</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" <tr>\n",
" <th>fid</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ID_000012eaf</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_000039fa0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_00005679d</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_00008ce3c</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_0000950d7</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"label any epidural intraparenchymal intraventricular subarachnoid \\\n",
"fid \n",
"ID_000012eaf 0 0 0 0 0 \n",
"ID_000039fa0 0 0 0 0 0 \n",
"ID_00005679d 0 0 0 0 0 \n",
"ID_00008ce3c 0 0 0 0 0 \n",
"ID_0000950d7 0 0 0 0 0 \n",
"\n",
"label subdural \n",
"fid \n",
"ID_000012eaf 0 \n",
"ID_000039fa0 0 \n",
"ID_00005679d 0 \n",
"ID_00008ce3c 0 \n",
"ID_0000950d7 0 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_diags.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(752802, 6)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_diags.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df_diags.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>label</th>\n",
" <th>fid</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID_000012eaf</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID_000039fa0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID_00005679d</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID_00008ce3c</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID_0000950d7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"label fid any epidural intraparenchymal intraventricular \\\n",
"0 ID_000012eaf 0 0 0 0 \n",
"1 ID_000039fa0 0 0 0 0 \n",
"2 ID_00005679d 0 0 0 0 \n",
"3 ID_00008ce3c 0 0 0 0 \n",
"4 ID_0000950d7 0 0 0 0 \n",
"\n",
"label subarachnoid subdural \n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_diags.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(752802, 7)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_diags.shape"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate a fastai-ready CSV mapping image file names (.png) to labels\n",
"This is needed for early experiments that worked with the .png dataset.\n",
"\n",
"Generates:\n",
"* `train_labels_as_strings.csv`"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"# Map each file id to the labels whose probability is truthy. Every id is\n",
"# registered up front so ids without any such label still get an empty list.\n",
"d = defaultdict(list, {fid: [] for fid in df_train.fid.unique()})\n",
"\n",
"for row in df_train.itertuples():\n",
"    if row.probability:\n",
"        d[row.fid].append(row.label)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Parallel lists: file ids, and each id's labels joined into one space-separated string.\n",
"ks = list(d.keys())\n",
"vs = [' '.join(labels) for labels in d.values()]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(752802, 2)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fastai_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"fastai_df['fn'] += '.png'"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>fn</th>\n",
" <th>labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ID_12cadc6af.png</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ID_38fd7baa0.png</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ID_6c5d82413.png</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ID_aec8e68b3.png</td>\n",
" <td>subarachnoid any</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ID_4d9209c7c.png</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" fn labels\n",
"0 ID_12cadc6af.png \n",
"1 ID_38fd7baa0.png \n",
"2 ID_6c5d82413.png \n",
"3 ID_aec8e68b3.png subarachnoid any\n",
"4 ID_4d9209c7c.png "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fastai_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tabulate dicom data\n",
"Generates:\n",
"* `train_dicom.csv`\n",
"* `test_dicom.csv`"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', \n",
" 'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', \n",
" 'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', \n",
" 'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', \n",
" 'StudyInstanceUID', 'WindowCenter', 'WindowWidth']\n",
"useless_cols = [ 'PixelData' ]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"f = train_dcm_list[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0008, 0018) SOP Instance UID UI: ID_000012eaf\n",
"(0008, 0060) Modality CS: 'CT'\n",
"(0010, 0020) Patient ID LO: 'ID_f15c0eee'\n",
"(0020, 000d) Study Instance UID UI: ID_30ea2b02d4\n",
"(0020, 000e) Series Instance UID UI: ID_0ab5820b2a\n",
"(0020, 0010) Study ID SH: ''\n",
"(0020, 0032) Image Position (Patient) DS: ['-125.000000', '-115.897980', '77.970825']\n",
"(0020, 0037) Image Orientation (Patient) DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']\n",
"(0028, 0002) Samples per Pixel US: 1\n",
"(0028, 0004) Photometric Interpretation CS: 'MONOCHROME2'\n",
"(0028, 0010) Rows US: 512\n",
"(0028, 0011) Columns US: 512\n",
"(0028, 0030) Pixel Spacing DS: ['0.488281', '0.488281']\n",
"(0028, 0100) Bits Allocated US: 16\n",
"(0028, 0101) Bits Stored US: 16\n",
"(0028, 0102) High Bit US: 15\n",
"(0028, 0103) Pixel Representation US: 1\n",
"(0028, 1050) Window Center DS: \"30\"\n",
"(0028, 1051) Window Width DS: \"80\"\n",
"(0028, 1052) Rescale Intercept DS: \"-1024\"\n",
"(0028, 1053) Rescale Slope DS: \"1\"\n",
"(7fe0, 0010) Pixel Data OW: Array of 524288 elements"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dicom = pydicom.dcmread(str(f))\n",
"dicom"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ID_f15c0eee'"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dicom.data_element('PatientID').value"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['-125.000000', '-115.897980', '77.970825']"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ipp = dicom.data_element('ImagePositionPatient').value\n",
"ipp"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"-125.000000\""
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ipp[0]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pydicom.multival.MultiValue"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(ipp)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"def tabulate_dicom_data(file_list):\n",
"    \"\"\"Collect the header fields named in `cols_i_want` from each DICOM file.\n",
"\n",
"    Args:\n",
"        file_list: iterable of paths to .dcm files.\n",
"\n",
"    Returns:\n",
"        A list with one dict per file, keyed by field name. A multi-valued\n",
"        field is flattened into `<name>_<i>` keys, one per component, so a\n",
"        field that is single-valued in some files and multi-valued in others\n",
"        ends up under different keys.\n",
"    \"\"\"\n",
"    rows = []\n",
"    for f in file_list:\n",
"        # Only header fields are tabulated, so stop reading before the pixel data.\n",
"        dicom = pydicom.dcmread(str(f), stop_before_pixels=True)\n",
"        row = {}\n",
"        for s in cols_i_want:\n",
"            v = dicom.data_element(s).value\n",
"            if isinstance(v, pydicom.multival.MultiValue):\n",
"                for i, component in enumerate(v):\n",
"                    row[s + '_' + str(i)] = component\n",
"            else:\n",
"                row[s] = v\n",
"        rows.append(row)\n",
"\n",
"    return rows\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"n_workers = 32\n",
"with ProcessPoolExecutor(max_workers=n_workers) as pool:\n",
"    # One chunk of paths per worker; the per-chunk row lists are concatenated in order.\n",
"    chunks = np.array_split(train_dcm_list, n_workers)\n",
"    l = list(itertools.chain.from_iterable(pool.map(tabulate_dicom_data, chunks)))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"df_train_dicom = pd.DataFrame(l)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>BitsAllocated</th>\n",
" <th>BitsStored</th>\n",
" <th>Columns</th>\n",
" <th>HighBit</th>\n",
" <th>ImageOrientationPatient_0</th>\n",
" <th>ImageOrientationPatient_1</th>\n",
" <th>ImageOrientationPatient_2</th>\n",
" <th>ImageOrientationPatient_3</th>\n",
" <th>ImageOrientationPatient_4</th>\n",
" <th>ImageOrientationPatient_5</th>\n",
" <th>...</th>\n",
" <th>SamplesPerPixel</th>\n",
" <th>SeriesInstanceUID</th>\n",
" <th>StudyID</th>\n",
" <th>StudyInstanceUID</th>\n",
" <th>WindowCenter</th>\n",
" <th>WindowCenter_0</th>\n",
" <th>WindowCenter_1</th>\n",
" <th>WindowWidth</th>\n",
" <th>WindowWidth_0</th>\n",
" <th>WindowWidth_1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.927184</td>\n",
" <td>-0.374607</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_0ab5820b2a</td>\n",
" <td></td>\n",
" <td>ID_30ea2b02d4</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.968148</td>\n",
" <td>-0.250380</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_5f8484c3e0</td>\n",
" <td></td>\n",
" <td>ID_134d398b61</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_203cd6ec46</td>\n",
" <td></td>\n",
" <td>ID_b5c26cda09</td>\n",
" <td>50.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>100.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.994522</td>\n",
" <td>0.104528</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_3780d48b28</td>\n",
" <td></td>\n",
" <td>ID_974735bf79</td>\n",
" <td>NaN</td>\n",
" <td>40.0</td>\n",
" <td>40.0</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_84296c3845</td>\n",
" <td></td>\n",
" <td>ID_8881b1c4b1</td>\n",
" <td>35.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>135.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 33 columns</p>\n",
"</div>"
],
"text/plain": [
" BitsAllocated BitsStored Columns HighBit ImageOrientationPatient_0 \\\n",
"0 16 16 512 15 1.0 \n",
"1 16 16 512 15 1.0 \n",
"2 16 16 512 15 1.0 \n",
"3 16 12 512 11 1.0 \n",
"4 16 16 512 15 1.0 \n",
"\n",
" ImageOrientationPatient_1 ImageOrientationPatient_2 \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" ImageOrientationPatient_3 ImageOrientationPatient_4 \\\n",
"0 0.0 0.927184 \n",
"1 0.0 0.968148 \n",
"2 0.0 1.000000 \n",
"3 0.0 0.994522 \n",
"4 0.0 1.000000 \n",
"\n",
" ImageOrientationPatient_5 ... SamplesPerPixel SeriesInstanceUID \\\n",
"0 -0.374607 ... 1 ID_0ab5820b2a \n",
"1 -0.250380 ... 1 ID_5f8484c3e0 \n",
"2 0.000000 ... 1 ID_203cd6ec46 \n",
"3 0.104528 ... 1 ID_3780d48b28 \n",
"4 0.000000 ... 1 ID_84296c3845 \n",
"\n",
" StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n",
"0 ID_30ea2b02d4 30.0 NaN NaN \n",
"1 ID_134d398b61 30.0 NaN NaN \n",
"2 ID_b5c26cda09 50.0 NaN NaN \n",
"3 ID_974735bf79 NaN 40.0 40.0 \n",
"4 ID_8881b1c4b1 35.0 NaN NaN \n",
"\n",
" WindowWidth WindowWidth_0 WindowWidth_1 \n",
"0 80.0 NaN NaN \n",
"1 80.0 NaN NaN \n",
"2 100.0 NaN NaN \n",
"3 NaN 80.0 80.0 \n",
"4 135.0 NaN NaN \n",
"\n",
"[5 rows x 33 columns]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train_dicom.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"df_train_dicom.to_csv(f'data/{stage}_train_dicom.csv')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))\n",
"n_workers = 32\n",
"with ProcessPoolExecutor(max_workers=n_workers) as pool:\n",
"    # One chunk of paths per worker; the per-chunk row lists are concatenated in order.\n",
"    chunks = np.array_split(test_dcm_list, n_workers)\n",
"    l = list(itertools.chain.from_iterable(pool.map(tabulate_dicom_data, chunks)))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>BitsAllocated</th>\n",
" <th>BitsStored</th>\n",
" <th>Columns</th>\n",
" <th>HighBit</th>\n",
" <th>ImageOrientationPatient_0</th>\n",
" <th>ImageOrientationPatient_1</th>\n",
" <th>ImageOrientationPatient_2</th>\n",
" <th>ImageOrientationPatient_3</th>\n",
" <th>ImageOrientationPatient_4</th>\n",
" <th>ImageOrientationPatient_5</th>\n",
" <th>...</th>\n",
" <th>SamplesPerPixel</th>\n",
" <th>SeriesInstanceUID</th>\n",
" <th>StudyID</th>\n",
" <th>StudyInstanceUID</th>\n",
" <th>WindowCenter</th>\n",
" <th>WindowCenter_0</th>\n",
" <th>WindowCenter_1</th>\n",
" <th>WindowWidth</th>\n",
" <th>WindowWidth_0</th>\n",
" <th>WindowWidth_1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.981627</td>\n",
" <td>-0.190809</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_4d28912ba6</td>\n",
" <td></td>\n",
" <td>ID_1f6d1e8aeb</td>\n",
" <td>NaN</td>\n",
" <td>40.0</td>\n",
" <td>40.0</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.987688</td>\n",
" <td>-0.156434</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_acabdeee86</td>\n",
" <td></td>\n",
" <td>ID_4a8d7ec19f</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.927184</td>\n",
" <td>-0.374607</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_d00cee7f0c</td>\n",
" <td></td>\n",
" <td>ID_a6ca244172</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>512</td>\n",
" <td>15</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.986286</td>\n",
" <td>-0.165048</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_a52a0112d5</td>\n",
" <td></td>\n",
" <td>ID_fa950a03af</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>ID_f552d3b922</td>\n",
" <td></td>\n",
" <td>ID_965d8b3d8e</td>\n",
" <td>NaN</td>\n",
" <td>36.0</td>\n",
" <td>36.0</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 33 columns</p>\n",
"</div>"
],
"text/plain": [
" BitsAllocated BitsStored Columns HighBit ImageOrientationPatient_0 \\\n",
"0 16 12 512 11 1.0 \n",
"1 16 16 512 15 1.0 \n",
"2 16 16 512 15 1.0 \n",
"3 16 16 512 15 1.0 \n",
"4 16 12 512 11 1.0 \n",
"\n",
" ImageOrientationPatient_1 ImageOrientationPatient_2 \\\n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" ImageOrientationPatient_3 ImageOrientationPatient_4 \\\n",
"0 0.0 0.981627 \n",
"1 0.0 0.987688 \n",
"2 0.0 0.927184 \n",
"3 0.0 0.986286 \n",
"4 0.0 1.000000 \n",
"\n",
" ImageOrientationPatient_5 ... SamplesPerPixel SeriesInstanceUID \\\n",
"0 -0.190809 ... 1 ID_4d28912ba6 \n",
"1 -0.156434 ... 1 ID_acabdeee86 \n",
"2 -0.374607 ... 1 ID_d00cee7f0c \n",
"3 -0.165048 ... 1 ID_a52a0112d5 \n",
"4 0.000000 ... 1 ID_f552d3b922 \n",
"\n",
" StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n",
"0 ID_1f6d1e8aeb NaN 40.0 40.0 \n",
"1 ID_4a8d7ec19f 30.0 NaN NaN \n",
"2 ID_a6ca244172 30.0 NaN NaN \n",
"3 ID_fa950a03af 30.0 NaN NaN \n",
"4 ID_965d8b3d8e NaN 36.0 36.0 \n",
"\n",
" WindowWidth WindowWidth_0 WindowWidth_1 \n",
"0 NaN 80.0 80.0 \n",
"1 80.0 NaN NaN \n",
"2 80.0 NaN NaN \n",
"3 80.0 NaN NaN \n",
"4 NaN 80.0 80.0 \n",
"\n",
"[5 rows x 33 columns]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test_dicom = pd.DataFrame(l)\n",
"df_test_dicom.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"df_test_dicom.to_csv(f'data/{stage}_test_dicom.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add labels to the train dicom csv\n",
"Generates:\n",
"* `train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"df_train_dicom_diags = pd.merge(df_train_dicom, df_diags, how='left', left_on=['SOPInstanceUID'], right_on = ['fid'])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"df_train_dicom_diags.to_csv(f'data/{stage}_train_dicom_diags.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}