--- a +++ b/0-preprocess-generate_csvs.ipynb @@ -0,0 +1,1596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "from tqdm import tqdm_notebook\n", + "import pydicom\n", + "import itertools\n", + "import numpy as np\n", + "from concurrent.futures import ProcessPoolExecutor" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "from tqdm import tqdm_notebook as tqdm\n", + "tqdm().pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read stage_X_train and split id/label" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "stage = \"stage_2\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rm: cannot remove 'data/unzip/stage_2_train_images/ID_6431af929.dcm': No such file or directory\r\n" + ] + } + ], + "source": [ + "!rm {bad_dcm_fn}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>Label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_12cadc6af_epidural</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_12cadc6af_intraparenchymal</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_12cadc6af_intraventricular</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_12cadc6af_subarachnoid</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_12cadc6af_subdural</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID Label\n", + "0 ID_12cadc6af_epidural 0\n", + "1 ID_12cadc6af_intraparenchymal 0\n", + "2 ID_12cadc6af_intraventricular 0\n", + "3 ID_12cadc6af_subarachnoid 0\n", + "4 ID_12cadc6af_subdural 0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df_train['fid'] = df_train.ID.apply(lambda x: '_'.join(x.split('_')[:2]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df_train.columns = ['ID', 'probability', 'fid']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_train['label'] = df_train.ID.apply(lambda x: x.split('_')[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>probability</th>\n", + " <th>fid</th>\n", + " <th>label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_12cadc6af_epidural</td>\n", + " <td>0</td>\n", + " <td>ID_12cadc6af</td>\n", + " <td>epidural</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_12cadc6af_intraparenchymal</td>\n", + " <td>0</td>\n", + " <td>ID_12cadc6af</td>\n", + " <td>intraparenchymal</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_12cadc6af_intraventricular</td>\n", + " <td>0</td>\n", + " <td>ID_12cadc6af</td>\n", + " <td>intraventricular</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_12cadc6af_subarachnoid</td>\n", + " <td>0</td>\n", + " <td>ID_12cadc6af</td>\n", + " <td>subarachnoid</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_12cadc6af_subdural</td>\n", + " <td>0</td>\n", + " <td>ID_12cadc6af</td>\n", + " <td>subdural</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID probability fid label\n", + "0 ID_12cadc6af_epidural 0 ID_12cadc6af epidural\n", + "1 ID_12cadc6af_intraparenchymal 0 ID_12cadc6af intraparenchymal\n", + "2 ID_12cadc6af_intraventricular 0 ID_12cadc6af intraventricular\n", + "3 ID_12cadc6af_subarachnoid 0 ID_12cadc6af subarachnoid\n", + "4 ID_12cadc6af_subdural 0 ID_12cadc6af subdural" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remove dupes " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4516842, 4)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df_train.drop_duplicates('ID', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4516818, 4)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remove corrupted image" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df_train = df_train[df_train.fid != 'ID_6431af929'] # ID_6431af929" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4516812, 4)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create pivot table with diagnostic labels as columns\n", + "Generates:\n", + "* `train_diags.csv` (previously named `train_pivot.csv`)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df_diags = df_train.pivot(index='fid', columns='label', values='probability')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th>label</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " <tr>\n", + " <th>fid</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>ID_000012eaf</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_000039fa0</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_00005679d</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_00008ce3c</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_0000950d7</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "label any epidural intraparenchymal intraventricular subarachnoid \\\n", + "fid \n", + "ID_000012eaf 0 0 0 0 0 \n", + "ID_000039fa0 0 0 0 0 0 \n", + "ID_00005679d 0 0 0 0 0 \n", + "ID_00008ce3c 0 0 0 0 0 \n", + "ID_0000950d7 0 0 0 0 0 \n", + "\n", + "label subdural \n", + "fid \n", + "ID_000012eaf 0 \n", + "ID_000039fa0 0 \n", + "ID_00005679d 0 \n", + "ID_00008ce3c 0 \n", + "ID_0000950d7 0 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_diags.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(752802, 6)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_diags.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df_diags.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th>label</th>\n", + " <th>fid</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_000012eaf</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_000039fa0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_00005679d</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_00008ce3c</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_0000950d7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "label fid any epidural intraparenchymal intraventricular \\\n", + "0 ID_000012eaf 0 0 0 0 \n", + "1 ID_000039fa0 0 0 0 0 \n", + "2 ID_00005679d 0 0 0 0 \n", + "3 ID_00008ce3c 0 0 0 0 \n", + "4 ID_0000950d7 0 0 0 0 \n", + "\n", + "label subarachnoid subdural \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_diags.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(752802, 7)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_diags.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate fastai-ready csv image file (.png) -> labels\n", + "This is needed for early experiments that worked with the .png dataset.\n", + "\n", + "Generates:\n", + "* `train_labels_as_strings.csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "d = defaultdict(list)\n", + "for fid in df_train.fid.unique(): d[fid]\n", + "\n", + "for tup in df_train.itertuples():\n", + " if tup.probability: d[tup.fid].append(tup.label)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "ks, vs = [], []\n", + "\n", + "for k, v in d.items():\n", + " ks.append(k), vs.append(' '.join(v))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(752802, 2)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fastai_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "fastai_df['fn'] += '.png'" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>fn</th>\n", + " <th>labels</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_12cadc6af.png</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_38fd7baa0.png</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_6c5d82413.png</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_aec8e68b3.png</td>\n", + " <td>subarachnoid any</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_4d9209c7c.png</td>\n", + " <td></td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " fn labels\n", + "0 ID_12cadc6af.png \n", + "1 ID_38fd7baa0.png \n", + "2 ID_6c5d82413.png \n", + "3 ID_aec8e68b3.png subarachnoid any\n", + "4 ID_4d9209c7c.png " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fastai_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tabulate dicom data\n", + "Generates:\n", + "* `train_dicom.csv`\n", + "* `test_dicom.csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', \n", + " 'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', \n", + " 'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', \n", + " 'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', \n", + " 'StudyInstanceUID', 'WindowCenter', 'WindowWidth']\n", + "useless_cols = [ 'PixelData' ]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "f = train_dcm_list[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0008, 0018) SOP Instance UID UI: ID_000012eaf\n", + "(0008, 0060) Modality CS: 'CT'\n", + "(0010, 0020) Patient ID LO: 'ID_f15c0eee'\n", + "(0020, 000d) Study Instance UID UI: ID_30ea2b02d4\n", + "(0020, 000e) Series Instance UID UI: ID_0ab5820b2a\n", + "(0020, 0010) Study ID SH: ''\n", + "(0020, 0032) Image Position (Patient) DS: ['-125.000000', '-115.897980', '77.970825']\n", + "(0020, 0037) Image Orientation (Patient) DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']\n", + "(0028, 0002) Samples per Pixel US: 1\n", + "(0028, 0004) Photometric Interpretation CS: 'MONOCHROME2'\n", + "(0028, 0010) Rows US: 512\n", + "(0028, 0011) Columns US: 512\n", + "(0028, 0030) Pixel Spacing DS: ['0.488281', '0.488281']\n", + "(0028, 0100) Bits Allocated US: 16\n", + "(0028, 0101) Bits Stored US: 16\n", + "(0028, 0102) High Bit US: 15\n", + "(0028, 0103) Pixel Representation US: 1\n", + "(0028, 1050) Window Center DS: \"30\"\n", + "(0028, 1051) Window Width DS: \"80\"\n", + "(0028, 1052) Rescale Intercept DS: \"-1024\"\n", + "(0028, 1053) Rescale Slope DS: \"1\"\n", + "(7fe0, 0010) Pixel Data OW: Array of 524288 elements" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dicom = pydicom.dcmread(str(f))\n", + "dicom" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ID_f15c0eee'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dicom.data_element('PatientID').value" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['-125.000000', '-115.897980', '77.970825']" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipp = dicom.data_element('ImagePositionPatient').value\n", + "ipp" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"-125.000000\"" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipp[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pydicom.multival.MultiValue" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(ipp)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "def tabulate_dicom_data(file_list):\n", + " l = []\n", + " for f in file_list:\n", + " dicom = pydicom.dcmread(str(f))\n", + " d = {}\n", + " for s in cols_i_want:\n", + " v = dicom.data_element(s).value\n", + " if isinstance(v, pydicom.multival.MultiValue):\n", + " for i in range(len(v)):\n", + " d[s + '_' + str(i)] = v[i]\n", + " else:\n", + " d[s] = v\n", + " l.append(d)\n", + " \n", + " return l\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "with ProcessPoolExecutor(max_workers=32) as e:\n", + " l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(train_dcm_list, 32))))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "df_train_dicom = pd.DataFrame(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>BitsAllocated</th>\n", + " <th>BitsStored</th>\n", + " <th>Columns</th>\n", + " <th>HighBit</th>\n", + " <th>ImageOrientationPatient_0</th>\n", + " <th>ImageOrientationPatient_1</th>\n", + " <th>ImageOrientationPatient_2</th>\n", + " <th>ImageOrientationPatient_3</th>\n", + " <th>ImageOrientationPatient_4</th>\n", + " <th>ImageOrientationPatient_5</th>\n", + " <th>...</th>\n", + " <th>SamplesPerPixel</th>\n", + " <th>SeriesInstanceUID</th>\n", + " <th>StudyID</th>\n", + " <th>StudyInstanceUID</th>\n", + " <th>WindowCenter</th>\n", + " <th>WindowCenter_0</th>\n", + " <th>WindowCenter_1</th>\n", + " <th>WindowWidth</th>\n", + " <th>WindowWidth_0</th>\n", + " <th>WindowWidth_1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.927184</td>\n", + " <td>-0.374607</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_0ab5820b2a</td>\n", + " <td></td>\n", + " <td>ID_30ea2b02d4</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.968148</td>\n", + " <td>-0.250380</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_5f8484c3e0</td>\n", + " <td></td>\n", + " <td>ID_134d398b61</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_203cd6ec46</td>\n", + " <td></td>\n", + " <td>ID_b5c26cda09</td>\n", + " <td>50.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>100.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.994522</td>\n", + " <td>0.104528</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_3780d48b28</td>\n", + " <td></td>\n", + " <td>ID_974735bf79</td>\n", + " <td>NaN</td>\n", + " <td>40.0</td>\n", + " <td>40.0</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_84296c3845</td>\n", + " <td></td>\n", + " <td>ID_8881b1c4b1</td>\n", + " <td>35.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>135.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 33 columns</p>\n", + "</div>" + ], + "text/plain": [ + " BitsAllocated BitsStored Columns HighBit ImageOrientationPatient_0 \\\n", + "0 16 16 512 15 1.0 \n", + "1 16 16 512 15 1.0 \n", + "2 16 16 512 15 1.0 \n", + "3 16 12 512 11 1.0 \n", + "4 16 16 512 15 1.0 \n", + "\n", + " ImageOrientationPatient_1 ImageOrientationPatient_2 \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " ImageOrientationPatient_3 ImageOrientationPatient_4 \\\n", + "0 0.0 0.927184 \n", + "1 0.0 0.968148 \n", + "2 0.0 1.000000 \n", + "3 0.0 0.994522 \n", + "4 0.0 1.000000 \n", + "\n", + " ImageOrientationPatient_5 ... SamplesPerPixel SeriesInstanceUID \\\n", + "0 -0.374607 ... 1 ID_0ab5820b2a \n", + "1 -0.250380 ... 1 ID_5f8484c3e0 \n", + "2 0.000000 ... 1 ID_203cd6ec46 \n", + "3 0.104528 ... 1 ID_3780d48b28 \n", + "4 0.000000 ... 1 ID_84296c3845 \n", + "\n", + " StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n", + "0 ID_30ea2b02d4 30.0 NaN NaN \n", + "1 ID_134d398b61 30.0 NaN NaN \n", + "2 ID_b5c26cda09 50.0 NaN NaN \n", + "3 ID_974735bf79 NaN 40.0 40.0 \n", + "4 ID_8881b1c4b1 35.0 NaN NaN \n", + "\n", + " WindowWidth WindowWidth_0 WindowWidth_1 \n", + "0 80.0 NaN NaN \n", + "1 80.0 NaN NaN \n", + "2 100.0 NaN NaN \n", + "3 NaN 80.0 80.0 \n", + "4 135.0 NaN NaN \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_dicom.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "df_train_dicom.to_csv(f'data/{stage}_train_dicom.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))\n", + "with ProcessPoolExecutor(max_workers=32) as e:\n", + " l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(test_dcm_list, 32))))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>BitsAllocated</th>\n", + " <th>BitsStored</th>\n", + " <th>Columns</th>\n", + " <th>HighBit</th>\n", + " <th>ImageOrientationPatient_0</th>\n", + " <th>ImageOrientationPatient_1</th>\n", + " <th>ImageOrientationPatient_2</th>\n", + " <th>ImageOrientationPatient_3</th>\n", + " <th>ImageOrientationPatient_4</th>\n", + " <th>ImageOrientationPatient_5</th>\n", + " <th>...</th>\n", + " <th>SamplesPerPixel</th>\n", + " <th>SeriesInstanceUID</th>\n", + " <th>StudyID</th>\n", + " <th>StudyInstanceUID</th>\n", + " <th>WindowCenter</th>\n", + " <th>WindowCenter_0</th>\n", + " <th>WindowCenter_1</th>\n", + " <th>WindowWidth</th>\n", + " <th>WindowWidth_0</th>\n", + " <th>WindowWidth_1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.981627</td>\n", + " <td>-0.190809</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_4d28912ba6</td>\n", + " <td></td>\n", + " <td>ID_1f6d1e8aeb</td>\n", + " <td>NaN</td>\n", + " <td>40.0</td>\n", + " <td>40.0</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.987688</td>\n", + " <td>-0.156434</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_acabdeee86</td>\n", + " <td></td>\n", + " <td>ID_4a8d7ec19f</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.927184</td>\n", + " <td>-0.374607</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_d00cee7f0c</td>\n", + " <td></td>\n", + " <td>ID_a6ca244172</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>512</td>\n", + " <td>15</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.986286</td>\n", + " <td>-0.165048</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_a52a0112d5</td>\n", + " <td></td>\n", + " <td>ID_fa950a03af</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>1</td>\n", + " <td>ID_f552d3b922</td>\n", + " <td></td>\n", + " <td>ID_965d8b3d8e</td>\n", + " <td>NaN</td>\n", + " <td>36.0</td>\n", + " <td>36.0</td>\n", + " <td>NaN</td>\n", + " <td>80.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 33 columns</p>\n", + "</div>" + ], + "text/plain": [ + " BitsAllocated BitsStored Columns HighBit ImageOrientationPatient_0 \\\n", + "0 16 12 512 11 1.0 \n", + "1 16 16 512 15 1.0 \n", + "2 16 16 512 15 1.0 \n", + "3 16 16 512 15 1.0 \n", + "4 16 12 512 11 1.0 \n", + "\n", + " ImageOrientationPatient_1 ImageOrientationPatient_2 \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " ImageOrientationPatient_3 ImageOrientationPatient_4 \\\n", + "0 0.0 0.981627 \n", + "1 0.0 0.987688 \n", + "2 0.0 0.927184 \n", + "3 0.0 0.986286 \n", + "4 0.0 1.000000 \n", + "\n", + " ImageOrientationPatient_5 ... SamplesPerPixel SeriesInstanceUID \\\n", + "0 -0.190809 ... 1 ID_4d28912ba6 \n", + "1 -0.156434 ... 1 ID_acabdeee86 \n", + "2 -0.374607 ... 1 ID_d00cee7f0c \n", + "3 -0.165048 ... 1 ID_a52a0112d5 \n", + "4 0.000000 ... 1 ID_f552d3b922 \n", + "\n", + " StudyID StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1 \\\n", + "0 ID_1f6d1e8aeb NaN 40.0 40.0 \n", + "1 ID_4a8d7ec19f 30.0 NaN NaN \n", + "2 ID_a6ca244172 30.0 NaN NaN \n", + "3 ID_fa950a03af 30.0 NaN NaN \n", + "4 ID_965d8b3d8e NaN 36.0 36.0 \n", + "\n", + " WindowWidth WindowWidth_0 WindowWidth_1 \n", + "0 NaN 80.0 80.0 \n", + "1 80.0 NaN NaN \n", + "2 80.0 NaN NaN \n", + "3 80.0 NaN NaN \n", + "4 NaN 80.0 80.0 \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test_dicom = pd.DataFrame(l)\n", + "df_test_dicom.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "df_test_dicom.to_csv(f'data/{stage}_test_dicom.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add labels to the train dicom csv\n", + "Generates:\n", + "* `train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "df_train_dicom_diags = pd.merge(df_train_dicom, df_diags, how='left', left_on=['SOPInstanceUID'], right_on = ['fid'])" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df_train_dicom_diags.to_csv(f'data/{stage}_train_dicom_diags.csv')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}