--- a +++ b/Serialized/Prepare.ipynb @@ -0,0 +1,764 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/reina/anaconda3/envs/RSNA/lib/python3.6/importlib/_bootstrap.py:219: ImportWarning: can't resolve package from __spec__ or __package__, falling back on __name__ and __path__\n", + " return f(*args, **kwds)\n", + "/home/reina/anaconda3/envs/RSNA/lib/python3.6/importlib/_bootstrap.py:219: ImportWarning: can't resolve package from __spec__ or __package__, falling back on __name__ and __path__\n", + " return f(*args, **kwds)\n" + ] + } + ], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function\n", + "\n", + "import glob, pylab, pandas as pd\n", + "import pydicom, numpy as np\n", + "from os import listdir\n", + "from os.path import isfile, join\n", + "import matplotlib.pylab as plt\n", + "import os\n", + "import seaborn as sns\n", + "import warnings\n", + "warnings.filterwarnings(action='once')\n", + "import pickle\n", + "from tqdm import tqdm, tqdm_notebook\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib notebook\n", + "from helper import *\n", + "import time\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "from defenitions import *" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "train_base_df = pd.read_csv(train_images_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4045572" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_base_df.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>Label</th>\n", + " <th>Sub_type</th>\n", + " <th>PatientID</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_28fbab7eb_epidural</td>\n", + " <td>0.5</td>\n", + " <td>epidural</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_28fbab7eb_intraparenchymal</td>\n", + " <td>0.5</td>\n", + " <td>intraparenchymal</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_28fbab7eb_intraventricular</td>\n", + " <td>0.5</td>\n", + " <td>intraventricular</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_28fbab7eb_subarachnoid</td>\n", + " <td>0.5</td>\n", + " <td>subarachnoid</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_28fbab7eb_subdural</td>\n", + " <td>0.5</td>\n", + " <td>subdural</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID Label Sub_type PatientID\n", + "0 ID_28fbab7eb_epidural 0.5 epidural 28fbab7eb\n", + "1 ID_28fbab7eb_intraparenchymal 0.5 intraparenchymal 28fbab7eb\n", + "2 ID_28fbab7eb_intraventricular 0.5 intraventricular 28fbab7eb\n", + "3 ID_28fbab7eb_subarachnoid 0.5 subarachnoid 28fbab7eb\n", + "4 ID_28fbab7eb_subdural 0.5 subdural 28fbab7eb" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_base_df['Sub_type'] = train_base_df['ID'].str.split(\"_\", n = 3, expand = True)[2]\n", + "train_base_df['PatientID'] = train_base_df['ID'].str.split(\"_\", n = 3, expand = True)[1]\n", + "train_base_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',\n", + " 'subdural', 'any'], dtype=object)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sub_types=train_base_df.Sub_type.unique()\n", + "sub_types" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ae4afd5b0d824567b34dd59b1b81e430", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=6), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dfs =[]\n", + "for sub_type in tqdm_notebook(sub_types):\n", + " df = train_base_df[train_base_df.Sub_type==sub_type][['PatientID','Label']].copy()\n", + " df=df.rename(columns={\"Label\": sub_type}).reset_index(drop=True)\n", + " dfs.append(df)\n", + "train_df=dfs[0]\n", + "for df in tqdm_notebook(dfs[1:]):\n", + " train_df=test_df.merge(df,on='PatientID')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5fe21e059b0f4fff9decf2a356119816", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=78545), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "PID = np.zeros(train_df.shape[0],dtype=object)\n", + "StudyI = np.zeros(train_df.shape[0],dtype=object)\n", + "SeriesI = np.zeros(train_df.shape[0],dtype=object)\n", + "WindowCenter = np.zeros(train_df.shape[0],dtype=object)\n", + "WindowWidth = np.zeros(train_df.shape[0],dtype=object)\n", + "ImagePositionX = np.zeros(train_df.shape[0],dtype=np.float)\n", + "ImagePositionY = np.zeros(train_df.shape[0],dtype=np.float)\n", + "ImagePositionZ = np.zeros(train_df.shape[0],dtype=np.float)\n", + "\n", + "for i,row in tqdm_notebook(train_df.iterrows(),total=train_df.shape[0]):\n", + " ds = pydicom.dcmread(train_images_dir + 'ID_{}.dcm'.format(row['PatientID']))\n", + " SeriesI[i]=ds.SeriesInstanceUID\n", + " PID[i]=ds.PatientID\n", + " StudyI[i]=ds.StudyInstanceUID \n", + " WindowCenter[i]=ds.WindowCenter\n", + " WindowWidth[i]=ds.WindowWidth\n", + " ImagePositionX[i]=float(ds.ImagePositionPatient[0])\n", + " ImagePositionY[i]=float(ds.ImagePositionPatient[1])\n", + " ImagePositionZ[i]=float(ds.ImagePositionPatient[2])\n", + "train_df['SeriesI']=SeriesI\n", + "train_df['PID']=PID\n", + "train_df['StudyI']=StudyI\n", + "train_df['WindowCenter']=WindowCenter\n", + "train_df['WindowWidth']=WindowWidth\n", + "train_df['ImagePositionZ']=ImagePositionZ\n", + "train_df['ImagePositionX']=ImagePositionX\n", + "train_df['ImagePositionY']=ImagePositionY\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df.to_csv(data_dir+'train.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>Label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_28fbab7eb_epidural</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_28fbab7eb_intraparenchymal</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_28fbab7eb_intraventricular</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_28fbab7eb_subarachnoid</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_28fbab7eb_subdural</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID Label\n", + "0 ID_28fbab7eb_epidural 0.5\n", + "1 ID_28fbab7eb_intraparenchymal 0.5\n", + "2 ID_28fbab7eb_intraventricular 0.5\n", + "3 ID_28fbab7eb_subarachnoid 0.5\n", + "4 ID_28fbab7eb_subdural 0.5" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_submission=pd.read_csv(data_dir+'stage_1_sample_submission.csv')\n", + "sample_submission.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>Label</th>\n", + " <th>Sub_type</th>\n", + " <th>PatientID</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_28fbab7eb_epidural</td>\n", + " <td>0.5</td>\n", + " <td>epidural</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_28fbab7eb_intraparenchymal</td>\n", + " <td>0.5</td>\n", + " <td>intraparenchymal</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_28fbab7eb_intraventricular</td>\n", + " <td>0.5</td>\n", + " <td>intraventricular</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_28fbab7eb_subarachnoid</td>\n", + " <td>0.5</td>\n", + " <td>subarachnoid</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_28fbab7eb_subdural</td>\n", + " <td>0.5</td>\n", + " <td>subdural</td>\n", + " <td>28fbab7eb</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ID Label Sub_type PatientID\n", + "0 ID_28fbab7eb_epidural 0.5 epidural 28fbab7eb\n", + "1 ID_28fbab7eb_intraparenchymal 0.5 intraparenchymal 28fbab7eb\n", + "2 ID_28fbab7eb_intraventricular 0.5 intraventricular 28fbab7eb\n", + "3 ID_28fbab7eb_subarachnoid 0.5 subarachnoid 28fbab7eb\n", + "4 ID_28fbab7eb_subdural 0.5 subdural 28fbab7eb" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_base_df=sample_submission.copy()\n", + "test_base_df['Sub_type'] = test_base_df['ID'].str.split(\"_\", n = 3, expand = True)[2]\n", + "test_base_df['PatientID'] = test_base_df['ID'].str.split(\"_\", n = 3, expand = True)[1]\n", + "test_base_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": {}, + "outputs": [], + "source": [ + "test_ids=test_df.PatientID.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',\n", + " 'subdural', 'any'], dtype=object)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sub_types=test_base_df.Sub_type.unique()\n", + "sub_types" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f153620a2b61441bb7cb7f1acf392844", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=6), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "189e8ac701514c2c963161608d43a459", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dfs =[]\n", + "for sub_type in tqdm_notebook(sub_types):\n", + " df = test_base_df[test_base_df.Sub_type==sub_type][['PatientID','Label']].copy()\n", + " df=df.rename(columns={\"Label\": sub_type}).reset_index(drop=True)\n", + " dfs.append(df)\n", + "test_df=dfs[0]\n", + "for df in tqdm_notebook(dfs[1:]):\n", + " test_df=test_df.merge(df,on='PatientID')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PatientID</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " <th>any</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>28fbab7eb</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>877923b8b</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>a591477cb</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42217c898</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>a130c4d2f</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " <td>0.5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " PatientID epidural intraparenchymal intraventricular subarachnoid \\\n", + "0 28fbab7eb 0.5 0.5 0.5 0.5 \n", + "1 877923b8b 0.5 0.5 0.5 0.5 \n", + "2 a591477cb 0.5 0.5 0.5 0.5 \n", + "3 42217c898 0.5 0.5 0.5 0.5 \n", + "4 a130c4d2f 0.5 0.5 0.5 0.5 \n", + "\n", + " subdural any \n", + "0 0.5 0.5 \n", + "1 0.5 0.5 \n", + "2 0.5 0.5 \n", + "3 0.5 0.5 \n", + "4 0.5 0.5 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5fe21e059b0f4fff9decf2a356119816", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=78545), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "PID = np.zeros(test_df.shape[0],dtype=object)\n", + "StudyI = np.zeros(test_df.shape[0],dtype=object)\n", + "SeriesI = np.zeros(test_df.shape[0],dtype=object)\n", + "WindowCenter = np.zeros(test_df.shape[0],dtype=object)\n", + "WindowWidth = np.zeros(test_df.shape[0],dtype=object)\n", + "ImagePositionX = np.zeros(test_df.shape[0],dtype=np.float)\n", + "ImagePositionY = np.zeros(test_df.shape[0],dtype=np.float)\n", + "ImagePositionZ = np.zeros(test_df.shape[0],dtype=np.float)\n", + "\n", + "for i,row in tqdm_notebook(test_df.iterrows(),total=test_df.shape[0]):\n", + " ds = pydicom.dcmread(test_images_dir + 'ID_{}.dcm'.format(row['PatientID']))\n", + " SeriesI[i]=ds.SeriesInstanceUID\n", + " PID[i]=ds.PatientID\n", + " StudyI[i]=ds.StudyInstanceUID \n", + " WindowCenter[i]=ds.WindowCenter\n", + " WindowWidth[i]=ds.WindowWidth\n", + " ImagePositionX[i]=float(ds.ImagePositionPatient[0])\n", + " ImagePositionY[i]=float(ds.ImagePositionPatient[1])\n", + " ImagePositionZ[i]=float(ds.ImagePositionPatient[2])\n", + "test_df['SeriesI']=SeriesI\n", + "test_df['PID']=PID\n", + "test_df['StudyI']=StudyI\n", + "test_df['WindowCenter']=WindowCenter\n", + "test_df['WindowWidth']=WindowWidth\n", + "test_df['ImagePositionZ']=ImagePositionZ\n", + "test_df['ImagePositionX']=ImagePositionX\n", + "test_df['ImagePositionY']=ImagePositionY\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.to_csv(data_dir+'test.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}