--- a +++ b/eda/hunt-for-bad-actors.ipynb @@ -0,0 +1,533 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can read the DICOM metadata - especially \"Window Center\", \"Window Width\", \"Rescale Intercept\" and \"Rescale Slope\" - for data pre-processing.\n", + "Head CT are a center of 40 and a width of 80" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "from multiprocessing import Process\n", + "import concurrent.futures\n", + "import ast\n", + "import csv\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pydicom\n", + "import re\n", + "import PIL\n", + "from PIL import Image\n", + "from random import randrange\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "# import seaborn as sns\n", + "\n", + "train_csv_path = '../src/training.csv'\n", + "validate_csv_path = '../src/validation.csv'\n", + "test_csv_path = '../src/testing.csv'\n", + "\n", + "tony_data_path = ''\n", + "chris_data_path = ''\n", + "kyle_data_path = '/media/keil/baltar/intracranial-hemorrhage-detection-data/stage_1_train_images/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#dicoms to avoid\n", + "bad_actors = []" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9ea75e7e82e54a8d9a809ec1701e4a09", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=674258), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ID_dfa4e344a.dcm (638, 490)\n", + "ID_12e3b6923.dcm (430, 404)\n", + "ID_cef2af72d.dcm (638, 490)\n", + "ID_046ba342c.dcm (638, 490)\n", + "ID_680b2194c.dcm (666, 512)\n", + "ID_1bb3b44c7.dcm (638, 490)\n", + "ID_567a36143.dcm (436, 436)\n", + "ID_b4adf8739.dcm (462, 462)\n", + "ID_8d0ca7742.dcm (430, 404)\n", + "ID_db48a633d.dcm (666, 512)\n", + "ID_9cdc7295b.dcm (768, 768)\n", + "ID_c964e4096.dcm (666, 512)\n", + "ID_ca9462f49.dcm (462, 462)\n", + "ID_191369dca.dcm (408, 374)\n", + "ID_ac39010dc.dcm (430, 404)\n", + "ID_0e9ac1c5f.dcm (464, 464)\n", + "ID_d9840380c.dcm (464, 464)\n", + "ID_66131f4c9.dcm (408, 374)\n", + "ID_676b0cb59.dcm (768, 768)\n", + "ID_b76b13444.dcm (666, 512)\n", + "ID_898ff55b6.dcm (462, 462)\n", + "ID_37c495912.dcm (462, 462)\n", + "ID_a432727fd.dcm (464, 464)\n", + "ID_1bc5771a7.dcm (638, 490)\n", + "ID_69974dd3e.dcm (408, 374)\n", + "ID_3e31d57d0.dcm (666, 512)\n", + "ID_91c508c7a.dcm (462, 462)\n", + "ID_c35d5c858.dcm (430, 404)\n", + "ID_d6435f3bf.dcm (666, 512)\n", + "ID_b77ba3355.dcm (464, 464)\n", + "ID_a23a8193f.dcm (638, 490)\n", + "ID_9dad2eb09.dcm (666, 512)\n", + "ID_ea0ddbaf9.dcm (408, 374)\n", + "ID_d0c52575a.dcm (464, 464)\n", + "ID_c64131283.dcm (768, 768)\n", + "ID_f0d55b727.dcm (462, 462)\n", + "ID_1690a6499.dcm (436, 436)\n", + "ID_a2e178cc7.dcm (436, 436)\n", + "ID_950a06268.dcm (638, 490)\n", + "ID_cec3997fa.dcm (638, 490)\n", + "ID_3cb1b59bc.dcm (430, 404)\n", + "ID_3c8b72361.dcm (638, 490)\n", + "ID_ae1689e1b.dcm (436, 436)\n", + "ID_394ffb5fd.dcm (408, 374)\n", + "ID_fdbfb2c17.dcm (666, 512)\n", + "ID_94463e98f.dcm (462, 462)\n", + "ID_80a2dbc4a.dcm (768, 768)\n", + "ID_3274f5977.dcm (666, 512)\n", + "ID_f23f8e617.dcm (768, 768)\n", + "ID_8fd6d5047.dcm (768, 768)\n", + "ID_9b297fa83.dcm (408, 374)\n", + "ID_19f266244.dcm (638, 490)\n", + "ID_8caa68ebd.dcm (408, 374)\n", + "ID_079945c27.dcm (666, 512)\n", + "ID_c60e34466.dcm (768, 768)\n", + "ID_d1afb9750.dcm (436, 436)\n", + "ID_1291d1943.dcm (464, 464)\n", + "ID_6b1a86148.dcm (638, 490)\n", + "ID_7e756c43b.dcm (638, 490)\n", + "ID_142f85eb8.dcm (464, 464)\n", + "ID_d7229490a.dcm (462, 462)\n", + "ID_17103c79e.dcm (430, 404)\n", + "ID_8756b0c04.dcm (768, 768)\n", + "ID_28d6a694f.dcm (408, 374)\n", + "ID_8dc299456.dcm (638, 490)\n", + "ID_61c646098.dcm (436, 436)\n", + "ID_9a3bba619.dcm (430, 404)\n", + "ID_de10fdac2.dcm (768, 768)\n", + "ID_b19f52c76.dcm (436, 436)\n", + "ID_25de55880.dcm (462, 462)\n", + "ID_b494c2115.dcm (430, 404)\n", + "ID_a1bb9bc26.dcm (464, 464)\n", + "ID_2b3671dd9.dcm (666, 512)\n", + "ID_f1fe5334e.dcm (464, 464)\n", + "ID_3d5d23058.dcm (464, 464)\n", + "ID_21053fe7e.dcm (430, 404)\n", + "ID_b12bb2b16.dcm (638, 490)\n", + "ID_15b3ba199.dcm (462, 462)\n", + "ID_c2738e8b1.dcm (436, 436)\n", + "ID_0de0ab1d8.dcm (768, 768)\n", + "ID_038f966b9.dcm (408, 374)\n", + "ID_184c541fa.dcm (768, 768)\n", + "ID_5bf2ca43f.dcm (666, 512)\n", + "ID_ae691dd29.dcm (638, 490)\n", + "ID_a3feeadf4.dcm (408, 374)\n", + "ID_c6f2d84be.dcm (464, 464)\n", + "ID_176e4f16d.dcm (638, 490)\n", + "ID_6c19c9f7b.dcm (666, 512)\n", + "ID_291edd834.dcm (768, 768)\n", + "ID_b76de950b.dcm (436, 436)\n", + "ID_6d7a27643.dcm (462, 462)\n", + "ID_19306ecc5.dcm (464, 464)\n", + "ID_6a939bc17.dcm (408, 374)\n", + "ID_8144c7120.dcm (430, 404)\n", + "ID_b8665a653.dcm (436, 436)\n", + "ID_d7777de78.dcm (430, 404)\n", + "ID_1e633cf27.dcm (464, 464)\n", + "ID_cbbb50e6d.dcm (666, 512)\n", + "ID_6dcedd2e1.dcm (430, 404)\n", + "ID_c07d2cb73.dcm (638, 490)\n", + "ID_76f88846f.dcm (768, 768)\n", + "ID_b194d2a23.dcm (462, 462)\n", + "ID_f03370d7c.dcm (638, 490)\n", + "ID_7714ead69.dcm (666, 512)\n", + "ID_53f460f86.dcm (768, 768)\n", + "ID_631f0b556.dcm (638, 490)\n", + "ID_7917d368d.dcm (768, 768)\n", + "ID_11c4f9f91.dcm (462, 462)\n", + "ID_10fe2031e.dcm (408, 374)\n", + "ID_2fd4dda7c.dcm (408, 374)\n", + "ID_ff012ee5b.dcm (666, 512)\n", + "ID_985fb5e49.dcm (638, 490)\n", + "ID_a9ab8569f.dcm (408, 374)\n", + "ID_75e3f7e5a.dcm (638, 490)\n", + "ID_f145c3cf4.dcm (768, 768)\n", + "ID_9da128021.dcm (436, 436)\n", + "ID_155b9c546.dcm (436, 436)\n", + "ID_c51cbe76b.dcm (768, 768)\n", + "ID_fd5c41761.dcm (464, 464)\n", + "ID_d1a1c9a6c.dcm (436, 436)\n", + "ID_445a92ac2.dcm (638, 490)\n", + "ID_2ac7f01ed.dcm (408, 374)\n", + "ID_73dee8958.dcm (768, 768)\n", + "ID_830f46cad.dcm (408, 374)\n", + "ID_dfaa49f5c.dcm (462, 462)\n", + "ID_09aeb0bbd.dcm (462, 462)\n", + "ID_317330708.dcm (430, 404)\n", + "ID_88b0d8b4f.dcm (638, 490)\n", + "ID_3f422852d.dcm (666, 512)\n", + "ID_a3128aa77.dcm (408, 374)\n", + "ID_7a02fdbea.dcm (666, 512)\n", + "ID_4f0317d23.dcm (430, 404)\n", + "ID_57d6a6455.dcm (768, 768)\n", + "ID_56ecdf5c1.dcm (464, 464)\n", + "ID_abcd58e88.dcm (436, 436)\n", + "ID_362423b57.dcm (430, 404)\n", + "ID_66accd2e4.dcm (430, 404)\n", + "ID_97e5a203e.dcm (430, 404)\n", + "ID_798d956d0.dcm (638, 490)\n", + "ID_97cd49666.dcm (408, 374)\n", + "ID_5dbe845c1.dcm (436, 436)\n", + "ID_6f92e4481.dcm (462, 462)\n", + "ID_28c4609b3.dcm (462, 462)\n", + "ID_aef6c6df9.dcm (464, 464)\n", + "ID_3bc141392.dcm (408, 374)\n", + "ID_882cd57de.dcm (638, 490)\n", + "ID_91b9ce430.dcm (464, 464)\n", + "ID_fe7327fab.dcm (436, 436)\n", + "ID_a7e689932.dcm (638, 490)\n", + "ID_d1b2d9ad0.dcm (436, 436)\n", + "ID_75cbdae68.dcm (666, 512)\n", + "ID_3e60e696d.dcm (638, 490)\n", + "ID_4e61fb0b2.dcm (638, 490)\n", + "ID_dd3b5bf4e.dcm (430, 404)\n", + "ID_72dce7784.dcm (638, 490)\n", + "ID_c11582dc9.dcm (436, 436)\n", + "ID_6b15a7649.dcm (430, 404)\n", + "ID_04280250b.dcm (408, 374)\n", + "ID_cb970c6dc.dcm (462, 462)\n", + "ID_ea2861e9a.dcm (666, 512)\n", + "ID_dabc2a818.dcm (464, 464)\n", + "ID_4c9fb82af.dcm (436, 436)\n", + "ID_85900eb84.dcm (408, 374)\n", + "ID_d4ea87a35.dcm (430, 404)\n", + "ID_3ba8a116c.dcm (436, 436)\n", + "ID_8f5d4b696.dcm (430, 404)\n", + "ID_0c4987103.dcm (462, 462)\n", + "ID_0603b315e.dcm (408, 374)\n", + "ID_9ece1bb21.dcm (464, 464)\n", + "ID_c1ff9eb46.dcm (768, 768)\n", + "ID_c6bbec638.dcm (464, 464)\n", + "ID_845f922f4.dcm (436, 436)\n", + "ID_7940bb7d0.dcm (638, 490)\n", + "ID_c4575f13b.dcm (430, 404)\n", + "ID_f188940f9.dcm (638, 490)\n", + "ID_9b68c3f5f.dcm (436, 436)\n", + "ID_75d691728.dcm (638, 490)\n", + "ID_8fde47d9f.dcm (408, 374)\n", + "ID_155249efa.dcm (436, 436)\n", + "ID_6fbc30b5d.dcm (408, 374)\n", + "ID_d3b76ef6e.dcm (638, 490)\n", + "ID_6508563e0.dcm (464, 464)\n", + "ID_ac47ba810.dcm (638, 490)\n", + "ID_7c08b7fb7.dcm (430, 404)\n", + "ID_f4891876d.dcm (430, 404)\n", + "ID_b055aafa9.dcm (408, 374)\n", + "ID_22069463a.dcm (768, 768)\n", + "ID_081f4d071.dcm (408, 374)\n", + "ID_dd083e12a.dcm (638, 490)\n", + "ID_b1cea5abb.dcm (666, 512)\n", + "ID_e4b636907.dcm (666, 512)\n", + "ID_23d0b13b7.dcm (464, 464)\n", + "ID_ff9674e53.dcm (462, 462)\n", + "ID_36ab2e72a.dcm (436, 436)\n", + "ID_5ffae2e26.dcm (768, 768)\n", + "ID_6cc19ac41.dcm (408, 374)\n", + "ID_0b0e59911.dcm (430, 404)\n", + "ID_64b44f180.dcm (408, 374)\n", + "ID_ae7020fd1.dcm (638, 490)\n", + "ID_61d2718d2.dcm (462, 462)\n", + "ID_942e2f95b.dcm (638, 490)\n", + "ID_cade293be.dcm (462, 462)\n", + "ID_ab474037b.dcm (464, 464)\n", + "ID_d3fd5220e.dcm (408, 374)\n", + "ID_55f7bbbf2.dcm (666, 512)\n", + "ID_4e14d0fe8.dcm (666, 512)\n", + "ID_68e45bca7.dcm (768, 768)\n", + "ID_6cb797177.dcm (430, 404)\n", + "ID_7e870621c.dcm (462, 462)\n", + "ID_5ab140176.dcm (768, 768)\n", + "ID_8c5fc9e44.dcm (666, 512)\n", + "ID_3eb407dd8.dcm (464, 464)\n", + "ID_3d7a23dbb.dcm (436, 436)\n", + "ID_c6463f07d.dcm (462, 462)\n", + "ID_18aac96c0.dcm (430, 404)\n", + "ID_f698edc00.dcm (464, 464)\n", + "ID_b966185b8.dcm (462, 462)\n", + "ID_ca4a832a1.dcm (464, 464)\n", + "ID_60a1f0e24.dcm (462, 462)\n", + "ID_10f34fb10.dcm (638, 490)\n", + "ID_def2a0e9f.dcm (638, 490)\n", + "ID_f22730d7b.dcm (638, 490)\n", + "ID_12a0d6d34.dcm (430, 404)\n", + "ID_f4c2157d8.dcm (638, 490)\n", + "ID_0e1861e6d.dcm (638, 490)\n", + "ID_44d57858e.dcm (666, 512)\n", + "ID_be3fb6c17.dcm (436, 436)\n", + "ID_ae7b11865.dcm (430, 404)\n", + "ID_b9938c32c.dcm (436, 436)\n", + "ID_d77fa1286.dcm (462, 462)\n", + "ID_8a35660d5.dcm (430, 404)\n", + "ID_842e85173.dcm (462, 462)\n", + "ID_a880e377e.dcm (462, 462)\n" + ] + }, + { + "ename": "ValueError", + "evalue": "The length of the pixel data in the dataset (153710 bytes) doesn't match the expected length (524288 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-5-d37fcb90a2a6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mwriter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwriterow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdicom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'corrupted'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mimg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpixel_array\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mimg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m512\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m512\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdicom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mimg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mpixel_array\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1360\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mPixel\u001b[0m \u001b[0mData\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0mFE0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m00\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0ma\u001b[0m \u001b[0mNumPy\u001b[0m \u001b[0mndarray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1361\u001b[0m \"\"\"\n\u001b[0;32m-> 1362\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconvert_pixel_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1363\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pixel_array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mconvert_pixel_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1306\u001b[0m )\n\u001b[1;32m 1307\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1308\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mlast_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1309\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1310\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecompress\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mconvert_pixel_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1275\u001b[0m \u001b[0;31m# Use the handler to get a 1D numpy array of the pixel data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1276\u001b[0;31m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_pixeldata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1277\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pixel_array\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreshape_pixel_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/pixel_data_handlers/numpy_handler.py\u001b[0m in \u001b[0;36mget_pixeldata\u001b[0;34m(ds, read_only)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0;34m\"The dataset may be corrupted or there may be an issue \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;34m\"with the pixel data handler.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactual_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadded_expected_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m )\n\u001b[1;32m 259\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mactual_length\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mpadded_expected_len\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: The length of the pixel data in the dataset (153710 bytes) doesn't match the expected length (524288 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler." + ] + } + ], + "source": [ + "with open('bad_actors.csv', 'w') as csv_file:\n", + " writer = csv.writer(csv_file, delimiter=',')\n", + " writer.writerow(['id', 'cause'])\n", + " for dicom in tqdm(os.listdir(kyle_data_path)):\n", + " try:\n", + " data = pydicom.dcmread(kyle_data_path+dicom)\n", + " except:\n", + " writer.writerow([dicom, 'corrupted'])\n", + " img = np.array(data.pixel_array, dtype=float)\n", + " if img.shape != (512, 512):\n", + " print(dicom, img.shape)\n", + " writer.writerow([dicom,img.shape])\n", + " \n", + "\n", + "\n", + "\n", + "# from multiprocessing.pool import ThreadPool as Pool\n", + "\n", + "# pool_size = 5 \n", + "# # define worker function before a Pool is instantiated\n", + "# def worker(dicom):\n", + "# try:\n", + "# data = pydicom.dcmread(kyle_data_path+dicom)\n", + "# img = np.array(data.pixel_array, dtype=float)\n", + "# except:\n", + "# writer.writerow([dicom, 'corrupted'])\n", + " \n", + "# if img.shape != (512, 512):\n", + "# print(dicom, img.shape)\n", + "# writer.writerow([dicom,img.shape])\n", + "\n", + "\n", + "# pool = Pool(pool_size)\n", + "\n", + "# with open('bad_actors.csv', 'w') as csv_file:\n", + "# writer = csv.writer(csv_file, delimiter=',')\n", + "# writer.writerow(['id', 'cause'])\n", + " \n", + "# for dicom in tqdm(os.listdir(kyle_data_path)):\n", + "# data = pool.apply_async(worker, (dicom,))\n", + "\n", + "# pool.close()\n", + "# pool.join()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "674258\n" + ] + } + ], + "source": [ + "print(len(set(bad_actors)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper Functions\n", + "\n", + "def translate_dicom(filename, path=kyle_data_path, apply_window=True):\n", + " \"\"\"\n", + " Transform a medical DICOM file to a standardized pixel based array\n", + " Arguments:\n", + " filename {string}\n", + " path {string} -- file path to data, set in config.ini\n", + " apply_window {bool} -- if True (default) then windowed png of dicom data is returned\n", + " \"\"\"\n", + " \n", + " data = pydicom.dcmread(path + filename)\n", + " \n", + " if apply_window:\n", + " window_center, window_width, intercept, slope = get_windowing(data)\n", + " img = window_image(data.pixel_array, window_center, window_width, intercept, slope)\n", + " return np.array(img, dtype=float)\n", + "\n", + " img = np.array(data.pixel_array, dtype=float)\n", + " standardized_array = np.divide(np.subtract(img,img.mean()),img.std())\n", + " return standardized_array\n", + "\n", + "def window_image(img, window_center, window_width, intercept, slope):\n", + " \"\"\"\n", + " Given a CT scan img apply a windowing to the image\n", + " Arguments:\n", + " img {np.array} -- array of a dicom img processed by pydicom.dcmread()\n", + " window_center,window_width,intercept,slope {floats} -- values provided by dicom file metadata\n", + " \"\"\"\n", + " img = (img * slope + intercept)\n", + " img_min = window_center - window_width // 2\n", + " img_max = window_center + window_width // 2\n", + " img[img < img_min] = img_min\n", + " img[img > img_max] = img_max\n", + " return img \n", + "\n", + "def get_first_of_dicom_field_as_int(x):\n", + " #get x[0] as in int is x is a 'pydicom.multival.MultiValue', otherwise get int(x)\n", + " if type(x) == pydicom.multival.MultiValue:\n", + " return int(x[0])\n", + " else:\n", + " return int(x)\n", + "\n", + "def get_windowing(data):\n", + " dicom_fields = [data.WindowCenter,\n", + " data.WindowWidth,\n", + " data.RescaleIntercept,\n", + " data.RescaleSlope]\n", + " return (get_first_of_dicom_field_as_int(x) for x in dicom_fields)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}