Switch to side-by-side view

--- a
+++ b/eda/hunt-for-bad-actors.ipynb
@@ -0,0 +1,533 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can read the DICOM metadata - especially \"Window Center\", \"Window Width\", \"Rescale Intercept\" and \"Rescale Slope\" - for data pre-processing.\n",
+    "For head CT, the standard brain window has a center of 40 and a width of 80."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "from multiprocessing import Process\n",
+    "import concurrent.futures\n",
+    "import ast\n",
+    "import csv\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pydicom\n",
+    "import re\n",
+    "import PIL\n",
+    "from PIL import Image\n",
+    "from random import randrange\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "# import seaborn as sns\n",
+    "\n",
+    "# Relative paths to the training / validation / testing CSV files.\n",
+    "train_csv_path = '../src/training.csv'\n",
+    "validate_csv_path = '../src/validation.csv'\n",
+    "test_csv_path = '../src/testing.csv'\n",
+    "\n",
+    "# Per-user locations of the DICOM image directory; only kyle_data_path is filled in.\n",
+    "# NOTE(review): kyle_data_path is an absolute, machine-specific path ending in a slash;\n",
+    "# the cells below build file paths from it, so it will not resolve on another machine.\n",
+    "tony_data_path = ''\n",
+    "chris_data_path = ''\n",
+    "kyle_data_path = '/media/keil/baltar/intracranial-hemorrhage-detection-data/stage_1_train_images/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DICOM files to avoid.\n",
+    "# NOTE(review): starts empty and none of the visible cells append to it --\n",
+    "# confirm where it is meant to be filled.\n",
+    "bad_actors = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9ea75e7e82e54a8d9a809ec1701e4a09",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=674258), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ID_dfa4e344a.dcm (638, 490)\n",
+      "ID_12e3b6923.dcm (430, 404)\n",
+      "ID_cef2af72d.dcm (638, 490)\n",
+      "ID_046ba342c.dcm (638, 490)\n",
+      "ID_680b2194c.dcm (666, 512)\n",
+      "ID_1bb3b44c7.dcm (638, 490)\n",
+      "ID_567a36143.dcm (436, 436)\n",
+      "ID_b4adf8739.dcm (462, 462)\n",
+      "ID_8d0ca7742.dcm (430, 404)\n",
+      "ID_db48a633d.dcm (666, 512)\n",
+      "ID_9cdc7295b.dcm (768, 768)\n",
+      "ID_c964e4096.dcm (666, 512)\n",
+      "ID_ca9462f49.dcm (462, 462)\n",
+      "ID_191369dca.dcm (408, 374)\n",
+      "ID_ac39010dc.dcm (430, 404)\n",
+      "ID_0e9ac1c5f.dcm (464, 464)\n",
+      "ID_d9840380c.dcm (464, 464)\n",
+      "ID_66131f4c9.dcm (408, 374)\n",
+      "ID_676b0cb59.dcm (768, 768)\n",
+      "ID_b76b13444.dcm (666, 512)\n",
+      "ID_898ff55b6.dcm (462, 462)\n",
+      "ID_37c495912.dcm (462, 462)\n",
+      "ID_a432727fd.dcm (464, 464)\n",
+      "ID_1bc5771a7.dcm (638, 490)\n",
+      "ID_69974dd3e.dcm (408, 374)\n",
+      "ID_3e31d57d0.dcm (666, 512)\n",
+      "ID_91c508c7a.dcm (462, 462)\n",
+      "ID_c35d5c858.dcm (430, 404)\n",
+      "ID_d6435f3bf.dcm (666, 512)\n",
+      "ID_b77ba3355.dcm (464, 464)\n",
+      "ID_a23a8193f.dcm (638, 490)\n",
+      "ID_9dad2eb09.dcm (666, 512)\n",
+      "ID_ea0ddbaf9.dcm (408, 374)\n",
+      "ID_d0c52575a.dcm (464, 464)\n",
+      "ID_c64131283.dcm (768, 768)\n",
+      "ID_f0d55b727.dcm (462, 462)\n",
+      "ID_1690a6499.dcm (436, 436)\n",
+      "ID_a2e178cc7.dcm (436, 436)\n",
+      "ID_950a06268.dcm (638, 490)\n",
+      "ID_cec3997fa.dcm (638, 490)\n",
+      "ID_3cb1b59bc.dcm (430, 404)\n",
+      "ID_3c8b72361.dcm (638, 490)\n",
+      "ID_ae1689e1b.dcm (436, 436)\n",
+      "ID_394ffb5fd.dcm (408, 374)\n",
+      "ID_fdbfb2c17.dcm (666, 512)\n",
+      "ID_94463e98f.dcm (462, 462)\n",
+      "ID_80a2dbc4a.dcm (768, 768)\n",
+      "ID_3274f5977.dcm (666, 512)\n",
+      "ID_f23f8e617.dcm (768, 768)\n",
+      "ID_8fd6d5047.dcm (768, 768)\n",
+      "ID_9b297fa83.dcm (408, 374)\n",
+      "ID_19f266244.dcm (638, 490)\n",
+      "ID_8caa68ebd.dcm (408, 374)\n",
+      "ID_079945c27.dcm (666, 512)\n",
+      "ID_c60e34466.dcm (768, 768)\n",
+      "ID_d1afb9750.dcm (436, 436)\n",
+      "ID_1291d1943.dcm (464, 464)\n",
+      "ID_6b1a86148.dcm (638, 490)\n",
+      "ID_7e756c43b.dcm (638, 490)\n",
+      "ID_142f85eb8.dcm (464, 464)\n",
+      "ID_d7229490a.dcm (462, 462)\n",
+      "ID_17103c79e.dcm (430, 404)\n",
+      "ID_8756b0c04.dcm (768, 768)\n",
+      "ID_28d6a694f.dcm (408, 374)\n",
+      "ID_8dc299456.dcm (638, 490)\n",
+      "ID_61c646098.dcm (436, 436)\n",
+      "ID_9a3bba619.dcm (430, 404)\n",
+      "ID_de10fdac2.dcm (768, 768)\n",
+      "ID_b19f52c76.dcm (436, 436)\n",
+      "ID_25de55880.dcm (462, 462)\n",
+      "ID_b494c2115.dcm (430, 404)\n",
+      "ID_a1bb9bc26.dcm (464, 464)\n",
+      "ID_2b3671dd9.dcm (666, 512)\n",
+      "ID_f1fe5334e.dcm (464, 464)\n",
+      "ID_3d5d23058.dcm (464, 464)\n",
+      "ID_21053fe7e.dcm (430, 404)\n",
+      "ID_b12bb2b16.dcm (638, 490)\n",
+      "ID_15b3ba199.dcm (462, 462)\n",
+      "ID_c2738e8b1.dcm (436, 436)\n",
+      "ID_0de0ab1d8.dcm (768, 768)\n",
+      "ID_038f966b9.dcm (408, 374)\n",
+      "ID_184c541fa.dcm (768, 768)\n",
+      "ID_5bf2ca43f.dcm (666, 512)\n",
+      "ID_ae691dd29.dcm (638, 490)\n",
+      "ID_a3feeadf4.dcm (408, 374)\n",
+      "ID_c6f2d84be.dcm (464, 464)\n",
+      "ID_176e4f16d.dcm (638, 490)\n",
+      "ID_6c19c9f7b.dcm (666, 512)\n",
+      "ID_291edd834.dcm (768, 768)\n",
+      "ID_b76de950b.dcm (436, 436)\n",
+      "ID_6d7a27643.dcm (462, 462)\n",
+      "ID_19306ecc5.dcm (464, 464)\n",
+      "ID_6a939bc17.dcm (408, 374)\n",
+      "ID_8144c7120.dcm (430, 404)\n",
+      "ID_b8665a653.dcm (436, 436)\n",
+      "ID_d7777de78.dcm (430, 404)\n",
+      "ID_1e633cf27.dcm (464, 464)\n",
+      "ID_cbbb50e6d.dcm (666, 512)\n",
+      "ID_6dcedd2e1.dcm (430, 404)\n",
+      "ID_c07d2cb73.dcm (638, 490)\n",
+      "ID_76f88846f.dcm (768, 768)\n",
+      "ID_b194d2a23.dcm (462, 462)\n",
+      "ID_f03370d7c.dcm (638, 490)\n",
+      "ID_7714ead69.dcm (666, 512)\n",
+      "ID_53f460f86.dcm (768, 768)\n",
+      "ID_631f0b556.dcm (638, 490)\n",
+      "ID_7917d368d.dcm (768, 768)\n",
+      "ID_11c4f9f91.dcm (462, 462)\n",
+      "ID_10fe2031e.dcm (408, 374)\n",
+      "ID_2fd4dda7c.dcm (408, 374)\n",
+      "ID_ff012ee5b.dcm (666, 512)\n",
+      "ID_985fb5e49.dcm (638, 490)\n",
+      "ID_a9ab8569f.dcm (408, 374)\n",
+      "ID_75e3f7e5a.dcm (638, 490)\n",
+      "ID_f145c3cf4.dcm (768, 768)\n",
+      "ID_9da128021.dcm (436, 436)\n",
+      "ID_155b9c546.dcm (436, 436)\n",
+      "ID_c51cbe76b.dcm (768, 768)\n",
+      "ID_fd5c41761.dcm (464, 464)\n",
+      "ID_d1a1c9a6c.dcm (436, 436)\n",
+      "ID_445a92ac2.dcm (638, 490)\n",
+      "ID_2ac7f01ed.dcm (408, 374)\n",
+      "ID_73dee8958.dcm (768, 768)\n",
+      "ID_830f46cad.dcm (408, 374)\n",
+      "ID_dfaa49f5c.dcm (462, 462)\n",
+      "ID_09aeb0bbd.dcm (462, 462)\n",
+      "ID_317330708.dcm (430, 404)\n",
+      "ID_88b0d8b4f.dcm (638, 490)\n",
+      "ID_3f422852d.dcm (666, 512)\n",
+      "ID_a3128aa77.dcm (408, 374)\n",
+      "ID_7a02fdbea.dcm (666, 512)\n",
+      "ID_4f0317d23.dcm (430, 404)\n",
+      "ID_57d6a6455.dcm (768, 768)\n",
+      "ID_56ecdf5c1.dcm (464, 464)\n",
+      "ID_abcd58e88.dcm (436, 436)\n",
+      "ID_362423b57.dcm (430, 404)\n",
+      "ID_66accd2e4.dcm (430, 404)\n",
+      "ID_97e5a203e.dcm (430, 404)\n",
+      "ID_798d956d0.dcm (638, 490)\n",
+      "ID_97cd49666.dcm (408, 374)\n",
+      "ID_5dbe845c1.dcm (436, 436)\n",
+      "ID_6f92e4481.dcm (462, 462)\n",
+      "ID_28c4609b3.dcm (462, 462)\n",
+      "ID_aef6c6df9.dcm (464, 464)\n",
+      "ID_3bc141392.dcm (408, 374)\n",
+      "ID_882cd57de.dcm (638, 490)\n",
+      "ID_91b9ce430.dcm (464, 464)\n",
+      "ID_fe7327fab.dcm (436, 436)\n",
+      "ID_a7e689932.dcm (638, 490)\n",
+      "ID_d1b2d9ad0.dcm (436, 436)\n",
+      "ID_75cbdae68.dcm (666, 512)\n",
+      "ID_3e60e696d.dcm (638, 490)\n",
+      "ID_4e61fb0b2.dcm (638, 490)\n",
+      "ID_dd3b5bf4e.dcm (430, 404)\n",
+      "ID_72dce7784.dcm (638, 490)\n",
+      "ID_c11582dc9.dcm (436, 436)\n",
+      "ID_6b15a7649.dcm (430, 404)\n",
+      "ID_04280250b.dcm (408, 374)\n",
+      "ID_cb970c6dc.dcm (462, 462)\n",
+      "ID_ea2861e9a.dcm (666, 512)\n",
+      "ID_dabc2a818.dcm (464, 464)\n",
+      "ID_4c9fb82af.dcm (436, 436)\n",
+      "ID_85900eb84.dcm (408, 374)\n",
+      "ID_d4ea87a35.dcm (430, 404)\n",
+      "ID_3ba8a116c.dcm (436, 436)\n",
+      "ID_8f5d4b696.dcm (430, 404)\n",
+      "ID_0c4987103.dcm (462, 462)\n",
+      "ID_0603b315e.dcm (408, 374)\n",
+      "ID_9ece1bb21.dcm (464, 464)\n",
+      "ID_c1ff9eb46.dcm (768, 768)\n",
+      "ID_c6bbec638.dcm (464, 464)\n",
+      "ID_845f922f4.dcm (436, 436)\n",
+      "ID_7940bb7d0.dcm (638, 490)\n",
+      "ID_c4575f13b.dcm (430, 404)\n",
+      "ID_f188940f9.dcm (638, 490)\n",
+      "ID_9b68c3f5f.dcm (436, 436)\n",
+      "ID_75d691728.dcm (638, 490)\n",
+      "ID_8fde47d9f.dcm (408, 374)\n",
+      "ID_155249efa.dcm (436, 436)\n",
+      "ID_6fbc30b5d.dcm (408, 374)\n",
+      "ID_d3b76ef6e.dcm (638, 490)\n",
+      "ID_6508563e0.dcm (464, 464)\n",
+      "ID_ac47ba810.dcm (638, 490)\n",
+      "ID_7c08b7fb7.dcm (430, 404)\n",
+      "ID_f4891876d.dcm (430, 404)\n",
+      "ID_b055aafa9.dcm (408, 374)\n",
+      "ID_22069463a.dcm (768, 768)\n",
+      "ID_081f4d071.dcm (408, 374)\n",
+      "ID_dd083e12a.dcm (638, 490)\n",
+      "ID_b1cea5abb.dcm (666, 512)\n",
+      "ID_e4b636907.dcm (666, 512)\n",
+      "ID_23d0b13b7.dcm (464, 464)\n",
+      "ID_ff9674e53.dcm (462, 462)\n",
+      "ID_36ab2e72a.dcm (436, 436)\n",
+      "ID_5ffae2e26.dcm (768, 768)\n",
+      "ID_6cc19ac41.dcm (408, 374)\n",
+      "ID_0b0e59911.dcm (430, 404)\n",
+      "ID_64b44f180.dcm (408, 374)\n",
+      "ID_ae7020fd1.dcm (638, 490)\n",
+      "ID_61d2718d2.dcm (462, 462)\n",
+      "ID_942e2f95b.dcm (638, 490)\n",
+      "ID_cade293be.dcm (462, 462)\n",
+      "ID_ab474037b.dcm (464, 464)\n",
+      "ID_d3fd5220e.dcm (408, 374)\n",
+      "ID_55f7bbbf2.dcm (666, 512)\n",
+      "ID_4e14d0fe8.dcm (666, 512)\n",
+      "ID_68e45bca7.dcm (768, 768)\n",
+      "ID_6cb797177.dcm (430, 404)\n",
+      "ID_7e870621c.dcm (462, 462)\n",
+      "ID_5ab140176.dcm (768, 768)\n",
+      "ID_8c5fc9e44.dcm (666, 512)\n",
+      "ID_3eb407dd8.dcm (464, 464)\n",
+      "ID_3d7a23dbb.dcm (436, 436)\n",
+      "ID_c6463f07d.dcm (462, 462)\n",
+      "ID_18aac96c0.dcm (430, 404)\n",
+      "ID_f698edc00.dcm (464, 464)\n",
+      "ID_b966185b8.dcm (462, 462)\n",
+      "ID_ca4a832a1.dcm (464, 464)\n",
+      "ID_60a1f0e24.dcm (462, 462)\n",
+      "ID_10f34fb10.dcm (638, 490)\n",
+      "ID_def2a0e9f.dcm (638, 490)\n",
+      "ID_f22730d7b.dcm (638, 490)\n",
+      "ID_12a0d6d34.dcm (430, 404)\n",
+      "ID_f4c2157d8.dcm (638, 490)\n",
+      "ID_0e1861e6d.dcm (638, 490)\n",
+      "ID_44d57858e.dcm (666, 512)\n",
+      "ID_be3fb6c17.dcm (436, 436)\n",
+      "ID_ae7b11865.dcm (430, 404)\n",
+      "ID_b9938c32c.dcm (436, 436)\n",
+      "ID_d77fa1286.dcm (462, 462)\n",
+      "ID_8a35660d5.dcm (430, 404)\n",
+      "ID_842e85173.dcm (462, 462)\n",
+      "ID_a880e377e.dcm (462, 462)\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "The length of the pixel data in the dataset (153710 bytes) doesn't match the expected length (524288 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-5-d37fcb90a2a6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m             \u001b[0mwriter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwriterow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdicom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'corrupted'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m         \u001b[0mimg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpixel_array\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mimg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m512\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m512\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m             \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdicom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mimg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mpixel_array\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1360\u001b[0m             \u001b[0mThe\u001b[0m \u001b[0mPixel\u001b[0m \u001b[0mData\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m7\u001b[0m\u001b[0mFE0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m00\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0ma\u001b[0m \u001b[0mNumPy\u001b[0m \u001b[0mndarray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1361\u001b[0m         \"\"\"\n\u001b[0;32m-> 1362\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconvert_pixel_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1363\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pixel_array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mconvert_pixel_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1306\u001b[0m         )\n\u001b[1;32m   1307\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1308\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mlast_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1309\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1310\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mdecompress\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/dataset.py\u001b[0m in \u001b[0;36mconvert_pixel_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1274\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1275\u001b[0m                 \u001b[0;31m# Use the handler to get a 1D numpy array of the pixel data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1276\u001b[0;31m                 \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_pixeldata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1277\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pixel_array\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreshape_pixel_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda3/envs/py3x/lib/python3.7/site-packages/pydicom/pixel_data_handlers/numpy_handler.py\u001b[0m in \u001b[0;36mget_pixeldata\u001b[0;34m(ds, read_only)\u001b[0m\n\u001b[1;32m    255\u001b[0m                 \u001b[0;34m\"The dataset may be corrupted or there may be an issue \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    256\u001b[0m                 \u001b[0;34m\"with the pixel data handler.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m                 \u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactual_length\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadded_expected_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    258\u001b[0m             )\n\u001b[1;32m    259\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mactual_length\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mpadded_expected_len\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mValueError\u001b[0m: The length of the pixel data in the dataset (153710 bytes) doesn't match the expected length (524288 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler."
+     ]
+    }
+   ],
+   "source": [
+    "# Scan every DICOM in the data directory and log the ones to avoid in bad_actors.csv:\n",
+    "#   - 'corrupted': the file cannot be read or its pixel data cannot be decoded\n",
+    "#   - a shape tuple: the image decodes but is not 512x512\n",
+    "# newline='' is what the csv module expects for files handed to csv.writer.\n",
+    "with open('bad_actors.csv', 'w', newline='') as csv_file:\n",
+    "    writer = csv.writer(csv_file, delimiter=',')\n",
+    "    writer.writerow(['id', 'cause'])\n",
+    "    for dicom in tqdm(os.listdir(kyle_data_path)):\n",
+    "        try:\n",
+    "            data = pydicom.dcmread(os.path.join(kyle_data_path, dicom))\n",
+    "            # Pixel data is decoded on pixel_array access, so it has to sit inside the\n",
+    "            # try: truncated pixel data raises ValueError here, not in dcmread().\n",
+    "            shape = data.pixel_array.shape\n",
+    "        except Exception:\n",
+    "            # Exception rather than a bare except, so interrupting the kernel still stops the scan.\n",
+    "            writer.writerow([dicom, 'corrupted'])\n",
+    "            continue  # nothing usable to inspect; never fall through with a stale dataset\n",
+    "        if shape != (512, 512):\n",
+    "            print(dicom, shape)\n",
+    "            writer.writerow([dicom, shape])\n",
+    "    \n",
+    "\n",
+    "\n",
+    "\n",
+    "# from multiprocessing.pool import ThreadPool as Pool\n",
+    "\n",
+    "# pool_size = 5  \n",
+    "# # define worker function before a Pool is instantiated\n",
+    "# def worker(dicom):\n",
+    "#     try:\n",
+    "#         data = pydicom.dcmread(kyle_data_path+dicom)\n",
+    "#         img = np.array(data.pixel_array, dtype=float)\n",
+    "#     except:\n",
+    "#         writer.writerow([dicom, 'corrupted'])\n",
+    "    \n",
+    "#     if img.shape != (512, 512):\n",
+    "#         print(dicom, img.shape)\n",
+    "#         writer.writerow([dicom,img.shape])\n",
+    "\n",
+    "\n",
+    "# pool = Pool(pool_size)\n",
+    "\n",
+    "# with open('bad_actors.csv', 'w') as csv_file:\n",
+    "#     writer = csv.writer(csv_file, delimiter=',')\n",
+    "#     writer.writerow(['id', 'cause'])\n",
+    "    \n",
+    "#     for dicom in tqdm(os.listdir(kyle_data_path)):\n",
+    "#         data = pool.apply_async(worker, (dicom,))\n",
+    "\n",
+    "#     pool.close()\n",
+    "#     pool.join()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "674258\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of distinct entries in bad_actors.\n",
+    "# NOTE(review): the saved output (674258) does not follow from the visible cells, where\n",
+    "# bad_actors stays empty -- presumably left over from an earlier run; confirm and re-run.\n",
+    "print(len(set(bad_actors)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper Functions\n",
+    "\n",
+    "def translate_dicom(filename, path=kyle_data_path, apply_window=True):\n",
+    "    \"\"\"\n",
+    "    Read a DICOM file and return its pixel data as a float array\n",
+    "    Arguments:\n",
+    "        filename {string} -- name of the DICOM file inside `path`\n",
+    "        path {string} -- directory holding the DICOM files (defaults to kyle_data_path)\n",
+    "        apply_window {bool} -- if True (default) the pixels are rescaled and clipped to the\n",
+    "            window stored in the file's metadata; if False the raw pixels are standardized\n",
+    "            (mean subtracted, divided by the standard deviation)\n",
+    "    Returns:\n",
+    "        {np.array} -- float array with the same shape as the DICOM pixel data\n",
+    "    \"\"\"\n",
+    "    # os.path.join works whether or not `path` ends with a separator.\n",
+    "    data = pydicom.dcmread(os.path.join(path, filename))\n",
+    "\n",
+    "    if apply_window:\n",
+    "        window_center, window_width, intercept, slope = get_windowing(data)\n",
+    "        img = window_image(data.pixel_array, window_center, window_width, intercept, slope)\n",
+    "        return np.array(img, dtype=float)\n",
+    "\n",
+    "    img = np.array(data.pixel_array, dtype=float)\n",
+    "    centered = img - img.mean()\n",
+    "    spread = img.std()\n",
+    "    if spread == 0:\n",
+    "        # A constant image has no variance: return zeros instead of dividing by zero (NaN).\n",
+    "        return centered\n",
+    "    return centered / spread\n",
+    "\n",
+    "def window_image(img, window_center, window_width, intercept, slope):\n",
+    "    \"\"\"\n",
+    "    Rescale raw CT pixel values with slope/intercept and clip them to a window\n",
+    "    Arguments:\n",
+    "        img {np.array} -- raw pixel array of a dicom (e.g. pydicom's pixel_array); not modified\n",
+    "        window_center,window_width,intercept,slope {numbers} -- values provided by dicom file metadata\n",
+    "    Returns:\n",
+    "        {np.array} -- float array clipped to\n",
+    "            [window_center - window_width // 2, window_center + window_width // 2]\n",
+    "    \"\"\"\n",
+    "    # Rescale in floating point: pixel data may be stored as unsigned integers, and adding\n",
+    "    # a negative intercept in that dtype can overflow (newer NumPy raises OverflowError).\n",
+    "    rescaled = np.asarray(img, dtype=float) * slope + intercept\n",
+    "    half_width = window_width // 2\n",
+    "    return np.clip(rescaled, window_center - half_width, window_center + half_width)\n",
+    "\n",
+    "def get_first_of_dicom_field_as_int(x):\n",
+    "    # A multi-valued dicom field (pydicom.multival.MultiValue) contributes its first\n",
+    "    # element; any other value is converted to int directly.\n",
+    "    is_multi = type(x) == pydicom.multival.MultiValue\n",
+    "    return int(x[0] if is_multi else x)\n",
+    "\n",
+    "def get_windowing(data):\n",
+    "    # Read the four windowing-related fields up front (a missing attribute fails right\n",
+    "    # here), then lazily yield each one as an int, in this order:\n",
+    "    # window center, window width, rescale intercept, rescale slope.\n",
+    "    center = data.WindowCenter\n",
+    "    width = data.WindowWidth\n",
+    "    intercept = data.RescaleIntercept\n",
+    "    slope = data.RescaleSlope\n",
+    "    return (get_first_of_dicom_field_as_int(field) for field in (center, width, intercept, slope))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}