Switch to unified view

a b/0-preprocess-generate_csvs.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import pandas as pd\n",
10
    "from pathlib import Path\n",
11
    "from tqdm import tqdm_notebook\n",
12
    "import pydicom\n",
13
    "import itertools\n",
14
    "import numpy as np\n",
15
    "from concurrent.futures import ProcessPoolExecutor"
16
   ]
17
  },
18
  {
19
   "cell_type": "code",
20
   "execution_count": 2,
21
   "metadata": {},
22
   "outputs": [],
23
   "source": [
24
    "%%capture\n",
25
    "from tqdm import tqdm_notebook as tqdm\n",
26
    "tqdm().pandas()"
27
   ]
28
  },
29
  {
30
   "cell_type": "markdown",
31
   "metadata": {},
32
   "source": [
33
    "# Read stage_X_train and split id/label"
34
   ]
35
  },
36
  {
37
   "cell_type": "code",
38
   "execution_count": 3,
39
   "metadata": {},
40
   "outputs": [],
41
   "source": [
42
    "stage = \"stage_2\""
43
   ]
44
  },
45
  {
46
   "cell_type": "code",
47
   "execution_count": 4,
48
   "metadata": {},
49
   "outputs": [],
50
   "source": [
51
    "bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'"
52
   ]
53
  },
54
  {
55
   "cell_type": "code",
56
   "execution_count": 5,
57
   "metadata": {},
58
   "outputs": [
59
    {
60
     "name": "stdout",
61
     "output_type": "stream",
62
     "text": [
63
      "rm: cannot remove 'data/unzip/stage_2_train_images/ID_6431af929.dcm': No such file or directory\r\n"
64
     ]
65
    }
66
   ],
67
   "source": [
68
    "!rm -f {bad_dcm_fn}  # -f: stay silent when the corrupted file was already removed, so the cell can be re-run"
69
   ]
70
  },
71
  {
72
   "cell_type": "code",
73
   "execution_count": 6,
74
   "metadata": {},
75
   "outputs": [],
76
   "source": [
77
    "df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')"
78
   ]
79
  },
80
  {
81
   "cell_type": "code",
82
   "execution_count": 7,
83
   "metadata": {},
84
   "outputs": [
85
    {
86
     "data": {
87
      "text/html": [
88
       "<div>\n",
89
       "<style scoped>\n",
90
       "    .dataframe tbody tr th:only-of-type {\n",
91
       "        vertical-align: middle;\n",
92
       "    }\n",
93
       "\n",
94
       "    .dataframe tbody tr th {\n",
95
       "        vertical-align: top;\n",
96
       "    }\n",
97
       "\n",
98
       "    .dataframe thead th {\n",
99
       "        text-align: right;\n",
100
       "    }\n",
101
       "</style>\n",
102
       "<table border=\"1\" class=\"dataframe\">\n",
103
       "  <thead>\n",
104
       "    <tr style=\"text-align: right;\">\n",
105
       "      <th></th>\n",
106
       "      <th>ID</th>\n",
107
       "      <th>Label</th>\n",
108
       "    </tr>\n",
109
       "  </thead>\n",
110
       "  <tbody>\n",
111
       "    <tr>\n",
112
       "      <th>0</th>\n",
113
       "      <td>ID_12cadc6af_epidural</td>\n",
114
       "      <td>0</td>\n",
115
       "    </tr>\n",
116
       "    <tr>\n",
117
       "      <th>1</th>\n",
118
       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
119
       "      <td>0</td>\n",
120
       "    </tr>\n",
121
       "    <tr>\n",
122
       "      <th>2</th>\n",
123
       "      <td>ID_12cadc6af_intraventricular</td>\n",
124
       "      <td>0</td>\n",
125
       "    </tr>\n",
126
       "    <tr>\n",
127
       "      <th>3</th>\n",
128
       "      <td>ID_12cadc6af_subarachnoid</td>\n",
129
       "      <td>0</td>\n",
130
       "    </tr>\n",
131
       "    <tr>\n",
132
       "      <th>4</th>\n",
133
       "      <td>ID_12cadc6af_subdural</td>\n",
134
       "      <td>0</td>\n",
135
       "    </tr>\n",
136
       "  </tbody>\n",
137
       "</table>\n",
138
       "</div>"
139
      ],
140
      "text/plain": [
141
       "                              ID  Label\n",
142
       "0          ID_12cadc6af_epidural      0\n",
143
       "1  ID_12cadc6af_intraparenchymal      0\n",
144
       "2  ID_12cadc6af_intraventricular      0\n",
145
       "3      ID_12cadc6af_subarachnoid      0\n",
146
       "4          ID_12cadc6af_subdural      0"
147
      ]
148
     },
149
     "execution_count": 7,
150
     "metadata": {},
151
     "output_type": "execute_result"
152
    }
153
   ],
154
   "source": [
155
    "df_train.head()"
156
   ]
157
  },
158
  {
159
   "cell_type": "code",
160
   "execution_count": 8,
161
   "metadata": {},
162
   "outputs": [],
163
   "source": [
164
    "df_train['fid'] = df_train.ID.apply(lambda x: '_'.join(x.split('_')[:2]) )"
165
   ]
166
  },
167
  {
168
   "cell_type": "code",
169
   "execution_count": 9,
170
   "metadata": {},
171
   "outputs": [],
172
   "source": [
173
    "df_train = df_train.rename(columns={'Label': 'probability'})  # rename by name: independent of column order and safe to re-run"
174
   ]
175
  },
176
  {
177
   "cell_type": "code",
178
   "execution_count": 10,
179
   "metadata": {},
180
   "outputs": [],
181
   "source": [
182
    "df_train['label'] = df_train.ID.apply(lambda x: x.split('_')[-1])"
183
   ]
184
  },
185
  {
186
   "cell_type": "code",
187
   "execution_count": 11,
188
   "metadata": {},
189
   "outputs": [
190
    {
191
     "data": {
192
      "text/html": [
193
       "<div>\n",
194
       "<style scoped>\n",
195
       "    .dataframe tbody tr th:only-of-type {\n",
196
       "        vertical-align: middle;\n",
197
       "    }\n",
198
       "\n",
199
       "    .dataframe tbody tr th {\n",
200
       "        vertical-align: top;\n",
201
       "    }\n",
202
       "\n",
203
       "    .dataframe thead th {\n",
204
       "        text-align: right;\n",
205
       "    }\n",
206
       "</style>\n",
207
       "<table border=\"1\" class=\"dataframe\">\n",
208
       "  <thead>\n",
209
       "    <tr style=\"text-align: right;\">\n",
210
       "      <th></th>\n",
211
       "      <th>ID</th>\n",
212
       "      <th>probability</th>\n",
213
       "      <th>fid</th>\n",
214
       "      <th>label</th>\n",
215
       "    </tr>\n",
216
       "  </thead>\n",
217
       "  <tbody>\n",
218
       "    <tr>\n",
219
       "      <th>0</th>\n",
220
       "      <td>ID_12cadc6af_epidural</td>\n",
221
       "      <td>0</td>\n",
222
       "      <td>ID_12cadc6af</td>\n",
223
       "      <td>epidural</td>\n",
224
       "    </tr>\n",
225
       "    <tr>\n",
226
       "      <th>1</th>\n",
227
       "      <td>ID_12cadc6af_intraparenchymal</td>\n",
228
       "      <td>0</td>\n",
229
       "      <td>ID_12cadc6af</td>\n",
230
       "      <td>intraparenchymal</td>\n",
231
       "    </tr>\n",
232
       "    <tr>\n",
233
       "      <th>2</th>\n",
234
       "      <td>ID_12cadc6af_intraventricular</td>\n",
235
       "      <td>0</td>\n",
236
       "      <td>ID_12cadc6af</td>\n",
237
       "      <td>intraventricular</td>\n",
238
       "    </tr>\n",
239
       "    <tr>\n",
240
       "      <th>3</th>\n",
241
       "      <td>ID_12cadc6af_subarachnoid</td>\n",
242
       "      <td>0</td>\n",
243
       "      <td>ID_12cadc6af</td>\n",
244
       "      <td>subarachnoid</td>\n",
245
       "    </tr>\n",
246
       "    <tr>\n",
247
       "      <th>4</th>\n",
248
       "      <td>ID_12cadc6af_subdural</td>\n",
249
       "      <td>0</td>\n",
250
       "      <td>ID_12cadc6af</td>\n",
251
       "      <td>subdural</td>\n",
252
       "    </tr>\n",
253
       "  </tbody>\n",
254
       "</table>\n",
255
       "</div>"
256
      ],
257
      "text/plain": [
258
       "                              ID  probability           fid             label\n",
259
       "0          ID_12cadc6af_epidural            0  ID_12cadc6af          epidural\n",
260
       "1  ID_12cadc6af_intraparenchymal            0  ID_12cadc6af  intraparenchymal\n",
261
       "2  ID_12cadc6af_intraventricular            0  ID_12cadc6af  intraventricular\n",
262
       "3      ID_12cadc6af_subarachnoid            0  ID_12cadc6af      subarachnoid\n",
263
       "4          ID_12cadc6af_subdural            0  ID_12cadc6af          subdural"
264
      ]
265
     },
266
     "execution_count": 11,
267
     "metadata": {},
268
     "output_type": "execute_result"
269
    }
270
   ],
271
   "source": [
272
    "df_train.head()"
273
   ]
274
  },
275
  {
276
   "cell_type": "markdown",
277
   "metadata": {},
278
   "source": [
279
    "# Remove duplicate rows"
280
   ]
281
  },
282
  {
283
   "cell_type": "code",
284
   "execution_count": 12,
285
   "metadata": {},
286
   "outputs": [
287
    {
288
     "data": {
289
      "text/plain": [
290
       "(4516842, 4)"
291
      ]
292
     },
293
     "execution_count": 12,
294
     "metadata": {},
295
     "output_type": "execute_result"
296
    }
297
   ],
298
   "source": [
299
    "df_train.shape"
300
   ]
301
  },
302
  {
303
   "cell_type": "code",
304
   "execution_count": 13,
305
   "metadata": {},
306
   "outputs": [],
307
   "source": [
308
    "df_train.drop_duplicates('ID', inplace=True)"
309
   ]
310
  },
311
  {
312
   "cell_type": "code",
313
   "execution_count": 14,
314
   "metadata": {},
315
   "outputs": [
316
    {
317
     "data": {
318
      "text/plain": [
319
       "(4516818, 4)"
320
      ]
321
     },
322
     "execution_count": 14,
323
     "metadata": {},
324
     "output_type": "execute_result"
325
    }
326
   ],
327
   "source": [
328
    "df_train.shape"
329
   ]
330
  },
331
  {
332
   "cell_type": "markdown",
333
   "metadata": {},
334
   "source": [
335
    "# Remove corrupted image"
336
   ]
337
  },
338
  {
339
   "cell_type": "code",
340
   "execution_count": 15,
341
   "metadata": {},
342
   "outputs": [],
343
   "source": [
344
    "df_train = df_train[df_train.fid != Path(bad_dcm_fn).stem]  # drop the labels of the corrupted DICOM (ID_6431af929)"
345
   ]
346
  },
347
  {
348
   "cell_type": "code",
349
   "execution_count": 16,
350
   "metadata": {},
351
   "outputs": [
352
    {
353
     "data": {
354
      "text/plain": [
355
       "(4516812, 4)"
356
      ]
357
     },
358
     "execution_count": 16,
359
     "metadata": {},
360
     "output_type": "execute_result"
361
    }
362
   ],
363
   "source": [
364
    "df_train.shape"
365
   ]
366
  },
367
  {
368
   "cell_type": "markdown",
369
   "metadata": {},
370
   "source": [
371
    "# Create pivot table with diagnostic labels as columns\n",
372
    "Generates:\n",
373
    "* `train_diags.csv` (previously named `train_pivot.csv`)"
374
   ]
375
  },
376
  {
377
   "cell_type": "code",
378
   "execution_count": 17,
379
   "metadata": {},
380
   "outputs": [],
381
   "source": [
382
    "df_diags = df_train.pivot(index='fid', columns='label', values='probability')"
383
   ]
384
  },
385
  {
386
   "cell_type": "code",
387
   "execution_count": 18,
388
   "metadata": {},
389
   "outputs": [
390
    {
391
     "data": {
392
      "text/html": [
393
       "<div>\n",
394
       "<style scoped>\n",
395
       "    .dataframe tbody tr th:only-of-type {\n",
396
       "        vertical-align: middle;\n",
397
       "    }\n",
398
       "\n",
399
       "    .dataframe tbody tr th {\n",
400
       "        vertical-align: top;\n",
401
       "    }\n",
402
       "\n",
403
       "    .dataframe thead th {\n",
404
       "        text-align: right;\n",
405
       "    }\n",
406
       "</style>\n",
407
       "<table border=\"1\" class=\"dataframe\">\n",
408
       "  <thead>\n",
409
       "    <tr style=\"text-align: right;\">\n",
410
       "      <th>label</th>\n",
411
       "      <th>any</th>\n",
412
       "      <th>epidural</th>\n",
413
       "      <th>intraparenchymal</th>\n",
414
       "      <th>intraventricular</th>\n",
415
       "      <th>subarachnoid</th>\n",
416
       "      <th>subdural</th>\n",
417
       "    </tr>\n",
418
       "    <tr>\n",
419
       "      <th>fid</th>\n",
420
       "      <th></th>\n",
421
       "      <th></th>\n",
422
       "      <th></th>\n",
423
       "      <th></th>\n",
424
       "      <th></th>\n",
425
       "      <th></th>\n",
426
       "    </tr>\n",
427
       "  </thead>\n",
428
       "  <tbody>\n",
429
       "    <tr>\n",
430
       "      <th>ID_000012eaf</th>\n",
431
       "      <td>0</td>\n",
432
       "      <td>0</td>\n",
433
       "      <td>0</td>\n",
434
       "      <td>0</td>\n",
435
       "      <td>0</td>\n",
436
       "      <td>0</td>\n",
437
       "    </tr>\n",
438
       "    <tr>\n",
439
       "      <th>ID_000039fa0</th>\n",
440
       "      <td>0</td>\n",
441
       "      <td>0</td>\n",
442
       "      <td>0</td>\n",
443
       "      <td>0</td>\n",
444
       "      <td>0</td>\n",
445
       "      <td>0</td>\n",
446
       "    </tr>\n",
447
       "    <tr>\n",
448
       "      <th>ID_00005679d</th>\n",
449
       "      <td>0</td>\n",
450
       "      <td>0</td>\n",
451
       "      <td>0</td>\n",
452
       "      <td>0</td>\n",
453
       "      <td>0</td>\n",
454
       "      <td>0</td>\n",
455
       "    </tr>\n",
456
       "    <tr>\n",
457
       "      <th>ID_00008ce3c</th>\n",
458
       "      <td>0</td>\n",
459
       "      <td>0</td>\n",
460
       "      <td>0</td>\n",
461
       "      <td>0</td>\n",
462
       "      <td>0</td>\n",
463
       "      <td>0</td>\n",
464
       "    </tr>\n",
465
       "    <tr>\n",
466
       "      <th>ID_0000950d7</th>\n",
467
       "      <td>0</td>\n",
468
       "      <td>0</td>\n",
469
       "      <td>0</td>\n",
470
       "      <td>0</td>\n",
471
       "      <td>0</td>\n",
472
       "      <td>0</td>\n",
473
       "    </tr>\n",
474
       "  </tbody>\n",
475
       "</table>\n",
476
       "</div>"
477
      ],
478
      "text/plain": [
479
       "label         any  epidural  intraparenchymal  intraventricular  subarachnoid  \\\n",
480
       "fid                                                                             \n",
481
       "ID_000012eaf    0         0                 0                 0             0   \n",
482
       "ID_000039fa0    0         0                 0                 0             0   \n",
483
       "ID_00005679d    0         0                 0                 0             0   \n",
484
       "ID_00008ce3c    0         0                 0                 0             0   \n",
485
       "ID_0000950d7    0         0                 0                 0             0   \n",
486
       "\n",
487
       "label         subdural  \n",
488
       "fid                     \n",
489
       "ID_000012eaf         0  \n",
490
       "ID_000039fa0         0  \n",
491
       "ID_00005679d         0  \n",
492
       "ID_00008ce3c         0  \n",
493
       "ID_0000950d7         0  "
494
      ]
495
     },
496
     "execution_count": 18,
497
     "metadata": {},
498
     "output_type": "execute_result"
499
    }
500
   ],
501
   "source": [
502
    "df_diags.head()"
503
   ]
504
  },
505
  {
506
   "cell_type": "code",
507
   "execution_count": 19,
508
   "metadata": {},
509
   "outputs": [
510
    {
511
     "data": {
512
      "text/plain": [
513
       "(752802, 6)"
514
      ]
515
     },
516
     "execution_count": 19,
517
     "metadata": {},
518
     "output_type": "execute_result"
519
    }
520
   ],
521
   "source": [
522
    "df_diags.shape"
523
   ]
524
  },
525
  {
526
   "cell_type": "code",
527
   "execution_count": 20,
528
   "metadata": {},
529
   "outputs": [],
530
   "source": [
531
    "df_diags.reset_index(inplace=True)"
532
   ]
533
  },
534
  {
535
   "cell_type": "code",
536
   "execution_count": 21,
537
   "metadata": {},
538
   "outputs": [
539
    {
540
     "data": {
541
      "text/html": [
542
       "<div>\n",
543
       "<style scoped>\n",
544
       "    .dataframe tbody tr th:only-of-type {\n",
545
       "        vertical-align: middle;\n",
546
       "    }\n",
547
       "\n",
548
       "    .dataframe tbody tr th {\n",
549
       "        vertical-align: top;\n",
550
       "    }\n",
551
       "\n",
552
       "    .dataframe thead th {\n",
553
       "        text-align: right;\n",
554
       "    }\n",
555
       "</style>\n",
556
       "<table border=\"1\" class=\"dataframe\">\n",
557
       "  <thead>\n",
558
       "    <tr style=\"text-align: right;\">\n",
559
       "      <th>label</th>\n",
560
       "      <th>fid</th>\n",
561
       "      <th>any</th>\n",
562
       "      <th>epidural</th>\n",
563
       "      <th>intraparenchymal</th>\n",
564
       "      <th>intraventricular</th>\n",
565
       "      <th>subarachnoid</th>\n",
566
       "      <th>subdural</th>\n",
567
       "    </tr>\n",
568
       "  </thead>\n",
569
       "  <tbody>\n",
570
       "    <tr>\n",
571
       "      <th>0</th>\n",
572
       "      <td>ID_000012eaf</td>\n",
573
       "      <td>0</td>\n",
574
       "      <td>0</td>\n",
575
       "      <td>0</td>\n",
576
       "      <td>0</td>\n",
577
       "      <td>0</td>\n",
578
       "      <td>0</td>\n",
579
       "    </tr>\n",
580
       "    <tr>\n",
581
       "      <th>1</th>\n",
582
       "      <td>ID_000039fa0</td>\n",
583
       "      <td>0</td>\n",
584
       "      <td>0</td>\n",
585
       "      <td>0</td>\n",
586
       "      <td>0</td>\n",
587
       "      <td>0</td>\n",
588
       "      <td>0</td>\n",
589
       "    </tr>\n",
590
       "    <tr>\n",
591
       "      <th>2</th>\n",
592
       "      <td>ID_00005679d</td>\n",
593
       "      <td>0</td>\n",
594
       "      <td>0</td>\n",
595
       "      <td>0</td>\n",
596
       "      <td>0</td>\n",
597
       "      <td>0</td>\n",
598
       "      <td>0</td>\n",
599
       "    </tr>\n",
600
       "    <tr>\n",
601
       "      <th>3</th>\n",
602
       "      <td>ID_00008ce3c</td>\n",
603
       "      <td>0</td>\n",
604
       "      <td>0</td>\n",
605
       "      <td>0</td>\n",
606
       "      <td>0</td>\n",
607
       "      <td>0</td>\n",
608
       "      <td>0</td>\n",
609
       "    </tr>\n",
610
       "    <tr>\n",
611
       "      <th>4</th>\n",
612
       "      <td>ID_0000950d7</td>\n",
613
       "      <td>0</td>\n",
614
       "      <td>0</td>\n",
615
       "      <td>0</td>\n",
616
       "      <td>0</td>\n",
617
       "      <td>0</td>\n",
618
       "      <td>0</td>\n",
619
       "    </tr>\n",
620
       "  </tbody>\n",
621
       "</table>\n",
622
       "</div>"
623
      ],
624
      "text/plain": [
625
       "label           fid  any  epidural  intraparenchymal  intraventricular  \\\n",
626
       "0      ID_000012eaf    0         0                 0                 0   \n",
627
       "1      ID_000039fa0    0         0                 0                 0   \n",
628
       "2      ID_00005679d    0         0                 0                 0   \n",
629
       "3      ID_00008ce3c    0         0                 0                 0   \n",
630
       "4      ID_0000950d7    0         0                 0                 0   \n",
631
       "\n",
632
       "label  subarachnoid  subdural  \n",
633
       "0                 0         0  \n",
634
       "1                 0         0  \n",
635
       "2                 0         0  \n",
636
       "3                 0         0  \n",
637
       "4                 0         0  "
638
      ]
639
     },
640
     "execution_count": 21,
641
     "metadata": {},
642
     "output_type": "execute_result"
643
    }
644
   ],
645
   "source": [
646
    "df_diags.head()"
647
   ]
648
  },
649
  {
650
   "cell_type": "code",
651
   "execution_count": 22,
652
   "metadata": {},
653
   "outputs": [
654
    {
655
     "data": {
656
      "text/plain": [
657
       "(752802, 7)"
658
      ]
659
     },
660
     "execution_count": 22,
661
     "metadata": {},
662
     "output_type": "execute_result"
663
    }
664
   ],
665
   "source": [
666
    "df_diags.shape"
667
   ]
668
  },
669
  {
670
   "cell_type": "code",
671
   "execution_count": 23,
672
   "metadata": {},
673
   "outputs": [],
674
   "source": [
675
    "df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)"
676
   ]
677
  },
678
  {
679
   "cell_type": "markdown",
680
   "metadata": {},
681
   "source": [
682
    "# Generate fastai-ready CSV mapping image file (.png) -> labels\n",
683
    "This is needed for early experiments that worked with the .png dataset.\n",
684
    "\n",
685
    "Generates:\n",
686
    "* `train_labels_as_strings.csv`"
687
   ]
688
  },
689
  {
690
   "cell_type": "code",
691
   "execution_count": 24,
692
   "metadata": {},
693
   "outputs": [],
694
   "source": [
695
    "from collections import defaultdict\n",
696
    "\n",
697
    "d = defaultdict(list)\n",
698
    "for fid in df_train.fid.unique(): d[fid]\n",
699
    "\n",
700
    "for tup in df_train.itertuples():\n",
701
    "    if tup.probability: d[tup.fid].append(tup.label)"
702
   ]
703
  },
704
  {
705
   "cell_type": "code",
706
   "execution_count": 25,
707
   "metadata": {},
708
   "outputs": [],
709
   "source": [
710
    "ks, vs = [], []\n",
711
    "\n",
712
    "for k, v in d.items():\n",
713
    "    ks.append(k), vs.append(' '.join(v))"
714
   ]
715
  },
716
  {
717
   "cell_type": "code",
718
   "execution_count": 26,
719
   "metadata": {},
720
   "outputs": [],
721
   "source": [
722
    "fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})"
723
   ]
724
  },
725
  {
726
   "cell_type": "code",
727
   "execution_count": 27,
728
   "metadata": {},
729
   "outputs": [
730
    {
731
     "data": {
732
      "text/plain": [
733
       "(752802, 2)"
734
      ]
735
     },
736
     "execution_count": 27,
737
     "metadata": {},
738
     "output_type": "execute_result"
739
    }
740
   ],
741
   "source": [
742
    "fastai_df.shape"
743
   ]
744
  },
745
  {
746
   "cell_type": "code",
747
   "execution_count": 28,
748
   "metadata": {},
749
   "outputs": [],
750
   "source": [
751
    "fastai_df['fn'] += '.png'"
752
   ]
753
  },
754
  {
755
   "cell_type": "code",
756
   "execution_count": 29,
757
   "metadata": {},
758
   "outputs": [
759
    {
760
     "data": {
761
      "text/html": [
762
       "<div>\n",
763
       "<style scoped>\n",
764
       "    .dataframe tbody tr th:only-of-type {\n",
765
       "        vertical-align: middle;\n",
766
       "    }\n",
767
       "\n",
768
       "    .dataframe tbody tr th {\n",
769
       "        vertical-align: top;\n",
770
       "    }\n",
771
       "\n",
772
       "    .dataframe thead th {\n",
773
       "        text-align: right;\n",
774
       "    }\n",
775
       "</style>\n",
776
       "<table border=\"1\" class=\"dataframe\">\n",
777
       "  <thead>\n",
778
       "    <tr style=\"text-align: right;\">\n",
779
       "      <th></th>\n",
780
       "      <th>fn</th>\n",
781
       "      <th>labels</th>\n",
782
       "    </tr>\n",
783
       "  </thead>\n",
784
       "  <tbody>\n",
785
       "    <tr>\n",
786
       "      <th>0</th>\n",
787
       "      <td>ID_12cadc6af.png</td>\n",
788
       "      <td></td>\n",
789
       "    </tr>\n",
790
       "    <tr>\n",
791
       "      <th>1</th>\n",
792
       "      <td>ID_38fd7baa0.png</td>\n",
793
       "      <td></td>\n",
794
       "    </tr>\n",
795
       "    <tr>\n",
796
       "      <th>2</th>\n",
797
       "      <td>ID_6c5d82413.png</td>\n",
798
       "      <td></td>\n",
799
       "    </tr>\n",
800
       "    <tr>\n",
801
       "      <th>3</th>\n",
802
       "      <td>ID_aec8e68b3.png</td>\n",
803
       "      <td>subarachnoid any</td>\n",
804
       "    </tr>\n",
805
       "    <tr>\n",
806
       "      <th>4</th>\n",
807
       "      <td>ID_4d9209c7c.png</td>\n",
808
       "      <td></td>\n",
809
       "    </tr>\n",
810
       "  </tbody>\n",
811
       "</table>\n",
812
       "</div>"
813
      ],
814
      "text/plain": [
815
       "                 fn            labels\n",
816
       "0  ID_12cadc6af.png                  \n",
817
       "1  ID_38fd7baa0.png                  \n",
818
       "2  ID_6c5d82413.png                  \n",
819
       "3  ID_aec8e68b3.png  subarachnoid any\n",
820
       "4  ID_4d9209c7c.png                  "
821
      ]
822
     },
823
     "execution_count": 29,
824
     "metadata": {},
825
     "output_type": "execute_result"
826
    }
827
   ],
828
   "source": [
829
    "fastai_df.head()"
830
   ]
831
  },
832
  {
833
   "cell_type": "code",
834
   "execution_count": 30,
835
   "metadata": {},
836
   "outputs": [],
837
   "source": [
838
    "fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)"
839
   ]
840
  },
841
  {
842
   "cell_type": "markdown",
843
   "metadata": {},
844
   "source": [
845
    "# Tabulate dicom data\n",
846
    "Generates:\n",
847
    "* `train_dicom.csv`\n",
848
    "* `test_dicom.csv`"
849
   ]
850
  },
851
  {
852
   "cell_type": "code",
853
   "execution_count": 31,
854
   "metadata": {},
855
   "outputs": [],
856
   "source": [
857
    "cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', \n",
858
    "               'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', \n",
859
    "               'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', \n",
860
    "               'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', \n",
861
    "               'StudyInstanceUID', 'WindowCenter', 'WindowWidth']\n",
862
    "useless_cols = [ 'PixelData' ]"
863
   ]
864
  },
865
  {
866
   "cell_type": "code",
867
   "execution_count": 32,
868
   "metadata": {},
869
   "outputs": [],
870
   "source": [
871
    "train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))"
872
   ]
873
  },
874
  {
875
   "cell_type": "code",
876
   "execution_count": 33,
877
   "metadata": {},
878
   "outputs": [],
879
   "source": [
880
    "f = train_dcm_list[0]"
881
   ]
882
  },
883
  {
884
   "cell_type": "code",
885
   "execution_count": 34,
886
   "metadata": {},
887
   "outputs": [
888
    {
889
     "data": {
890
      "text/plain": [
891
       "(0008, 0018) SOP Instance UID                    UI: ID_000012eaf\n",
892
       "(0008, 0060) Modality                            CS: 'CT'\n",
893
       "(0010, 0020) Patient ID                          LO: 'ID_f15c0eee'\n",
894
       "(0020, 000d) Study Instance UID                  UI: ID_30ea2b02d4\n",
895
       "(0020, 000e) Series Instance UID                 UI: ID_0ab5820b2a\n",
896
       "(0020, 0010) Study ID                            SH: ''\n",
897
       "(0020, 0032) Image Position (Patient)            DS: ['-125.000000', '-115.897980', '77.970825']\n",
898
       "(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']\n",
899
       "(0028, 0002) Samples per Pixel                   US: 1\n",
900
       "(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'\n",
901
       "(0028, 0010) Rows                                US: 512\n",
902
       "(0028, 0011) Columns                             US: 512\n",
903
       "(0028, 0030) Pixel Spacing                       DS: ['0.488281', '0.488281']\n",
904
       "(0028, 0100) Bits Allocated                      US: 16\n",
905
       "(0028, 0101) Bits Stored                         US: 16\n",
906
       "(0028, 0102) High Bit                            US: 15\n",
907
       "(0028, 0103) Pixel Representation                US: 1\n",
908
       "(0028, 1050) Window Center                       DS: \"30\"\n",
909
       "(0028, 1051) Window Width                        DS: \"80\"\n",
910
       "(0028, 1052) Rescale Intercept                   DS: \"-1024\"\n",
911
       "(0028, 1053) Rescale Slope                       DS: \"1\"\n",
912
       "(7fe0, 0010) Pixel Data                          OW: Array of 524288 elements"
913
      ]
914
     },
915
     "execution_count": 34,
916
     "metadata": {},
917
     "output_type": "execute_result"
918
    }
919
   ],
920
   "source": [
921
    "dicom = pydicom.dcmread(str(f))\n",
922
    "dicom"
923
   ]
924
  },
925
  {
926
   "cell_type": "code",
927
   "execution_count": 35,
928
   "metadata": {},
929
   "outputs": [
930
    {
931
     "data": {
932
      "text/plain": [
933
       "'ID_f15c0eee'"
934
      ]
935
     },
936
     "execution_count": 35,
937
     "metadata": {},
938
     "output_type": "execute_result"
939
    }
940
   ],
941
   "source": [
942
    "dicom.data_element('PatientID').value"
943
   ]
944
  },
945
  {
946
   "cell_type": "code",
947
   "execution_count": 36,
948
   "metadata": {},
949
   "outputs": [
950
    {
951
     "data": {
952
      "text/plain": [
953
       "['-125.000000', '-115.897980', '77.970825']"
954
      ]
955
     },
956
     "execution_count": 36,
957
     "metadata": {},
958
     "output_type": "execute_result"
959
    }
960
   ],
961
   "source": [
962
    "ipp = dicom.data_element('ImagePositionPatient').value\n",
963
    "ipp"
964
   ]
965
  },
966
  {
967
   "cell_type": "code",
968
   "execution_count": 37,
969
   "metadata": {},
970
   "outputs": [
971
    {
972
     "data": {
973
      "text/plain": [
974
       "\"-125.000000\""
975
      ]
976
     },
977
     "execution_count": 37,
978
     "metadata": {},
979
     "output_type": "execute_result"
980
    }
981
   ],
982
   "source": [
983
    "ipp[0]"
984
   ]
985
  },
986
  {
987
   "cell_type": "code",
988
   "execution_count": 38,
989
   "metadata": {},
990
   "outputs": [
991
    {
992
     "data": {
993
      "text/plain": [
994
       "pydicom.multival.MultiValue"
995
      ]
996
     },
997
     "execution_count": 38,
998
     "metadata": {},
999
     "output_type": "execute_result"
1000
    }
1001
   ],
1002
   "source": [
1003
    "type(ipp)"
1004
   ]
1005
  },
1006
  {
1007
   "cell_type": "code",
1008
   "execution_count": 39,
1009
   "metadata": {},
1010
   "outputs": [],
1011
   "source": [
1012
    "def tabulate_dicom_data(file_list):\n",
1013
    "    l = []\n",
1014
    "    for f in file_list:\n",
1015
    "        dicom = pydicom.dcmread(str(f))\n",
1016
    "        d = {}\n",
1017
    "        for s in cols_i_want:\n",
1018
    "            v = dicom.data_element(s).value\n",
1019
    "            if isinstance(v, pydicom.multival.MultiValue):\n",
1020
    "                for i in range(len(v)):\n",
1021
    "                    d[s + '_' + str(i)] = v[i]\n",
1022
    "            else:\n",
1023
    "                d[s] = v\n",
1024
    "        l.append(d)\n",
1025
    "        \n",
1026
    "    return l\n"
1027
   ]
1028
  },
1029
  {
1030
   "cell_type": "code",
1031
   "execution_count": 40,
1032
   "metadata": {},
1033
   "outputs": [],
1034
   "source": [
1035
    "with ProcessPoolExecutor(max_workers=32) as e:\n",
1036
    "     l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(train_dcm_list, 32))))"
1037
   ]
1038
  },
1039
  {
1040
   "cell_type": "code",
1041
   "execution_count": 41,
1042
   "metadata": {},
1043
   "outputs": [],
1044
   "source": [
1045
    "df_train_dicom = pd.DataFrame(l)"
1046
   ]
1047
  },
1048
  {
1049
   "cell_type": "code",
1050
   "execution_count": 42,
1051
   "metadata": {},
1052
   "outputs": [
1053
    {
1054
     "data": {
1055
      "text/html": [
1056
       "<div>\n",
1057
       "<style scoped>\n",
1058
       "    .dataframe tbody tr th:only-of-type {\n",
1059
       "        vertical-align: middle;\n",
1060
       "    }\n",
1061
       "\n",
1062
       "    .dataframe tbody tr th {\n",
1063
       "        vertical-align: top;\n",
1064
       "    }\n",
1065
       "\n",
1066
       "    .dataframe thead th {\n",
1067
       "        text-align: right;\n",
1068
       "    }\n",
1069
       "</style>\n",
1070
       "<table border=\"1\" class=\"dataframe\">\n",
1071
       "  <thead>\n",
1072
       "    <tr style=\"text-align: right;\">\n",
1073
       "      <th></th>\n",
1074
       "      <th>BitsAllocated</th>\n",
1075
       "      <th>BitsStored</th>\n",
1076
       "      <th>Columns</th>\n",
1077
       "      <th>HighBit</th>\n",
1078
       "      <th>ImageOrientationPatient_0</th>\n",
1079
       "      <th>ImageOrientationPatient_1</th>\n",
1080
       "      <th>ImageOrientationPatient_2</th>\n",
1081
       "      <th>ImageOrientationPatient_3</th>\n",
1082
       "      <th>ImageOrientationPatient_4</th>\n",
1083
       "      <th>ImageOrientationPatient_5</th>\n",
1084
       "      <th>...</th>\n",
1085
       "      <th>SamplesPerPixel</th>\n",
1086
       "      <th>SeriesInstanceUID</th>\n",
1087
       "      <th>StudyID</th>\n",
1088
       "      <th>StudyInstanceUID</th>\n",
1089
       "      <th>WindowCenter</th>\n",
1090
       "      <th>WindowCenter_0</th>\n",
1091
       "      <th>WindowCenter_1</th>\n",
1092
       "      <th>WindowWidth</th>\n",
1093
       "      <th>WindowWidth_0</th>\n",
1094
       "      <th>WindowWidth_1</th>\n",
1095
       "    </tr>\n",
1096
       "  </thead>\n",
1097
       "  <tbody>\n",
1098
       "    <tr>\n",
1099
       "      <th>0</th>\n",
1100
       "      <td>16</td>\n",
1101
       "      <td>16</td>\n",
1102
       "      <td>512</td>\n",
1103
       "      <td>15</td>\n",
1104
       "      <td>1.0</td>\n",
1105
       "      <td>0.0</td>\n",
1106
       "      <td>0.0</td>\n",
1107
       "      <td>0.0</td>\n",
1108
       "      <td>0.927184</td>\n",
1109
       "      <td>-0.374607</td>\n",
1110
       "      <td>...</td>\n",
1111
       "      <td>1</td>\n",
1112
       "      <td>ID_0ab5820b2a</td>\n",
1113
       "      <td></td>\n",
1114
       "      <td>ID_30ea2b02d4</td>\n",
1115
       "      <td>30.0</td>\n",
1116
       "      <td>NaN</td>\n",
1117
       "      <td>NaN</td>\n",
1118
       "      <td>80.0</td>\n",
1119
       "      <td>NaN</td>\n",
1120
       "      <td>NaN</td>\n",
1121
       "    </tr>\n",
1122
       "    <tr>\n",
1123
       "      <th>1</th>\n",
1124
       "      <td>16</td>\n",
1125
       "      <td>16</td>\n",
1126
       "      <td>512</td>\n",
1127
       "      <td>15</td>\n",
1128
       "      <td>1.0</td>\n",
1129
       "      <td>0.0</td>\n",
1130
       "      <td>0.0</td>\n",
1131
       "      <td>0.0</td>\n",
1132
       "      <td>0.968148</td>\n",
1133
       "      <td>-0.250380</td>\n",
1134
       "      <td>...</td>\n",
1135
       "      <td>1</td>\n",
1136
       "      <td>ID_5f8484c3e0</td>\n",
1137
       "      <td></td>\n",
1138
       "      <td>ID_134d398b61</td>\n",
1139
       "      <td>30.0</td>\n",
1140
       "      <td>NaN</td>\n",
1141
       "      <td>NaN</td>\n",
1142
       "      <td>80.0</td>\n",
1143
       "      <td>NaN</td>\n",
1144
       "      <td>NaN</td>\n",
1145
       "    </tr>\n",
1146
       "    <tr>\n",
1147
       "      <th>2</th>\n",
1148
       "      <td>16</td>\n",
1149
       "      <td>16</td>\n",
1150
       "      <td>512</td>\n",
1151
       "      <td>15</td>\n",
1152
       "      <td>1.0</td>\n",
1153
       "      <td>0.0</td>\n",
1154
       "      <td>0.0</td>\n",
1155
       "      <td>0.0</td>\n",
1156
       "      <td>1.000000</td>\n",
1157
       "      <td>0.000000</td>\n",
1158
       "      <td>...</td>\n",
1159
       "      <td>1</td>\n",
1160
       "      <td>ID_203cd6ec46</td>\n",
1161
       "      <td></td>\n",
1162
       "      <td>ID_b5c26cda09</td>\n",
1163
       "      <td>50.0</td>\n",
1164
       "      <td>NaN</td>\n",
1165
       "      <td>NaN</td>\n",
1166
       "      <td>100.0</td>\n",
1167
       "      <td>NaN</td>\n",
1168
       "      <td>NaN</td>\n",
1169
       "    </tr>\n",
1170
       "    <tr>\n",
1171
       "      <th>3</th>\n",
1172
       "      <td>16</td>\n",
1173
       "      <td>12</td>\n",
1174
       "      <td>512</td>\n",
1175
       "      <td>11</td>\n",
1176
       "      <td>1.0</td>\n",
1177
       "      <td>0.0</td>\n",
1178
       "      <td>0.0</td>\n",
1179
       "      <td>0.0</td>\n",
1180
       "      <td>0.994522</td>\n",
1181
       "      <td>0.104528</td>\n",
1182
       "      <td>...</td>\n",
1183
       "      <td>1</td>\n",
1184
       "      <td>ID_3780d48b28</td>\n",
1185
       "      <td></td>\n",
1186
       "      <td>ID_974735bf79</td>\n",
1187
       "      <td>NaN</td>\n",
1188
       "      <td>40.0</td>\n",
1189
       "      <td>40.0</td>\n",
1190
       "      <td>NaN</td>\n",
1191
       "      <td>80.0</td>\n",
1192
       "      <td>80.0</td>\n",
1193
       "    </tr>\n",
1194
       "    <tr>\n",
1195
       "      <th>4</th>\n",
1196
       "      <td>16</td>\n",
1197
       "      <td>16</td>\n",
1198
       "      <td>512</td>\n",
1199
       "      <td>15</td>\n",
1200
       "      <td>1.0</td>\n",
1201
       "      <td>0.0</td>\n",
1202
       "      <td>0.0</td>\n",
1203
       "      <td>0.0</td>\n",
1204
       "      <td>1.000000</td>\n",
1205
       "      <td>0.000000</td>\n",
1206
       "      <td>...</td>\n",
1207
       "      <td>1</td>\n",
1208
       "      <td>ID_84296c3845</td>\n",
1209
       "      <td></td>\n",
1210
       "      <td>ID_8881b1c4b1</td>\n",
1211
       "      <td>35.0</td>\n",
1212
       "      <td>NaN</td>\n",
1213
       "      <td>NaN</td>\n",
1214
       "      <td>135.0</td>\n",
1215
       "      <td>NaN</td>\n",
1216
       "      <td>NaN</td>\n",
1217
       "    </tr>\n",
1218
       "  </tbody>\n",
1219
       "</table>\n",
1220
       "<p>5 rows × 33 columns</p>\n",
1221
       "</div>"
1222
      ],
1223
      "text/plain": [
1224
       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
1225
       "0             16          16      512       15                        1.0   \n",
1226
       "1             16          16      512       15                        1.0   \n",
1227
       "2             16          16      512       15                        1.0   \n",
1228
       "3             16          12      512       11                        1.0   \n",
1229
       "4             16          16      512       15                        1.0   \n",
1230
       "\n",
1231
       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
1232
       "0                        0.0                        0.0   \n",
1233
       "1                        0.0                        0.0   \n",
1234
       "2                        0.0                        0.0   \n",
1235
       "3                        0.0                        0.0   \n",
1236
       "4                        0.0                        0.0   \n",
1237
       "\n",
1238
       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
1239
       "0                        0.0                   0.927184   \n",
1240
       "1                        0.0                   0.968148   \n",
1241
       "2                        0.0                   1.000000   \n",
1242
       "3                        0.0                   0.994522   \n",
1243
       "4                        0.0                   1.000000   \n",
1244
       "\n",
1245
       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
1246
       "0                  -0.374607  ...                1      ID_0ab5820b2a   \n",
1247
       "1                  -0.250380  ...                1      ID_5f8484c3e0   \n",
1248
       "2                   0.000000  ...                1      ID_203cd6ec46   \n",
1249
       "3                   0.104528  ...                1      ID_3780d48b28   \n",
1250
       "4                   0.000000  ...                1      ID_84296c3845   \n",
1251
       "\n",
1252
       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
1253
       "0             ID_30ea2b02d4         30.0            NaN             NaN   \n",
1254
       "1             ID_134d398b61         30.0            NaN             NaN   \n",
1255
       "2             ID_b5c26cda09         50.0            NaN             NaN   \n",
1256
       "3             ID_974735bf79          NaN           40.0            40.0   \n",
1257
       "4             ID_8881b1c4b1         35.0            NaN             NaN   \n",
1258
       "\n",
1259
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
1260
       "0         80.0            NaN            NaN  \n",
1261
       "1         80.0            NaN            NaN  \n",
1262
       "2        100.0            NaN            NaN  \n",
1263
       "3          NaN           80.0           80.0  \n",
1264
       "4        135.0            NaN            NaN  \n",
1265
       "\n",
1266
       "[5 rows x 33 columns]"
1267
      ]
1268
     },
1269
     "execution_count": 42,
1270
     "metadata": {},
1271
     "output_type": "execute_result"
1272
    }
1273
   ],
1274
   "source": [
1275
    "df_train_dicom.head()"
1276
   ]
1277
  },
1278
  {
1279
   "cell_type": "code",
1280
   "execution_count": 43,
1281
   "metadata": {},
1282
   "outputs": [],
1283
   "source": [
1284
    "df_train_dicom.to_csv(f'data/{stage}_train_dicom.csv')"
1285
   ]
1286
  },
1287
  {
1288
   "cell_type": "code",
1289
   "execution_count": 44,
1290
   "metadata": {},
1291
   "outputs": [],
1292
   "source": [
1293
    "# Collect every .dcm file of the test image directory for the current stage.\n",
    "test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))\n",
1294
    "# Same fan-out as for the train set: 32 chunks, 32 worker processes, results flattened into `l`.\n",
    "with ProcessPoolExecutor(max_workers=32) as e:\n",
1295
    "    l = [\n",
    "        row\n",
    "        for chunk_rows in e.map(tabulate_dicom_data, np.array_split(test_dcm_list, 32))\n",
    "        for row in chunk_rows\n",
    "    ]"
1296
   ]
1297
  },
1298
  {
1299
   "cell_type": "code",
1300
   "execution_count": 45,
1301
   "metadata": {},
1302
   "outputs": [
1303
    {
1304
     "data": {
1305
      "text/html": [
1306
       "<div>\n",
1307
       "<style scoped>\n",
1308
       "    .dataframe tbody tr th:only-of-type {\n",
1309
       "        vertical-align: middle;\n",
1310
       "    }\n",
1311
       "\n",
1312
       "    .dataframe tbody tr th {\n",
1313
       "        vertical-align: top;\n",
1314
       "    }\n",
1315
       "\n",
1316
       "    .dataframe thead th {\n",
1317
       "        text-align: right;\n",
1318
       "    }\n",
1319
       "</style>\n",
1320
       "<table border=\"1\" class=\"dataframe\">\n",
1321
       "  <thead>\n",
1322
       "    <tr style=\"text-align: right;\">\n",
1323
       "      <th></th>\n",
1324
       "      <th>BitsAllocated</th>\n",
1325
       "      <th>BitsStored</th>\n",
1326
       "      <th>Columns</th>\n",
1327
       "      <th>HighBit</th>\n",
1328
       "      <th>ImageOrientationPatient_0</th>\n",
1329
       "      <th>ImageOrientationPatient_1</th>\n",
1330
       "      <th>ImageOrientationPatient_2</th>\n",
1331
       "      <th>ImageOrientationPatient_3</th>\n",
1332
       "      <th>ImageOrientationPatient_4</th>\n",
1333
       "      <th>ImageOrientationPatient_5</th>\n",
1334
       "      <th>...</th>\n",
1335
       "      <th>SamplesPerPixel</th>\n",
1336
       "      <th>SeriesInstanceUID</th>\n",
1337
       "      <th>StudyID</th>\n",
1338
       "      <th>StudyInstanceUID</th>\n",
1339
       "      <th>WindowCenter</th>\n",
1340
       "      <th>WindowCenter_0</th>\n",
1341
       "      <th>WindowCenter_1</th>\n",
1342
       "      <th>WindowWidth</th>\n",
1343
       "      <th>WindowWidth_0</th>\n",
1344
       "      <th>WindowWidth_1</th>\n",
1345
       "    </tr>\n",
1346
       "  </thead>\n",
1347
       "  <tbody>\n",
1348
       "    <tr>\n",
1349
       "      <th>0</th>\n",
1350
       "      <td>16</td>\n",
1351
       "      <td>12</td>\n",
1352
       "      <td>512</td>\n",
1353
       "      <td>11</td>\n",
1354
       "      <td>1.0</td>\n",
1355
       "      <td>0.0</td>\n",
1356
       "      <td>0.0</td>\n",
1357
       "      <td>0.0</td>\n",
1358
       "      <td>0.981627</td>\n",
1359
       "      <td>-0.190809</td>\n",
1360
       "      <td>...</td>\n",
1361
       "      <td>1</td>\n",
1362
       "      <td>ID_4d28912ba6</td>\n",
1363
       "      <td></td>\n",
1364
       "      <td>ID_1f6d1e8aeb</td>\n",
1365
       "      <td>NaN</td>\n",
1366
       "      <td>40.0</td>\n",
1367
       "      <td>40.0</td>\n",
1368
       "      <td>NaN</td>\n",
1369
       "      <td>80.0</td>\n",
1370
       "      <td>80.0</td>\n",
1371
       "    </tr>\n",
1372
       "    <tr>\n",
1373
       "      <th>1</th>\n",
1374
       "      <td>16</td>\n",
1375
       "      <td>16</td>\n",
1376
       "      <td>512</td>\n",
1377
       "      <td>15</td>\n",
1378
       "      <td>1.0</td>\n",
1379
       "      <td>0.0</td>\n",
1380
       "      <td>0.0</td>\n",
1381
       "      <td>0.0</td>\n",
1382
       "      <td>0.987688</td>\n",
1383
       "      <td>-0.156434</td>\n",
1384
       "      <td>...</td>\n",
1385
       "      <td>1</td>\n",
1386
       "      <td>ID_acabdeee86</td>\n",
1387
       "      <td></td>\n",
1388
       "      <td>ID_4a8d7ec19f</td>\n",
1389
       "      <td>30.0</td>\n",
1390
       "      <td>NaN</td>\n",
1391
       "      <td>NaN</td>\n",
1392
       "      <td>80.0</td>\n",
1393
       "      <td>NaN</td>\n",
1394
       "      <td>NaN</td>\n",
1395
       "    </tr>\n",
1396
       "    <tr>\n",
1397
       "      <th>2</th>\n",
1398
       "      <td>16</td>\n",
1399
       "      <td>16</td>\n",
1400
       "      <td>512</td>\n",
1401
       "      <td>15</td>\n",
1402
       "      <td>1.0</td>\n",
1403
       "      <td>0.0</td>\n",
1404
       "      <td>0.0</td>\n",
1405
       "      <td>0.0</td>\n",
1406
       "      <td>0.927184</td>\n",
1407
       "      <td>-0.374607</td>\n",
1408
       "      <td>...</td>\n",
1409
       "      <td>1</td>\n",
1410
       "      <td>ID_d00cee7f0c</td>\n",
1411
       "      <td></td>\n",
1412
       "      <td>ID_a6ca244172</td>\n",
1413
       "      <td>30.0</td>\n",
1414
       "      <td>NaN</td>\n",
1415
       "      <td>NaN</td>\n",
1416
       "      <td>80.0</td>\n",
1417
       "      <td>NaN</td>\n",
1418
       "      <td>NaN</td>\n",
1419
       "    </tr>\n",
1420
       "    <tr>\n",
1421
       "      <th>3</th>\n",
1422
       "      <td>16</td>\n",
1423
       "      <td>16</td>\n",
1424
       "      <td>512</td>\n",
1425
       "      <td>15</td>\n",
1426
       "      <td>1.0</td>\n",
1427
       "      <td>0.0</td>\n",
1428
       "      <td>0.0</td>\n",
1429
       "      <td>0.0</td>\n",
1430
       "      <td>0.986286</td>\n",
1431
       "      <td>-0.165048</td>\n",
1432
       "      <td>...</td>\n",
1433
       "      <td>1</td>\n",
1434
       "      <td>ID_a52a0112d5</td>\n",
1435
       "      <td></td>\n",
1436
       "      <td>ID_fa950a03af</td>\n",
1437
       "      <td>30.0</td>\n",
1438
       "      <td>NaN</td>\n",
1439
       "      <td>NaN</td>\n",
1440
       "      <td>80.0</td>\n",
1441
       "      <td>NaN</td>\n",
1442
       "      <td>NaN</td>\n",
1443
       "    </tr>\n",
1444
       "    <tr>\n",
1445
       "      <th>4</th>\n",
1446
       "      <td>16</td>\n",
1447
       "      <td>12</td>\n",
1448
       "      <td>512</td>\n",
1449
       "      <td>11</td>\n",
1450
       "      <td>1.0</td>\n",
1451
       "      <td>0.0</td>\n",
1452
       "      <td>0.0</td>\n",
1453
       "      <td>0.0</td>\n",
1454
       "      <td>1.000000</td>\n",
1455
       "      <td>0.000000</td>\n",
1456
       "      <td>...</td>\n",
1457
       "      <td>1</td>\n",
1458
       "      <td>ID_f552d3b922</td>\n",
1459
       "      <td></td>\n",
1460
       "      <td>ID_965d8b3d8e</td>\n",
1461
       "      <td>NaN</td>\n",
1462
       "      <td>36.0</td>\n",
1463
       "      <td>36.0</td>\n",
1464
       "      <td>NaN</td>\n",
1465
       "      <td>80.0</td>\n",
1466
       "      <td>80.0</td>\n",
1467
       "    </tr>\n",
1468
       "  </tbody>\n",
1469
       "</table>\n",
1470
       "<p>5 rows × 33 columns</p>\n",
1471
       "</div>"
1472
      ],
1473
      "text/plain": [
1474
       "   BitsAllocated  BitsStored  Columns  HighBit  ImageOrientationPatient_0  \\\n",
1475
       "0             16          12      512       11                        1.0   \n",
1476
       "1             16          16      512       15                        1.0   \n",
1477
       "2             16          16      512       15                        1.0   \n",
1478
       "3             16          16      512       15                        1.0   \n",
1479
       "4             16          12      512       11                        1.0   \n",
1480
       "\n",
1481
       "   ImageOrientationPatient_1  ImageOrientationPatient_2  \\\n",
1482
       "0                        0.0                        0.0   \n",
1483
       "1                        0.0                        0.0   \n",
1484
       "2                        0.0                        0.0   \n",
1485
       "3                        0.0                        0.0   \n",
1486
       "4                        0.0                        0.0   \n",
1487
       "\n",
1488
       "   ImageOrientationPatient_3  ImageOrientationPatient_4  \\\n",
1489
       "0                        0.0                   0.981627   \n",
1490
       "1                        0.0                   0.987688   \n",
1491
       "2                        0.0                   0.927184   \n",
1492
       "3                        0.0                   0.986286   \n",
1493
       "4                        0.0                   1.000000   \n",
1494
       "\n",
1495
       "   ImageOrientationPatient_5  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
1496
       "0                  -0.190809  ...                1      ID_4d28912ba6   \n",
1497
       "1                  -0.156434  ...                1      ID_acabdeee86   \n",
1498
       "2                  -0.374607  ...                1      ID_d00cee7f0c   \n",
1499
       "3                  -0.165048  ...                1      ID_a52a0112d5   \n",
1500
       "4                   0.000000  ...                1      ID_f552d3b922   \n",
1501
       "\n",
1502
       "   StudyID StudyInstanceUID WindowCenter WindowCenter_0  WindowCenter_1  \\\n",
1503
       "0             ID_1f6d1e8aeb          NaN           40.0            40.0   \n",
1504
       "1             ID_4a8d7ec19f         30.0            NaN             NaN   \n",
1505
       "2             ID_a6ca244172         30.0            NaN             NaN   \n",
1506
       "3             ID_fa950a03af         30.0            NaN             NaN   \n",
1507
       "4             ID_965d8b3d8e          NaN           36.0            36.0   \n",
1508
       "\n",
1509
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
1510
       "0          NaN           80.0           80.0  \n",
1511
       "1         80.0            NaN            NaN  \n",
1512
       "2         80.0            NaN            NaN  \n",
1513
       "3         80.0            NaN            NaN  \n",
1514
       "4          NaN           80.0           80.0  \n",
1515
       "\n",
1516
       "[5 rows x 33 columns]"
1517
      ]
1518
     },
1519
     "execution_count": 45,
1520
     "metadata": {},
1521
     "output_type": "execute_result"
1522
    }
1523
   ],
1524
   "source": [
1525
    "df_test_dicom = pd.DataFrame(l)\n",
1526
    "df_test_dicom.head()"
1527
   ]
1528
  },
1529
  {
1530
   "cell_type": "code",
1531
   "execution_count": 46,
1532
   "metadata": {},
1533
   "outputs": [],
1534
   "source": [
1535
    "df_test_dicom.to_csv(f'data/{stage}_test_dicom.csv')"
1536
   ]
1537
  },
1538
  {
1539
   "cell_type": "markdown",
1540
   "metadata": {},
1541
   "source": [
1542
    "# Add labels to the train DICOM CSV\n",
1543
    "Generates:\n",
    "\n",
1544
    "* `data/{stage}_train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)"
1545
   ]
1546
  },
1547
  {
1548
   "cell_type": "code",
1549
   "execution_count": 47,
1550
   "metadata": {},
1551
   "outputs": [],
1552
   "source": [
1553
    "# Left-join df_diags onto the train DICOM table, matching SOPInstanceUID against fid.\n",
    "df_train_dicom_diags = df_train_dicom.merge(\n",
    "    df_diags,\n",
    "    how='left',\n",
    "    left_on=['SOPInstanceUID'],\n",
    "    right_on=['fid'],\n",
    ")"
1554
   ]
1555
  },
1556
  {
1557
   "cell_type": "code",
1558
   "execution_count": 48,
1559
   "metadata": {},
1560
   "outputs": [],
1561
   "source": [
1562
    "assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)"
1563
   ]
1564
  },
1565
  {
1566
   "cell_type": "code",
1567
   "execution_count": 49,
1568
   "metadata": {},
1569
   "outputs": [],
1570
   "source": [
1571
    "df_train_dicom_diags.to_csv(f'data/{stage}_train_dicom_diags.csv')"
1572
   ]
1573
  }
1574
 ],
1575
 "metadata": {
1576
  "kernelspec": {
1577
   "display_name": "Python 3",
1578
   "language": "python",
1579
   "name": "python3"
1580
  },
1581
  "language_info": {
1582
   "codemirror_mode": {
1583
    "name": "ipython",
1584
    "version": 3
1585
   },
1586
   "file_extension": ".py",
1587
   "mimetype": "text/x-python",
1588
   "name": "python",
1589
   "nbconvert_exporter": "python",
1590
   "pygments_lexer": "ipython3",
1591
   "version": "3.7.3"
1592
  }
1593
 },
1594
 "nbformat": 4,
1595
 "nbformat_minor": 2
1596
}