Diff of /2-preprocess-pickle.ipynb [000000] .. [fc9ccf]

Switch to unified view

a b/2-preprocess-pickle.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import pandas as pd\n",
10
    "from pathlib import Path\n",
11
    "from collections import defaultdict\n",
12
    "import pickle"
13
   ]
14
  },
15
  {
16
   "cell_type": "code",
17
   "execution_count": 2,
18
   "metadata": {},
19
   "outputs": [],
20
   "source": [
21
    "stage = \"stage_2\""
22
   ]
23
  },
24
  {
25
   "cell_type": "markdown",
26
   "metadata": {},
27
   "source": [
28
    "# Train dataset"
29
   ]
30
  },
31
  {
32
   "cell_type": "code",
33
   "execution_count": 3,
34
   "metadata": {},
35
   "outputs": [],
36
   "source": [
37
    "df = pd.read_csv(f\"data/{stage}_train_dicom_diags.csv\")"
38
   ]
39
  },
40
  {
41
   "cell_type": "code",
42
   "execution_count": 4,
43
   "metadata": {},
44
   "outputs": [
45
    {
46
     "data": {
47
      "text/html": [
48
       "<div>\n",
49
       "<style scoped>\n",
50
       "    .dataframe tbody tr th:only-of-type {\n",
51
       "        vertical-align: middle;\n",
52
       "    }\n",
53
       "\n",
54
       "    .dataframe tbody tr th {\n",
55
       "        vertical-align: top;\n",
56
       "    }\n",
57
       "\n",
58
       "    .dataframe thead th {\n",
59
       "        text-align: right;\n",
60
       "    }\n",
61
       "</style>\n",
62
       "<table border=\"1\" class=\"dataframe\">\n",
63
       "  <thead>\n",
64
       "    <tr style=\"text-align: right;\">\n",
65
       "      <th></th>\n",
66
       "      <th>Unnamed: 0</th>\n",
67
       "      <th>BitsAllocated</th>\n",
68
       "      <th>BitsStored</th>\n",
69
       "      <th>Columns</th>\n",
70
       "      <th>HighBit</th>\n",
71
       "      <th>ImageOrientationPatient_0</th>\n",
72
       "      <th>ImageOrientationPatient_1</th>\n",
73
       "      <th>ImageOrientationPatient_2</th>\n",
74
       "      <th>ImageOrientationPatient_3</th>\n",
75
       "      <th>ImageOrientationPatient_4</th>\n",
76
       "      <th>...</th>\n",
77
       "      <th>WindowWidth</th>\n",
78
       "      <th>WindowWidth_0</th>\n",
79
       "      <th>WindowWidth_1</th>\n",
80
       "      <th>fid</th>\n",
81
       "      <th>any</th>\n",
82
       "      <th>epidural</th>\n",
83
       "      <th>intraparenchymal</th>\n",
84
       "      <th>intraventricular</th>\n",
85
       "      <th>subarachnoid</th>\n",
86
       "      <th>subdural</th>\n",
87
       "    </tr>\n",
88
       "  </thead>\n",
89
       "  <tbody>\n",
90
       "    <tr>\n",
91
       "      <th>0</th>\n",
92
       "      <td>0</td>\n",
93
       "      <td>16</td>\n",
94
       "      <td>16</td>\n",
95
       "      <td>512</td>\n",
96
       "      <td>15</td>\n",
97
       "      <td>1.0</td>\n",
98
       "      <td>0.0</td>\n",
99
       "      <td>0.0</td>\n",
100
       "      <td>0.0</td>\n",
101
       "      <td>0.927184</td>\n",
102
       "      <td>...</td>\n",
103
       "      <td>80.0</td>\n",
104
       "      <td>NaN</td>\n",
105
       "      <td>NaN</td>\n",
106
       "      <td>ID_000012eaf</td>\n",
107
       "      <td>0</td>\n",
108
       "      <td>0</td>\n",
109
       "      <td>0</td>\n",
110
       "      <td>0</td>\n",
111
       "      <td>0</td>\n",
112
       "      <td>0</td>\n",
113
       "    </tr>\n",
114
       "    <tr>\n",
115
       "      <th>1</th>\n",
116
       "      <td>1</td>\n",
117
       "      <td>16</td>\n",
118
       "      <td>16</td>\n",
119
       "      <td>512</td>\n",
120
       "      <td>15</td>\n",
121
       "      <td>1.0</td>\n",
122
       "      <td>0.0</td>\n",
123
       "      <td>0.0</td>\n",
124
       "      <td>0.0</td>\n",
125
       "      <td>0.968148</td>\n",
126
       "      <td>...</td>\n",
127
       "      <td>80.0</td>\n",
128
       "      <td>NaN</td>\n",
129
       "      <td>NaN</td>\n",
130
       "      <td>ID_000039fa0</td>\n",
131
       "      <td>0</td>\n",
132
       "      <td>0</td>\n",
133
       "      <td>0</td>\n",
134
       "      <td>0</td>\n",
135
       "      <td>0</td>\n",
136
       "      <td>0</td>\n",
137
       "    </tr>\n",
138
       "    <tr>\n",
139
       "      <th>2</th>\n",
140
       "      <td>2</td>\n",
141
       "      <td>16</td>\n",
142
       "      <td>16</td>\n",
143
       "      <td>512</td>\n",
144
       "      <td>15</td>\n",
145
       "      <td>1.0</td>\n",
146
       "      <td>0.0</td>\n",
147
       "      <td>0.0</td>\n",
148
       "      <td>0.0</td>\n",
149
       "      <td>1.000000</td>\n",
150
       "      <td>...</td>\n",
151
       "      <td>100.0</td>\n",
152
       "      <td>NaN</td>\n",
153
       "      <td>NaN</td>\n",
154
       "      <td>ID_00005679d</td>\n",
155
       "      <td>0</td>\n",
156
       "      <td>0</td>\n",
157
       "      <td>0</td>\n",
158
       "      <td>0</td>\n",
159
       "      <td>0</td>\n",
160
       "      <td>0</td>\n",
161
       "    </tr>\n",
162
       "    <tr>\n",
163
       "      <th>3</th>\n",
164
       "      <td>3</td>\n",
165
       "      <td>16</td>\n",
166
       "      <td>12</td>\n",
167
       "      <td>512</td>\n",
168
       "      <td>11</td>\n",
169
       "      <td>1.0</td>\n",
170
       "      <td>0.0</td>\n",
171
       "      <td>0.0</td>\n",
172
       "      <td>0.0</td>\n",
173
       "      <td>0.994522</td>\n",
174
       "      <td>...</td>\n",
175
       "      <td>NaN</td>\n",
176
       "      <td>80.0</td>\n",
177
       "      <td>80.0</td>\n",
178
       "      <td>ID_00008ce3c</td>\n",
179
       "      <td>0</td>\n",
180
       "      <td>0</td>\n",
181
       "      <td>0</td>\n",
182
       "      <td>0</td>\n",
183
       "      <td>0</td>\n",
184
       "      <td>0</td>\n",
185
       "    </tr>\n",
186
       "    <tr>\n",
187
       "      <th>4</th>\n",
188
       "      <td>4</td>\n",
189
       "      <td>16</td>\n",
190
       "      <td>16</td>\n",
191
       "      <td>512</td>\n",
192
       "      <td>15</td>\n",
193
       "      <td>1.0</td>\n",
194
       "      <td>0.0</td>\n",
195
       "      <td>0.0</td>\n",
196
       "      <td>0.0</td>\n",
197
       "      <td>1.000000</td>\n",
198
       "      <td>...</td>\n",
199
       "      <td>135.0</td>\n",
200
       "      <td>NaN</td>\n",
201
       "      <td>NaN</td>\n",
202
       "      <td>ID_0000950d7</td>\n",
203
       "      <td>0</td>\n",
204
       "      <td>0</td>\n",
205
       "      <td>0</td>\n",
206
       "      <td>0</td>\n",
207
       "      <td>0</td>\n",
208
       "      <td>0</td>\n",
209
       "    </tr>\n",
210
       "  </tbody>\n",
211
       "</table>\n",
212
       "<p>5 rows × 41 columns</p>\n",
213
       "</div>"
214
      ],
215
      "text/plain": [
216
       "   Unnamed: 0  BitsAllocated  BitsStored  Columns  HighBit  \\\n",
217
       "0           0             16          16      512       15   \n",
218
       "1           1             16          16      512       15   \n",
219
       "2           2             16          16      512       15   \n",
220
       "3           3             16          12      512       11   \n",
221
       "4           4             16          16      512       15   \n",
222
       "\n",
223
       "   ImageOrientationPatient_0  ImageOrientationPatient_1  \\\n",
224
       "0                        1.0                        0.0   \n",
225
       "1                        1.0                        0.0   \n",
226
       "2                        1.0                        0.0   \n",
227
       "3                        1.0                        0.0   \n",
228
       "4                        1.0                        0.0   \n",
229
       "\n",
230
       "   ImageOrientationPatient_2  ImageOrientationPatient_3  \\\n",
231
       "0                        0.0                        0.0   \n",
232
       "1                        0.0                        0.0   \n",
233
       "2                        0.0                        0.0   \n",
234
       "3                        0.0                        0.0   \n",
235
       "4                        0.0                        0.0   \n",
236
       "\n",
237
       "   ImageOrientationPatient_4  ...  WindowWidth  WindowWidth_0  WindowWidth_1  \\\n",
238
       "0                   0.927184  ...         80.0            NaN            NaN   \n",
239
       "1                   0.968148  ...         80.0            NaN            NaN   \n",
240
       "2                   1.000000  ...        100.0            NaN            NaN   \n",
241
       "3                   0.994522  ...          NaN           80.0           80.0   \n",
242
       "4                   1.000000  ...        135.0            NaN            NaN   \n",
243
       "\n",
244
       "            fid any epidural intraparenchymal  intraventricular  subarachnoid  \\\n",
245
       "0  ID_000012eaf   0        0                0                 0             0   \n",
246
       "1  ID_000039fa0   0        0                0                 0             0   \n",
247
       "2  ID_00005679d   0        0                0                 0             0   \n",
248
       "3  ID_00008ce3c   0        0                0                 0             0   \n",
249
       "4  ID_0000950d7   0        0                0                 0             0   \n",
250
       "\n",
251
       "   subdural  \n",
252
       "0         0  \n",
253
       "1         0  \n",
254
       "2         0  \n",
255
       "3         0  \n",
256
       "4         0  \n",
257
       "\n",
258
       "[5 rows x 41 columns]"
259
      ]
260
     },
261
     "execution_count": 4,
262
     "metadata": {},
263
     "output_type": "execute_result"
264
    }
265
   ],
266
   "source": [
267
    "df.head()"
268
   ]
269
  },
270
  {
271
   "cell_type": "code",
272
   "execution_count": 5,
273
   "metadata": {},
274
   "outputs": [
275
    {
276
     "data": {
277
      "text/plain": [
278
       "21744"
279
      ]
280
     },
281
     "execution_count": 5,
282
     "metadata": {},
283
     "output_type": "execute_result"
284
    }
285
   ],
286
   "source": [
287
    "# sort, then group by (order is preserved within groups)\n",
288
    "gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n",
289
    "len(gs)"
290
   ]
291
  },
292
  {
293
   "cell_type": "code",
294
   "execution_count": 6,
295
   "metadata": {},
296
   "outputs": [
297
    {
298
     "data": {
299
      "text/html": [
300
       "<div>\n",
301
       "<style scoped>\n",
302
       "    .dataframe tbody tr th:only-of-type {\n",
303
       "        vertical-align: middle;\n",
304
       "    }\n",
305
       "\n",
306
       "    .dataframe tbody tr th {\n",
307
       "        vertical-align: top;\n",
308
       "    }\n",
309
       "\n",
310
       "    .dataframe thead th {\n",
311
       "        text-align: right;\n",
312
       "    }\n",
313
       "</style>\n",
314
       "<table border=\"1\" class=\"dataframe\">\n",
315
       "  <thead>\n",
316
       "    <tr style=\"text-align: right;\">\n",
317
       "      <th></th>\n",
318
       "      <th>ImagePositionPatient_2</th>\n",
319
       "      <th>fid</th>\n",
320
       "    </tr>\n",
321
       "  </thead>\n",
322
       "  <tbody>\n",
323
       "    <tr>\n",
324
       "      <th>577964</th>\n",
325
       "      <td>193.542489</td>\n",
326
       "      <td>ID_c45659d3d</td>\n",
327
       "    </tr>\n",
328
       "    <tr>\n",
329
       "      <th>229790</th>\n",
330
       "      <td>198.214051</td>\n",
331
       "      <td>ID_4e0bdd2ba</td>\n",
332
       "    </tr>\n",
333
       "    <tr>\n",
334
       "      <th>22395</th>\n",
335
       "      <td>202.885613</td>\n",
336
       "      <td>ID_079945c27</td>\n",
337
       "    </tr>\n",
338
       "    <tr>\n",
339
       "      <th>746126</th>\n",
340
       "      <td>207.557174</td>\n",
341
       "      <td>ID_fdbfb2c17</td>\n",
342
       "    </tr>\n",
343
       "    <tr>\n",
344
       "      <th>253266</th>\n",
345
       "      <td>212.228736</td>\n",
346
       "      <td>ID_55f7bbbf2</td>\n",
347
       "    </tr>\n",
348
       "  </tbody>\n",
349
       "</table>\n",
350
       "</div>"
351
      ],
352
      "text/plain": [
353
       "        ImagePositionPatient_2           fid\n",
354
       "577964              193.542489  ID_c45659d3d\n",
355
       "229790              198.214051  ID_4e0bdd2ba\n",
356
       "22395               202.885613  ID_079945c27\n",
357
       "746126              207.557174  ID_fdbfb2c17\n",
358
       "253266              212.228736  ID_55f7bbbf2"
359
      ]
360
     },
361
     "execution_count": 6,
362
     "metadata": {},
363
     "output_type": "execute_result"
364
    }
365
   ],
366
   "source": [
367
    "# see if it worked\n",
368
    "gs.get_group('ID_fa19cd5ea9')[['ImagePositionPatient_2', 'fid']].head()"
369
   ]
370
  },
371
  {
372
   "cell_type": "code",
373
   "execution_count": 7,
374
   "metadata": {},
375
   "outputs": [],
376
   "source": [
377
    "g = gs.get_group('ID_fa19cd5ea9')"
378
   ]
379
  },
380
  {
381
   "cell_type": "code",
382
   "execution_count": 8,
383
   "metadata": {},
384
   "outputs": [],
385
   "source": [
386
    "subg = g[['SeriesInstanceUID', 'fid', 'any', 'epidural', \n",
387
    "          'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]"
388
   ]
389
  },
390
  {
391
   "cell_type": "code",
392
   "execution_count": 9,
393
   "metadata": {},
394
   "outputs": [
395
    {
396
     "data": {
397
      "text/html": [
398
       "<div>\n",
399
       "<style scoped>\n",
400
       "    .dataframe tbody tr th:only-of-type {\n",
401
       "        vertical-align: middle;\n",
402
       "    }\n",
403
       "\n",
404
       "    .dataframe tbody tr th {\n",
405
       "        vertical-align: top;\n",
406
       "    }\n",
407
       "\n",
408
       "    .dataframe thead th {\n",
409
       "        text-align: right;\n",
410
       "    }\n",
411
       "</style>\n",
412
       "<table border=\"1\" class=\"dataframe\">\n",
413
       "  <thead>\n",
414
       "    <tr style=\"text-align: right;\">\n",
415
       "      <th></th>\n",
416
       "      <th>SeriesInstanceUID</th>\n",
417
       "      <th>fid</th>\n",
418
       "      <th>any</th>\n",
419
       "      <th>epidural</th>\n",
420
       "      <th>intraparenchymal</th>\n",
421
       "      <th>intraventricular</th>\n",
422
       "      <th>subarachnoid</th>\n",
423
       "      <th>subdural</th>\n",
424
       "    </tr>\n",
425
       "  </thead>\n",
426
       "  <tbody>\n",
427
       "    <tr>\n",
428
       "      <th>577964</th>\n",
429
       "      <td>ID_fa19cd5ea9</td>\n",
430
       "      <td>ID_c45659d3d</td>\n",
431
       "      <td>0</td>\n",
432
       "      <td>0</td>\n",
433
       "      <td>0</td>\n",
434
       "      <td>0</td>\n",
435
       "      <td>0</td>\n",
436
       "      <td>0</td>\n",
437
       "    </tr>\n",
438
       "    <tr>\n",
439
       "      <th>229790</th>\n",
440
       "      <td>ID_fa19cd5ea9</td>\n",
441
       "      <td>ID_4e0bdd2ba</td>\n",
442
       "      <td>0</td>\n",
443
       "      <td>0</td>\n",
444
       "      <td>0</td>\n",
445
       "      <td>0</td>\n",
446
       "      <td>0</td>\n",
447
       "      <td>0</td>\n",
448
       "    </tr>\n",
449
       "    <tr>\n",
450
       "      <th>22395</th>\n",
451
       "      <td>ID_fa19cd5ea9</td>\n",
452
       "      <td>ID_079945c27</td>\n",
453
       "      <td>1</td>\n",
454
       "      <td>0</td>\n",
455
       "      <td>0</td>\n",
456
       "      <td>0</td>\n",
457
       "      <td>0</td>\n",
458
       "      <td>1</td>\n",
459
       "    </tr>\n",
460
       "    <tr>\n",
461
       "      <th>746126</th>\n",
462
       "      <td>ID_fa19cd5ea9</td>\n",
463
       "      <td>ID_fdbfb2c17</td>\n",
464
       "      <td>1</td>\n",
465
       "      <td>0</td>\n",
466
       "      <td>0</td>\n",
467
       "      <td>0</td>\n",
468
       "      <td>0</td>\n",
469
       "      <td>1</td>\n",
470
       "    </tr>\n",
471
       "    <tr>\n",
472
       "      <th>253266</th>\n",
473
       "      <td>ID_fa19cd5ea9</td>\n",
474
       "      <td>ID_55f7bbbf2</td>\n",
475
       "      <td>1</td>\n",
476
       "      <td>0</td>\n",
477
       "      <td>0</td>\n",
478
       "      <td>0</td>\n",
479
       "      <td>0</td>\n",
480
       "      <td>1</td>\n",
481
       "    </tr>\n",
482
       "    <tr>\n",
483
       "      <th>549211</th>\n",
484
       "      <td>ID_fa19cd5ea9</td>\n",
485
       "      <td>ID_ba7080372</td>\n",
486
       "      <td>1</td>\n",
487
       "      <td>0</td>\n",
488
       "      <td>0</td>\n",
489
       "      <td>0</td>\n",
490
       "      <td>0</td>\n",
491
       "      <td>1</td>\n",
492
       "    </tr>\n",
493
       "    <tr>\n",
494
       "      <th>592856</th>\n",
495
       "      <td>ID_fa19cd5ea9</td>\n",
496
       "      <td>ID_c964e4096</td>\n",
497
       "      <td>1</td>\n",
498
       "      <td>0</td>\n",
499
       "      <td>0</td>\n",
500
       "      <td>0</td>\n",
501
       "      <td>0</td>\n",
502
       "      <td>1</td>\n",
503
       "    </tr>\n",
504
       "    <tr>\n",
505
       "      <th>183149</th>\n",
506
       "      <td>ID_fa19cd5ea9</td>\n",
507
       "      <td>ID_3e31d57d0</td>\n",
508
       "      <td>1</td>\n",
509
       "      <td>0</td>\n",
510
       "      <td>0</td>\n",
511
       "      <td>0</td>\n",
512
       "      <td>0</td>\n",
513
       "      <td>1</td>\n",
514
       "    </tr>\n",
515
       "    <tr>\n",
516
       "      <th>306771</th>\n",
517
       "      <td>ID_fa19cd5ea9</td>\n",
518
       "      <td>ID_680b2194c</td>\n",
519
       "      <td>1</td>\n",
520
       "      <td>0</td>\n",
521
       "      <td>0</td>\n",
522
       "      <td>0</td>\n",
523
       "      <td>0</td>\n",
524
       "      <td>1</td>\n",
525
       "    </tr>\n",
526
       "    <tr>\n",
527
       "      <th>540358</th>\n",
528
       "      <td>ID_fa19cd5ea9</td>\n",
529
       "      <td>ID_b76b13444</td>\n",
530
       "      <td>1</td>\n",
531
       "      <td>0</td>\n",
532
       "      <td>0</td>\n",
533
       "      <td>0</td>\n",
534
       "      <td>0</td>\n",
535
       "      <td>1</td>\n",
536
       "    </tr>\n",
537
       "    <tr>\n",
538
       "      <th>645217</th>\n",
539
       "      <td>ID_fa19cd5ea9</td>\n",
540
       "      <td>ID_db48a633d</td>\n",
541
       "      <td>1</td>\n",
542
       "      <td>0</td>\n",
543
       "      <td>0</td>\n",
544
       "      <td>0</td>\n",
545
       "      <td>0</td>\n",
546
       "      <td>1</td>\n",
547
       "    </tr>\n",
548
       "    <tr>\n",
549
       "      <th>270974</th>\n",
550
       "      <td>ID_fa19cd5ea9</td>\n",
551
       "      <td>ID_5bf2ca43f</td>\n",
552
       "      <td>1</td>\n",
553
       "      <td>0</td>\n",
554
       "      <td>0</td>\n",
555
       "      <td>0</td>\n",
556
       "      <td>0</td>\n",
557
       "      <td>1</td>\n",
558
       "    </tr>\n",
559
       "    <tr>\n",
560
       "      <th>672814</th>\n",
561
       "      <td>ID_fa19cd5ea9</td>\n",
562
       "      <td>ID_e4b636907</td>\n",
563
       "      <td>1</td>\n",
564
       "      <td>0</td>\n",
565
       "      <td>0</td>\n",
566
       "      <td>0</td>\n",
567
       "      <td>0</td>\n",
568
       "      <td>1</td>\n",
569
       "    </tr>\n",
570
       "    <tr>\n",
571
       "      <th>350834</th>\n",
572
       "      <td>ID_fa19cd5ea9</td>\n",
573
       "      <td>ID_7714ead69</td>\n",
574
       "      <td>1</td>\n",
575
       "      <td>0</td>\n",
576
       "      <td>0</td>\n",
577
       "      <td>0</td>\n",
578
       "      <td>0</td>\n",
579
       "      <td>1</td>\n",
580
       "    </tr>\n",
581
       "    <tr>\n",
582
       "      <th>749886</th>\n",
583
       "      <td>ID_fa19cd5ea9</td>\n",
584
       "      <td>ID_ff012ee5b</td>\n",
585
       "      <td>1</td>\n",
586
       "      <td>0</td>\n",
587
       "      <td>0</td>\n",
588
       "      <td>0</td>\n",
589
       "      <td>0</td>\n",
590
       "      <td>1</td>\n",
591
       "    </tr>\n",
592
       "    <tr>\n",
593
       "      <th>523978</th>\n",
594
       "      <td>ID_fa19cd5ea9</td>\n",
595
       "      <td>ID_b1cea5abb</td>\n",
596
       "      <td>1</td>\n",
597
       "      <td>0</td>\n",
598
       "      <td>0</td>\n",
599
       "      <td>0</td>\n",
600
       "      <td>0</td>\n",
601
       "      <td>1</td>\n",
602
       "    </tr>\n",
603
       "    <tr>\n",
604
       "      <th>464942</th>\n",
605
       "      <td>ID_fa19cd5ea9</td>\n",
606
       "      <td>ID_9dad2eb09</td>\n",
607
       "      <td>1</td>\n",
608
       "      <td>0</td>\n",
609
       "      <td>0</td>\n",
610
       "      <td>0</td>\n",
611
       "      <td>0</td>\n",
612
       "      <td>1</td>\n",
613
       "    </tr>\n",
614
       "    <tr>\n",
615
       "      <th>229881</th>\n",
616
       "      <td>ID_fa19cd5ea9</td>\n",
617
       "      <td>ID_4e14d0fe8</td>\n",
618
       "      <td>1</td>\n",
619
       "      <td>0</td>\n",
620
       "      <td>0</td>\n",
621
       "      <td>0</td>\n",
622
       "      <td>0</td>\n",
623
       "      <td>1</td>\n",
624
       "    </tr>\n",
625
       "    <tr>\n",
626
       "      <th>186237</th>\n",
627
       "      <td>ID_fa19cd5ea9</td>\n",
628
       "      <td>ID_3f422852d</td>\n",
629
       "      <td>1</td>\n",
630
       "      <td>0</td>\n",
631
       "      <td>0</td>\n",
632
       "      <td>0</td>\n",
633
       "      <td>0</td>\n",
634
       "      <td>1</td>\n",
635
       "    </tr>\n",
636
       "    <tr>\n",
637
       "      <th>599624</th>\n",
638
       "      <td>ID_fa19cd5ea9</td>\n",
639
       "      <td>ID_cbbb50e6d</td>\n",
640
       "      <td>1</td>\n",
641
       "      <td>0</td>\n",
642
       "      <td>0</td>\n",
643
       "      <td>0</td>\n",
644
       "      <td>0</td>\n",
645
       "      <td>1</td>\n",
646
       "    </tr>\n",
647
       "    <tr>\n",
648
       "      <th>347055</th>\n",
649
       "      <td>ID_fa19cd5ea9</td>\n",
650
       "      <td>ID_75cbdae68</td>\n",
651
       "      <td>1</td>\n",
652
       "      <td>0</td>\n",
653
       "      <td>0</td>\n",
654
       "      <td>0</td>\n",
655
       "      <td>0</td>\n",
656
       "      <td>1</td>\n",
657
       "    </tr>\n",
658
       "    <tr>\n",
659
       "      <th>359450</th>\n",
660
       "      <td>ID_fa19cd5ea9</td>\n",
661
       "      <td>ID_7a02fdbea</td>\n",
662
       "      <td>1</td>\n",
663
       "      <td>0</td>\n",
664
       "      <td>0</td>\n",
665
       "      <td>0</td>\n",
666
       "      <td>0</td>\n",
667
       "      <td>1</td>\n",
668
       "    </tr>\n",
669
       "    <tr>\n",
670
       "      <th>127205</th>\n",
671
       "      <td>ID_fa19cd5ea9</td>\n",
672
       "      <td>ID_2b3671dd9</td>\n",
673
       "      <td>1</td>\n",
674
       "      <td>0</td>\n",
675
       "      <td>0</td>\n",
676
       "      <td>0</td>\n",
677
       "      <td>0</td>\n",
678
       "      <td>1</td>\n",
679
       "    </tr>\n",
680
       "    <tr>\n",
681
       "      <th>148587</th>\n",
682
       "      <td>ID_fa19cd5ea9</td>\n",
683
       "      <td>ID_3274f5977</td>\n",
684
       "      <td>0</td>\n",
685
       "      <td>0</td>\n",
686
       "      <td>0</td>\n",
687
       "      <td>0</td>\n",
688
       "      <td>0</td>\n",
689
       "      <td>0</td>\n",
690
       "    </tr>\n",
691
       "    <tr>\n",
692
       "      <th>413641</th>\n",
693
       "      <td>ID_fa19cd5ea9</td>\n",
694
       "      <td>ID_8c5fc9e44</td>\n",
695
       "      <td>0</td>\n",
696
       "      <td>0</td>\n",
697
       "      <td>0</td>\n",
698
       "      <td>0</td>\n",
699
       "      <td>0</td>\n",
700
       "      <td>0</td>\n",
701
       "    </tr>\n",
702
       "    <tr>\n",
703
       "      <th>688538</th>\n",
704
       "      <td>ID_fa19cd5ea9</td>\n",
705
       "      <td>ID_ea2861e9a</td>\n",
706
       "      <td>0</td>\n",
707
       "      <td>0</td>\n",
708
       "      <td>0</td>\n",
709
       "      <td>0</td>\n",
710
       "      <td>0</td>\n",
711
       "      <td>0</td>\n",
712
       "    </tr>\n",
713
       "    <tr>\n",
714
       "      <th>318670</th>\n",
715
       "      <td>ID_fa19cd5ea9</td>\n",
716
       "      <td>ID_6c19c9f7b</td>\n",
717
       "      <td>0</td>\n",
718
       "      <td>0</td>\n",
719
       "      <td>0</td>\n",
720
       "      <td>0</td>\n",
721
       "      <td>0</td>\n",
722
       "      <td>0</td>\n",
723
       "    </tr>\n",
724
       "    <tr>\n",
725
       "      <th>630472</th>\n",
726
       "      <td>ID_fa19cd5ea9</td>\n",
727
       "      <td>ID_d6435f3bf</td>\n",
728
       "      <td>0</td>\n",
729
       "      <td>0</td>\n",
730
       "      <td>0</td>\n",
731
       "      <td>0</td>\n",
732
       "      <td>0</td>\n",
733
       "      <td>0</td>\n",
734
       "    </tr>\n",
735
       "    <tr>\n",
736
       "      <th>202656</th>\n",
737
       "      <td>ID_fa19cd5ea9</td>\n",
738
       "      <td>ID_44d57858e</td>\n",
739
       "      <td>0</td>\n",
740
       "      <td>0</td>\n",
741
       "      <td>0</td>\n",
742
       "      <td>0</td>\n",
743
       "      <td>0</td>\n",
744
       "      <td>0</td>\n",
745
       "    </tr>\n",
746
       "  </tbody>\n",
747
       "</table>\n",
748
       "</div>"
749
      ],
750
      "text/plain": [
751
       "       SeriesInstanceUID           fid  any  epidural  intraparenchymal  \\\n",
752
       "577964     ID_fa19cd5ea9  ID_c45659d3d    0         0                 0   \n",
753
       "229790     ID_fa19cd5ea9  ID_4e0bdd2ba    0         0                 0   \n",
754
       "22395      ID_fa19cd5ea9  ID_079945c27    1         0                 0   \n",
755
       "746126     ID_fa19cd5ea9  ID_fdbfb2c17    1         0                 0   \n",
756
       "253266     ID_fa19cd5ea9  ID_55f7bbbf2    1         0                 0   \n",
757
       "549211     ID_fa19cd5ea9  ID_ba7080372    1         0                 0   \n",
758
       "592856     ID_fa19cd5ea9  ID_c964e4096    1         0                 0   \n",
759
       "183149     ID_fa19cd5ea9  ID_3e31d57d0    1         0                 0   \n",
760
       "306771     ID_fa19cd5ea9  ID_680b2194c    1         0                 0   \n",
761
       "540358     ID_fa19cd5ea9  ID_b76b13444    1         0                 0   \n",
762
       "645217     ID_fa19cd5ea9  ID_db48a633d    1         0                 0   \n",
763
       "270974     ID_fa19cd5ea9  ID_5bf2ca43f    1         0                 0   \n",
764
       "672814     ID_fa19cd5ea9  ID_e4b636907    1         0                 0   \n",
765
       "350834     ID_fa19cd5ea9  ID_7714ead69    1         0                 0   \n",
766
       "749886     ID_fa19cd5ea9  ID_ff012ee5b    1         0                 0   \n",
767
       "523978     ID_fa19cd5ea9  ID_b1cea5abb    1         0                 0   \n",
768
       "464942     ID_fa19cd5ea9  ID_9dad2eb09    1         0                 0   \n",
769
       "229881     ID_fa19cd5ea9  ID_4e14d0fe8    1         0                 0   \n",
770
       "186237     ID_fa19cd5ea9  ID_3f422852d    1         0                 0   \n",
771
       "599624     ID_fa19cd5ea9  ID_cbbb50e6d    1         0                 0   \n",
772
       "347055     ID_fa19cd5ea9  ID_75cbdae68    1         0                 0   \n",
773
       "359450     ID_fa19cd5ea9  ID_7a02fdbea    1         0                 0   \n",
774
       "127205     ID_fa19cd5ea9  ID_2b3671dd9    1         0                 0   \n",
775
       "148587     ID_fa19cd5ea9  ID_3274f5977    0         0                 0   \n",
776
       "413641     ID_fa19cd5ea9  ID_8c5fc9e44    0         0                 0   \n",
777
       "688538     ID_fa19cd5ea9  ID_ea2861e9a    0         0                 0   \n",
778
       "318670     ID_fa19cd5ea9  ID_6c19c9f7b    0         0                 0   \n",
779
       "630472     ID_fa19cd5ea9  ID_d6435f3bf    0         0                 0   \n",
780
       "202656     ID_fa19cd5ea9  ID_44d57858e    0         0                 0   \n",
781
       "\n",
782
       "        intraventricular  subarachnoid  subdural  \n",
783
       "577964                 0             0         0  \n",
784
       "229790                 0             0         0  \n",
785
       "22395                  0             0         1  \n",
786
       "746126                 0             0         1  \n",
787
       "253266                 0             0         1  \n",
788
       "549211                 0             0         1  \n",
789
       "592856                 0             0         1  \n",
790
       "183149                 0             0         1  \n",
791
       "306771                 0             0         1  \n",
792
       "540358                 0             0         1  \n",
793
       "645217                 0             0         1  \n",
794
       "270974                 0             0         1  \n",
795
       "672814                 0             0         1  \n",
796
       "350834                 0             0         1  \n",
797
       "749886                 0             0         1  \n",
798
       "523978                 0             0         1  \n",
799
       "464942                 0             0         1  \n",
800
       "229881                 0             0         1  \n",
801
       "186237                 0             0         1  \n",
802
       "599624                 0             0         1  \n",
803
       "347055                 0             0         1  \n",
804
       "359450                 0             0         1  \n",
805
       "127205                 0             0         1  \n",
806
       "148587                 0             0         0  \n",
807
       "413641                 0             0         0  \n",
808
       "688538                 0             0         0  \n",
809
       "318670                 0             0         0  \n",
810
       "630472                 0             0         0  \n",
811
       "202656                 0             0         0  "
812
      ]
813
     },
814
     "execution_count": 9,
815
     "metadata": {},
816
     "output_type": "execute_result"
817
    }
818
   ],
819
   "source": [
820
    "subg"
821
   ]
822
  },
823
  {
824
   "cell_type": "code",
825
   "execution_count": 10,
826
   "metadata": {},
827
   "outputs": [],
828
   "source": [
829
    "# You can use a btrfs snapshot and rename files by study_id and z-pos through the brain\n",
830
    "def rename_train_group(subg):\n",
831
    "    ix = 0\n",
832
    "    total = len(subg)\n",
833
    "    for index, row in subg.iterrows():\n",
834
    "        cur_fn = row['fid']\n",
835
    "        new_fn = f\"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{row['any']}_{row['epidural']}_{row['intraparenchymal']}_{row['intraventricular']}_{row['subarachnoid']}_{row['subdural']}_{cur_fn}\"\n",
836
    "        ix += 1\n",
837
    "        Path(f'data/unzip_renamed/{stage}_train_images/{cur_fn}.dcm').rename(f'data/unzip_renamed/{stage}_train_images/{new_fn}.dcm')"
838
   ]
839
  },
840
  {
841
   "cell_type": "code",
842
   "execution_count": 11,
843
   "metadata": {},
844
   "outputs": [],
845
   "source": [
846
    "def index_group(subg, study_ix_to_fn, fn_to_study_ix):\n",
847
    "    \"\"\"Append each row's SOPInstanceUID to study_ix_to_fn[series] and set fn_to_study_ix[sop] = (series, position).\"\"\"\n",
848
    "    # position = 0-based rank of the row in subg's iteration order; both mappings are updated in place, nothing is returned\n",
849
    "    for pos, (_, rec) in enumerate(subg.iterrows()):\n",
850
    "        fn = rec['SOPInstanceUID']\n",
851
    "        study = rec['SeriesInstanceUID']\n",
852
    "        study_ix_to_fn.setdefault(study, []).append(fn)  # setdefault: a plain dict works as well as a defaultdict(list)\n",
853
    "        fn_to_study_ix[fn] = (study, pos)"
854
   ]
855
  },
856
  {
857
   "cell_type": "code",
858
   "execution_count": 12,
859
   "metadata": {},
860
   "outputs": [],
861
   "source": [
862
    "labels = [ 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural' ]\n",
863
    "\n",
864
    "def label_group(subg, fn_to_labels):\n",
    "    \"\"\"Store, under each row's SOPInstanceUID, the names of the label columns whose value equals 1.\"\"\"\n",
865
    "    for _, rec in subg.iterrows():\n",
866
    "        sop_uid = rec['SOPInstanceUID']\n",
867
    "        fn_to_labels[sop_uid] = [ col for col in labels if rec[col] == 1 ]"
868
   ]
869
  },
870
  {
871
   "cell_type": "code",
872
   "execution_count": 13,
873
   "metadata": {},
874
   "outputs": [],
875
   "source": [
876
    "# Containers are re-created on every run, so re-executing this cell never appends duplicates\n",
877
    "train_study_ix_to_fn, train_fn_to_study_ix, train_fn_to_labels = defaultdict(list), {}, {}\n",
878
    "\n",
879
    "for name, subg in gs:\n",
880
    "    # rename_train_group(subg)  # opt-in: renames the image files on disk\n",
881
    "    index_group(subg, train_study_ix_to_fn, train_fn_to_study_ix)\n",
882
    "    label_group(subg, train_fn_to_labels)"
884
   ]
885
  },
886
  {
887
   "cell_type": "code",
888
   "execution_count": 14,
889
   "metadata": {},
890
   "outputs": [],
891
   "source": [
892
    "# Per-split train mappings; the merged train+test mappings are pickled at the end of the notebook\n",
893
    "with open(f\"data/{stage}_train_study_ix_to_fn.pickle\", \"wb\") as fh:  # context manager closes the file\n",
    "    pickle.dump(train_study_ix_to_fn, fh)\n",
894
    "with open(f\"data/{stage}_train_fn_to_study_ix.pickle\", \"wb\") as fh:\n",
    "    pickle.dump(train_fn_to_study_ix, fh)"
895
   ]
896
  },
897
  {
898
   "cell_type": "code",
899
   "execution_count": 15,
900
   "metadata": {},
901
   "outputs": [
902
    {
903
     "data": {
904
      "text/plain": [
905
       "['any', 'subdural']"
906
      ]
907
     },
908
     "execution_count": 15,
909
     "metadata": {},
910
     "output_type": "execute_result"
911
    }
912
   ],
913
   "source": [
914
    "train_fn_to_labels['ID_079945c27']"
915
   ]
916
  },
917
  {
918
   "cell_type": "code",
919
   "execution_count": 16,
920
   "metadata": {},
921
   "outputs": [],
922
   "source": [
923
    "with open(f\"data/{stage}_train_fn_to_labels.pickle\", 'wb') as fh:  # context manager closes the file\n",
    "    pickle.dump(train_fn_to_labels, fh)"
924
   ]
925
  },
926
  {
927
   "cell_type": "markdown",
928
   "metadata": {},
929
   "source": [
930
    "# Test dataset"
931
   ]
932
  },
933
  {
934
   "cell_type": "code",
935
   "execution_count": 17,
936
   "metadata": {},
937
   "outputs": [],
938
   "source": [
939
    "df = pd.read_csv(f\"data/{stage}_test_dicom.csv\")"
940
   ]
941
  },
942
  {
943
   "cell_type": "code",
944
   "execution_count": 18,
945
   "metadata": {},
946
   "outputs": [
947
    {
948
     "data": {
949
      "text/html": [
950
       "<div>\n",
951
       "<style scoped>\n",
952
       "    .dataframe tbody tr th:only-of-type {\n",
953
       "        vertical-align: middle;\n",
954
       "    }\n",
955
       "\n",
956
       "    .dataframe tbody tr th {\n",
957
       "        vertical-align: top;\n",
958
       "    }\n",
959
       "\n",
960
       "    .dataframe thead th {\n",
961
       "        text-align: right;\n",
962
       "    }\n",
963
       "</style>\n",
964
       "<table border=\"1\" class=\"dataframe\">\n",
965
       "  <thead>\n",
966
       "    <tr style=\"text-align: right;\">\n",
967
       "      <th></th>\n",
968
       "      <th>Unnamed: 0</th>\n",
969
       "      <th>BitsAllocated</th>\n",
970
       "      <th>BitsStored</th>\n",
971
       "      <th>Columns</th>\n",
972
       "      <th>HighBit</th>\n",
973
       "      <th>ImageOrientationPatient_0</th>\n",
974
       "      <th>ImageOrientationPatient_1</th>\n",
975
       "      <th>ImageOrientationPatient_2</th>\n",
976
       "      <th>ImageOrientationPatient_3</th>\n",
977
       "      <th>ImageOrientationPatient_4</th>\n",
978
       "      <th>...</th>\n",
979
       "      <th>SamplesPerPixel</th>\n",
980
       "      <th>SeriesInstanceUID</th>\n",
981
       "      <th>StudyID</th>\n",
982
       "      <th>StudyInstanceUID</th>\n",
983
       "      <th>WindowCenter</th>\n",
984
       "      <th>WindowCenter_0</th>\n",
985
       "      <th>WindowCenter_1</th>\n",
986
       "      <th>WindowWidth</th>\n",
987
       "      <th>WindowWidth_0</th>\n",
988
       "      <th>WindowWidth_1</th>\n",
989
       "    </tr>\n",
990
       "  </thead>\n",
991
       "  <tbody>\n",
992
       "    <tr>\n",
993
       "      <th>0</th>\n",
994
       "      <td>0</td>\n",
995
       "      <td>16</td>\n",
996
       "      <td>12</td>\n",
997
       "      <td>512</td>\n",
998
       "      <td>11</td>\n",
999
       "      <td>1.0</td>\n",
1000
       "      <td>0.0</td>\n",
1001
       "      <td>0.0</td>\n",
1002
       "      <td>0.0</td>\n",
1003
       "      <td>0.981627</td>\n",
1004
       "      <td>...</td>\n",
1005
       "      <td>1</td>\n",
1006
       "      <td>ID_4d28912ba6</td>\n",
1007
       "      <td>NaN</td>\n",
1008
       "      <td>ID_1f6d1e8aeb</td>\n",
1009
       "      <td>NaN</td>\n",
1010
       "      <td>40.0</td>\n",
1011
       "      <td>40.0</td>\n",
1012
       "      <td>NaN</td>\n",
1013
       "      <td>80.0</td>\n",
1014
       "      <td>80.0</td>\n",
1015
       "    </tr>\n",
1016
       "    <tr>\n",
1017
       "      <th>1</th>\n",
1018
       "      <td>1</td>\n",
1019
       "      <td>16</td>\n",
1020
       "      <td>16</td>\n",
1021
       "      <td>512</td>\n",
1022
       "      <td>15</td>\n",
1023
       "      <td>1.0</td>\n",
1024
       "      <td>0.0</td>\n",
1025
       "      <td>0.0</td>\n",
1026
       "      <td>0.0</td>\n",
1027
       "      <td>0.987688</td>\n",
1028
       "      <td>...</td>\n",
1029
       "      <td>1</td>\n",
1030
       "      <td>ID_acabdeee86</td>\n",
1031
       "      <td>NaN</td>\n",
1032
       "      <td>ID_4a8d7ec19f</td>\n",
1033
       "      <td>30.0</td>\n",
1034
       "      <td>NaN</td>\n",
1035
       "      <td>NaN</td>\n",
1036
       "      <td>80.0</td>\n",
1037
       "      <td>NaN</td>\n",
1038
       "      <td>NaN</td>\n",
1039
       "    </tr>\n",
1040
       "    <tr>\n",
1041
       "      <th>2</th>\n",
1042
       "      <td>2</td>\n",
1043
       "      <td>16</td>\n",
1044
       "      <td>16</td>\n",
1045
       "      <td>512</td>\n",
1046
       "      <td>15</td>\n",
1047
       "      <td>1.0</td>\n",
1048
       "      <td>0.0</td>\n",
1049
       "      <td>0.0</td>\n",
1050
       "      <td>0.0</td>\n",
1051
       "      <td>0.927184</td>\n",
1052
       "      <td>...</td>\n",
1053
       "      <td>1</td>\n",
1054
       "      <td>ID_d00cee7f0c</td>\n",
1055
       "      <td>NaN</td>\n",
1056
       "      <td>ID_a6ca244172</td>\n",
1057
       "      <td>30.0</td>\n",
1058
       "      <td>NaN</td>\n",
1059
       "      <td>NaN</td>\n",
1060
       "      <td>80.0</td>\n",
1061
       "      <td>NaN</td>\n",
1062
       "      <td>NaN</td>\n",
1063
       "    </tr>\n",
1064
       "    <tr>\n",
1065
       "      <th>3</th>\n",
1066
       "      <td>3</td>\n",
1067
       "      <td>16</td>\n",
1068
       "      <td>16</td>\n",
1069
       "      <td>512</td>\n",
1070
       "      <td>15</td>\n",
1071
       "      <td>1.0</td>\n",
1072
       "      <td>0.0</td>\n",
1073
       "      <td>0.0</td>\n",
1074
       "      <td>0.0</td>\n",
1075
       "      <td>0.986286</td>\n",
1076
       "      <td>...</td>\n",
1077
       "      <td>1</td>\n",
1078
       "      <td>ID_a52a0112d5</td>\n",
1079
       "      <td>NaN</td>\n",
1080
       "      <td>ID_fa950a03af</td>\n",
1081
       "      <td>30.0</td>\n",
1082
       "      <td>NaN</td>\n",
1083
       "      <td>NaN</td>\n",
1084
       "      <td>80.0</td>\n",
1085
       "      <td>NaN</td>\n",
1086
       "      <td>NaN</td>\n",
1087
       "    </tr>\n",
1088
       "    <tr>\n",
1089
       "      <th>4</th>\n",
1090
       "      <td>4</td>\n",
1091
       "      <td>16</td>\n",
1092
       "      <td>12</td>\n",
1093
       "      <td>512</td>\n",
1094
       "      <td>11</td>\n",
1095
       "      <td>1.0</td>\n",
1096
       "      <td>0.0</td>\n",
1097
       "      <td>0.0</td>\n",
1098
       "      <td>0.0</td>\n",
1099
       "      <td>1.000000</td>\n",
1100
       "      <td>...</td>\n",
1101
       "      <td>1</td>\n",
1102
       "      <td>ID_f552d3b922</td>\n",
1103
       "      <td>NaN</td>\n",
1104
       "      <td>ID_965d8b3d8e</td>\n",
1105
       "      <td>NaN</td>\n",
1106
       "      <td>36.0</td>\n",
1107
       "      <td>36.0</td>\n",
1108
       "      <td>NaN</td>\n",
1109
       "      <td>80.0</td>\n",
1110
       "      <td>80.0</td>\n",
1111
       "    </tr>\n",
1112
       "  </tbody>\n",
1113
       "</table>\n",
1114
       "<p>5 rows × 34 columns</p>\n",
1115
       "</div>"
1116
      ],
1117
      "text/plain": [
1118
       "   Unnamed: 0  BitsAllocated  BitsStored  Columns  HighBit  \\\n",
1119
       "0           0             16          12      512       11   \n",
1120
       "1           1             16          16      512       15   \n",
1121
       "2           2             16          16      512       15   \n",
1122
       "3           3             16          16      512       15   \n",
1123
       "4           4             16          12      512       11   \n",
1124
       "\n",
1125
       "   ImageOrientationPatient_0  ImageOrientationPatient_1  \\\n",
1126
       "0                        1.0                        0.0   \n",
1127
       "1                        1.0                        0.0   \n",
1128
       "2                        1.0                        0.0   \n",
1129
       "3                        1.0                        0.0   \n",
1130
       "4                        1.0                        0.0   \n",
1131
       "\n",
1132
       "   ImageOrientationPatient_2  ImageOrientationPatient_3  \\\n",
1133
       "0                        0.0                        0.0   \n",
1134
       "1                        0.0                        0.0   \n",
1135
       "2                        0.0                        0.0   \n",
1136
       "3                        0.0                        0.0   \n",
1137
       "4                        0.0                        0.0   \n",
1138
       "\n",
1139
       "   ImageOrientationPatient_4  ...  SamplesPerPixel  SeriesInstanceUID  \\\n",
1140
       "0                   0.981627  ...                1      ID_4d28912ba6   \n",
1141
       "1                   0.987688  ...                1      ID_acabdeee86   \n",
1142
       "2                   0.927184  ...                1      ID_d00cee7f0c   \n",
1143
       "3                   0.986286  ...                1      ID_a52a0112d5   \n",
1144
       "4                   1.000000  ...                1      ID_f552d3b922   \n",
1145
       "\n",
1146
       "   StudyID  StudyInstanceUID WindowCenter WindowCenter_0 WindowCenter_1  \\\n",
1147
       "0      NaN     ID_1f6d1e8aeb          NaN           40.0           40.0   \n",
1148
       "1      NaN     ID_4a8d7ec19f         30.0            NaN            NaN   \n",
1149
       "2      NaN     ID_a6ca244172         30.0            NaN            NaN   \n",
1150
       "3      NaN     ID_fa950a03af         30.0            NaN            NaN   \n",
1151
       "4      NaN     ID_965d8b3d8e          NaN           36.0           36.0   \n",
1152
       "\n",
1153
       "   WindowWidth  WindowWidth_0  WindowWidth_1  \n",
1154
       "0          NaN           80.0           80.0  \n",
1155
       "1         80.0            NaN            NaN  \n",
1156
       "2         80.0            NaN            NaN  \n",
1157
       "3         80.0            NaN            NaN  \n",
1158
       "4          NaN           80.0           80.0  \n",
1159
       "\n",
1160
       "[5 rows x 34 columns]"
1161
      ]
1162
     },
1163
     "execution_count": 18,
1164
     "metadata": {},
1165
     "output_type": "execute_result"
1166
    }
1167
   ],
1168
   "source": [
1169
    "df.head()"
1170
   ]
1171
  },
1172
  {
1173
   "cell_type": "code",
1174
   "execution_count": 19,
1175
   "metadata": {},
1176
   "outputs": [
1177
    {
1178
     "data": {
1179
      "text/plain": [
1180
       "3518"
1181
      ]
1182
     },
1183
     "execution_count": 19,
1184
     "metadata": {},
1185
     "output_type": "execute_result"
1186
    }
1187
   ],
1188
   "source": [
1189
    "# sort, then group by (order is preserved within groups)\n",
1190
    "gs = df.sort_values('ImagePositionPatient_2').groupby('SeriesInstanceUID')\n",
1191
    "len(gs)"
1192
   ]
1193
  },
1194
  {
1195
   "cell_type": "code",
1196
   "execution_count": 20,
1197
   "metadata": {},
1198
   "outputs": [],
1199
   "source": [
1200
    "def rename_test_group(subg):\n",
1201
    "    \"\"\"Rename each <SOPInstanceUID>.dcm of the group to <series>_<pos>_<total>_<SOPInstanceUID>.dcm.\"\"\"\n",
1202
    "    n_rows = len(subg)\n",
1203
    "    for pos, (_, rec) in enumerate(subg.iterrows()):\n",
1204
    "        old_stem = rec['SOPInstanceUID']\n",
1205
    "        new_stem = f\"{rec['SeriesInstanceUID']}_{pos:03}_{n_rows:03}_{old_stem}\"\n",
1206
    "        src = Path(f'data/unzip_renamed/{stage}_test_images/{old_stem}.dcm')\n",
1207
    "        src.rename(f'data/unzip_renamed/{stage}_test_images/{new_stem}.dcm')\n"
1208
   ]
1209
  },
1210
  {
1211
   "cell_type": "code",
1212
   "execution_count": 21,
1213
   "metadata": {},
1214
   "outputs": [],
1215
   "source": [
1216
    "# Containers are re-created on every run, so re-executing this cell never appends duplicates\n",
1217
    "test_study_ix_to_fn, test_fn_to_study_ix = defaultdict(list), {}\n",
1218
    "\n",
1219
    "for name, subg in gs:\n",
1220
    "    # rename_test_group(subg)  # opt-in: renames the image files on disk\n",
1221
    "    index_group(subg, test_study_ix_to_fn, test_fn_to_study_ix)\n"
1222
   ]
1223
  },
1224
  {
1225
   "cell_type": "code",
1226
   "execution_count": 22,
1227
   "metadata": {},
1228
   "outputs": [],
1229
   "source": [
1230
    "with open(f\"data/{stage}_test_study_ix_to_fn.pickle\", \"wb\") as fh:  # context manager closes the file\n",
    "    pickle.dump(test_study_ix_to_fn, fh)\n",
1231
    "with open(f\"data/{stage}_test_fn_to_study_ix.pickle\", \"wb\") as fh:\n",
    "    pickle.dump(test_fn_to_study_ix, fh)"
1232
   ]
1233
  },
1234
  {
1235
   "cell_type": "code",
1236
   "execution_count": 23,
1237
   "metadata": {},
1238
   "outputs": [],
1239
   "source": [
1240
    "# Test entries win on duplicate keys; both results are plain dicts (not defaultdicts)\n",
    "study_ix_to_fn = dict(train_study_ix_to_fn)\n",
    "study_ix_to_fn.update(test_study_ix_to_fn)\n",
1241
    "fn_to_study_ix = dict(train_fn_to_study_ix)\n",
    "fn_to_study_ix.update(test_fn_to_study_ix)"
1242
   ]
1243
  },
1244
  {
1245
   "cell_type": "code",
1246
   "execution_count": 24,
1247
   "metadata": {},
1248
   "outputs": [],
1249
   "source": [
1250
    "with open(f\"data/{stage}_study_ix_to_fn.pickle\", \"wb\") as fh:  # context manager closes the file\n",
    "    pickle.dump(study_ix_to_fn, fh)\n",
1251
    "with open(f\"data/{stage}_fn_to_study_ix.pickle\", \"wb\") as fh:\n",
    "    pickle.dump(fn_to_study_ix, fh)"
1252
   ]
1253
  }
1254
 ],
1255
 "metadata": {
1256
  "kernelspec": {
1257
   "display_name": "Python 3",
1258
   "language": "python",
1259
   "name": "python3"
1260
  },
1261
  "language_info": {
1262
   "codemirror_mode": {
1263
    "name": "ipython",
1264
    "version": 3
1265
   },
1266
   "file_extension": ".py",
1267
   "mimetype": "text/x-python",
1268
   "name": "python",
1269
   "nbconvert_exporter": "python",
1270
   "pygments_lexer": "ipython3",
1271
   "version": "3.7.3"
1272
  }
1273
 },
1274
 "nbformat": 4,
1275
 "nbformat_minor": 2
1276
}