Switch to unified view

a b/src/preprocess/02_event_static.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "id": "bf6469fe",
6
   "metadata": {},
7
   "source": [
8
    "import os\n",
9
    "import sys\n",
10
    "\n",
11
    "src_path = os.path.abspath('../..')\n",
12
    "print(src_path)\n",
13
    "sys.path.append(src_path)"
14
   ],
15
   "outputs": [],
16
   "execution_count": null
17
  },
18
  {
19
   "cell_type": "code",
20
   "id": "fcd74d2c",
21
   "metadata": {},
22
   "source": [
23
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
24
   ],
25
   "outputs": [],
26
   "execution_count": null
27
  },
28
  {
29
   "cell_type": "code",
30
   "id": "4cae42b3",
31
   "metadata": {},
32
   "source": [
33
    "set_seed(seed=42)"
34
   ],
35
   "outputs": [],
36
   "execution_count": null
37
  },
38
  {
39
   "cell_type": "code",
40
   "id": "57e86fc8",
41
   "metadata": {},
42
   "source": [
43
    "import pandas as pd"
44
   ],
45
   "outputs": [],
46
   "execution_count": null
47
  },
48
  {
49
   "cell_type": "code",
50
   "id": "d5a4a2f7",
51
   "metadata": {},
52
   "source": [
53
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
54
    "mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")\n",
55
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
56
   ],
57
   "outputs": [],
58
   "execution_count": null
59
  },
60
  {
61
   "cell_type": "code",
62
   "id": "3d241540",
63
   "metadata": {},
64
   "source": [
65
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
66
    "print(cohort.shape)\n",
67
    "cohort.head()"
68
   ],
69
   "outputs": [],
70
   "execution_count": null
71
  },
72
  {
73
   "cell_type": "code",
74
   "id": "00486178",
75
   "metadata": {},
76
   "source": [
77
    "# Parse the admission and stay boundary columns into datetime values.\n",
    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
    "    cohort[time_col] = pd.to_datetime(cohort[time_col])"
81
   ],
82
   "outputs": [],
83
   "execution_count": null
84
  },
85
  {
86
   "cell_type": "code",
87
   "id": "64cc5546",
88
   "metadata": {},
89
   "source": [
90
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
91
    "len(hadm_ids)"
92
   ],
93
   "outputs": [],
94
   "execution_count": null
95
  },
96
  {
97
   "cell_type": "markdown",
98
   "id": "4774f5d5",
99
   "metadata": {},
100
   "source": [
101
    "## helper"
102
   ]
103
  },
104
  {
105
   "cell_type": "code",
106
   "id": "ff882410",
107
   "metadata": {},
108
   "source": [
109
    "from concurrent.futures import ThreadPoolExecutor\n",
110
    "from tqdm import tqdm\n",
111
    "from pandarallel import pandarallel"
112
   ],
113
   "outputs": [],
114
   "execution_count": null
115
  },
116
  {
117
   "cell_type": "code",
118
   "id": "f841dfbb",
119
   "metadata": {},
120
   "source": [
121
    "pandarallel.initialize(progress_bar=True)"
122
   ],
123
   "outputs": [],
124
   "execution_count": null
125
  },
126
  {
127
   "cell_type": "code",
128
   "id": "65b989bc",
129
   "metadata": {},
130
   "source": [
131
    "def save_group(group_df, hadm_id, event_type):\n",
    "    \"\"\"Write one admission's rows to event_{event_type}/event_{hadm_id}.csv.\n",
    "\n",
    "    The file is placed under the notebook-level output_path; hadm_id is cast to\n",
    "    int for the file name and the DataFrame index is not written.\n",
    "    Returns True after the CSV has been written.\n",
    "    \"\"\"\n",
    "    csv_path = f\"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv\"\n",
    "    group_df.to_csv(path_or_buf=csv_path, index=False)\n",
    "    return True"
135
   ],
136
   "outputs": [],
137
   "execution_count": null
138
  },
139
  {
140
   "cell_type": "markdown",
141
   "id": "29ca0184",
142
   "metadata": {},
143
   "source": [
144
    "## patients"
145
   ]
146
  },
147
  {
148
   "cell_type": "code",
149
   "id": "ce3d37a4",
150
   "metadata": {},
151
   "source": [
152
    "patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
153
    "print(patients.shape)\n",
154
    "patients.head()"
155
   ],
156
   "outputs": [],
157
   "execution_count": null
158
  },
159
  {
160
   "cell_type": "code",
161
   "id": "96e9a2d4",
162
   "metadata": {},
163
   "source": [
164
    "cohort = cohort.merge(patients[[\"subject_id\", \"gender\", \"anchor_age\", \"anchor_year\"]], on=\"subject_id\", how=\"inner\")\n",
165
    "cohort[\"age\"] = cohort.hadm_intime.dt.year - cohort.anchor_year + cohort.anchor_age\n",
166
    "print(cohort.shape)\n",
167
    "cohort.head()"
168
   ],
169
   "outputs": [],
170
   "execution_count": null
171
  },
172
  {
173
   "cell_type": "code",
174
   "id": "7e56a347",
175
   "metadata": {},
176
   "source": [
177
    "print(cohort.age.min())\n",
178
    "print(cohort.age.max())\n",
179
    "print(cohort.age.mean())\n",
180
    "print(cohort.age.std())"
181
   ],
182
   "outputs": [],
183
   "execution_count": null
184
  },
185
  {
186
   "cell_type": "code",
187
   "id": "a3dc3f7d",
188
   "metadata": {},
189
   "source": [
190
    "cohort.gender.value_counts()"
191
   ],
192
   "outputs": [],
193
   "execution_count": null
194
  },
195
  {
196
   "cell_type": "markdown",
197
   "id": "685091a4",
198
   "metadata": {},
199
   "source": [
200
    "## admissions"
201
   ]
202
  },
203
  {
204
   "cell_type": "code",
205
   "id": "20da413d",
206
   "metadata": {},
207
   "source": [
208
    "admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
209
    "print(admissions.shape)\n",
210
    "admissions.head()"
211
   ],
212
   "outputs": [],
213
   "execution_count": null
214
  },
215
  {
216
   "cell_type": "code",
217
   "id": "e5796ed0",
218
   "metadata": {},
219
   "source": [
220
    "cohort = cohort.merge(admissions[[\"subject_id\", \"hadm_id\", \"admission_type\", \"admission_location\", \"insurance\", \"language\", \"marital_status\", \"race\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
221
    "print(cohort.shape)\n",
222
    "cohort.head()"
223
   ],
224
   "outputs": [],
225
   "execution_count": null
226
  },
227
  {
228
   "cell_type": "markdown",
229
   "id": "3b1486b0",
230
   "metadata": {},
231
   "source": [
232
    "## discharge"
233
   ]
234
  },
235
  {
236
   "cell_type": "code",
237
   "id": "f3c47a56",
238
   "metadata": {},
239
   "source": [
240
    "discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
241
    "print(discharge.shape)\n",
242
    "discharge.head()"
243
   ],
244
   "outputs": [],
245
   "execution_count": null
246
  },
247
  {
248
   "cell_type": "code",
249
   "id": "695c5b5a",
250
   "metadata": {},
251
   "source": [
252
    "import re\n",
253
    "\n",
254
    "def extract_chief_complaint(discharge_summary):\n",
    "    \"\"\"Return the chief complaint recorded in a discharge summary, or None.\n",
    "\n",
    "    Looks for a 'Chief Complaint:' heading (or the variant '___ Complaint:'),\n",
    "    skips any whitespace after the colon -- including line breaks, so the value\n",
    "    may sit on the line below the heading -- and returns the remainder of that\n",
    "    line with surrounding whitespace removed.\n",
    "\n",
    "    Args:\n",
    "        discharge_summary: Full note text. Any non-string value (for example\n",
    "            None or a float NaN) yields None instead of raising.\n",
    "\n",
    "    Returns:\n",
    "        The complaint text, or None when no heading with a value is found.\n",
    "    \"\"\"\n",
    "    # re.search raises TypeError on non-strings such as NaN, so bail out early.\n",
    "    if not isinstance(discharge_summary, str):\n",
    "        return None\n",
    "\n",
    "    # (?:\\n|\\Z) also accepts a complaint that ends the note without a newline.\n",
    "    pattern = r\"(?:Chief Complaint|___ Complaint):\\s*(.+?)\\s*(?:\\n|\\Z)\"\n",
    "    match = re.search(pattern, discharge_summary)\n",
    "    return match.group(1).strip() if match else None"
267
   ],
268
   "outputs": [],
269
   "execution_count": null
270
  },
271
  {
272
   "cell_type": "code",
273
   "id": "9aa30a39",
274
   "metadata": {},
275
   "source": [
276
    "extract_chief_complaint(discharge.iloc[42332].text)"
277
   ],
278
   "outputs": [],
279
   "execution_count": null
280
  },
281
  {
282
   "cell_type": "code",
283
   "id": "7f32adac",
284
   "metadata": {},
285
   "source": [
286
    "extract_chief_complaint(discharge.iloc[4332].text)"
287
   ],
288
   "outputs": [],
289
   "execution_count": null
290
  },
291
  {
292
   "cell_type": "code",
293
   "id": "8339a776",
294
   "metadata": {},
295
   "source": [
296
    "discharge[\"chief_complaint\"] = discharge.text.parallel_apply(extract_chief_complaint)"
297
   ],
298
   "outputs": [],
299
   "execution_count": null
300
  },
301
  {
302
   "cell_type": "code",
303
   "id": "bb36888f",
304
   "metadata": {},
305
   "source": [
306
    "discharge.head()"
307
   ],
308
   "outputs": [],
309
   "execution_count": null
310
  },
311
  {
312
   "cell_type": "code",
313
   "id": "c251a181",
314
   "metadata": {},
315
   "source": [
316
    "discharge.isna().sum()"
317
   ],
318
   "outputs": [],
319
   "execution_count": null
320
  },
321
  {
322
   "cell_type": "code",
323
   "id": "88ceedbd",
324
   "metadata": {},
325
   "source": [
326
    "cohort = cohort.merge(discharge[[\"subject_id\", \"hadm_id\", \"chief_complaint\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
327
    "print(cohort.shape)\n",
328
    "cohort.head()"
329
   ],
330
   "outputs": [],
331
   "execution_count": null
332
  },
333
  {
334
   "cell_type": "markdown",
335
   "id": "2f2eefa1",
336
   "metadata": {},
337
   "source": [
338
    "## post-process"
339
   ]
340
  },
341
  {
342
   "cell_type": "code",
343
   "id": "6f504122",
344
   "metadata": {},
345
   "source": [
346
    "cohort = cohort.drop(columns=[\"anchor_age\", \"anchor_year\"])\n",
347
    "cohort.head()"
348
   ],
349
   "outputs": [],
350
   "execution_count": null
351
  },
352
  {
353
   "cell_type": "code",
354
   "id": "048fdb57",
355
   "metadata": {},
356
   "source": [
357
    "cohort.isna().sum()"
358
   ],
359
   "outputs": [],
360
   "execution_count": null
361
  },
362
  {
363
   "cell_type": "code",
364
   "id": "192fb0a1",
365
   "metadata": {},
366
   "source": [
367
    "cohort.admission_type.unique()"
368
   ],
369
   "outputs": [],
370
   "execution_count": null
371
  },
372
  {
373
   "cell_type": "code",
374
   "id": "fd7ea73b",
375
   "metadata": {},
376
   "source": [
377
    "cohort.admission_location.unique()"
378
   ],
379
   "outputs": [],
380
   "execution_count": null
381
  },
382
  {
383
   "cell_type": "code",
384
   "id": "2ef33de0",
385
   "metadata": {},
386
   "source": [
387
    "cohort.insurance.unique()"
388
   ],
389
   "outputs": [],
390
   "execution_count": null
391
  },
392
  {
393
   "cell_type": "code",
394
   "id": "01d704cc",
395
   "metadata": {},
396
   "source": [
397
    "cohort.language.unique()"
398
   ],
399
   "outputs": [],
400
   "execution_count": null
401
  },
402
  {
403
   "cell_type": "code",
404
   "id": "85d70f6f",
405
   "metadata": {},
406
   "source": [
407
    "cohort.marital_status.unique()"
408
   ],
409
   "outputs": [],
410
   "execution_count": null
411
  },
412
  {
413
   "cell_type": "code",
414
   "id": "4a4047f3",
415
   "metadata": {},
416
   "source": [
417
    "cohort.race.unique()"
418
   ],
419
   "outputs": [],
420
   "execution_count": null
421
  },
422
  {
423
   "cell_type": "code",
424
   "id": "de006e4c",
425
   "metadata": {},
426
   "source": "event_type = \"patient_demographics\"",
427
   "outputs": [],
428
   "execution_count": null
429
  },
430
  {
431
   "cell_type": "code",
432
   "id": "41fa6397",
433
   "metadata": {},
434
   "source": [
435
    "def generate_event_value(x):\n",
    "    \"\"\"Format one cohort row as the patient_demographics event text.\n",
    "\n",
    "    Gender, age, race and insurance are always included; marital status is\n",
    "    inserted before insurance only when it is not missing.\n",
    "    \"\"\"\n",
    "    # NOTE: a later cell redefines generate_event_value for admission_info;\n",
    "    # re-run this cell before regenerating the demographics events.\n",
    "    marital = \"\" if pd.isna(x.marital_status) else f\", marital status: {x.marital_status}\"\n",
    "    return f\"gender: {x.gender}, age: {x.age}, race: {x.race}\" + marital + f\", insurance: {x.insurance}\""
441
   ],
442
   "outputs": [],
443
   "execution_count": null
444
  },
445
  {
446
   "cell_type": "code",
447
   "id": "3d53e74e",
448
   "metadata": {},
449
   "source": [
450
    "source_cols = [\"gender\", \"age\", \"race\", \"marital_status\", \"insurance\"]\n",
    "meta_cols = [\"meta_\" + name for name in source_cols]\n",
    "# Copy each attribute into a meta_-prefixed column of the cohort frame.\n",
    "for name, meta_name in zip(source_cols, meta_cols):\n",
    "    cohort[meta_name] = cohort[name]"
454
   ],
455
   "outputs": [],
456
   "execution_count": null
457
  },
458
  {
459
   "cell_type": "code",
460
   "id": "1cf76546",
461
   "metadata": {},
462
   "source": [
463
    "cohort[\"timestamp\"] = 0\n",
464
    "cohort[\"timestamp_avail\"] = 0"
465
   ],
466
   "outputs": [],
467
   "execution_count": null
468
  },
469
  {
470
   "cell_type": "code",
471
   "id": "88c506ec",
472
   "metadata": {},
473
   "source": [
474
    "print(generate_event_value(cohort.iloc[5]))"
475
   ],
476
   "outputs": [],
477
   "execution_count": null
478
  },
479
  {
480
   "cell_type": "code",
481
   "id": "eba10521",
482
   "metadata": {},
483
   "source": [
484
    "print(generate_event_value(cohort.iloc[520]))"
485
   ],
486
   "outputs": [],
487
   "execution_count": null
488
  },
489
  {
490
   "cell_type": "code",
491
   "id": "5d832292",
492
   "metadata": {},
493
   "source": [
494
    "cohort[\"event_type\"] = event_type\n",
495
    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
496
   ],
497
   "outputs": [],
498
   "execution_count": null
499
  },
500
  {
501
   "cell_type": "code",
502
   "id": "3c46789d",
503
   "metadata": {},
504
   "source": [
505
    "cohort[cohort.hadm_id == 29079034]"
506
   ],
507
   "outputs": [],
508
   "execution_count": null
509
  },
510
  {
511
   "cell_type": "code",
512
   "id": "bee6b1f6",
513
   "metadata": {},
514
   "source": [
515
    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
516
   ],
517
   "outputs": [],
518
   "execution_count": null
519
  },
520
  {
521
   "cell_type": "code",
522
   "id": "56e27ab3",
523
   "metadata": {},
524
   "source": [
525
    "# -f keeps the cleanup quiet when the directory does not exist yet (first run).\n",
    "!rm -rf {output_path}/'event_{event_type}'"
526
   ],
527
   "outputs": [],
528
   "execution_count": null
529
  },
530
  {
531
   "cell_type": "code",
532
   "id": "30433d47",
533
   "metadata": {},
534
   "source": [
535
    "create_directory(f\"{output_path}/event_{event_type}\")"
536
   ],
537
   "outputs": [],
538
   "execution_count": null
539
  },
540
  {
541
   "cell_type": "code",
542
   "id": "0ad33569",
543
   "metadata": {},
544
   "source": [
545
    "groups = cohort.groupby(\"hadm_id\")\n",
    "export_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    # Keep every future so a failed write is not silently discarded.\n",
    "    futures = [\n",
    "        executor.submit(save_group, group_df[export_cols], hadm_id, event_type)\n",
    "        for hadm_id, group_df in tqdm(groups, total=groups.ngroups)\n",
    "    ]\n",
    "    # result() re-raises any exception raised inside save_group.\n",
    "    for future in futures:\n",
    "        future.result()"
555
   ],
556
   "outputs": [],
557
   "execution_count": null
558
  },
559
  {
560
   "cell_type": "code",
561
   "id": "43e3f09a",
562
   "metadata": {},
563
   "source": [
564
    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
565
   ],
566
   "outputs": [],
567
   "execution_count": null
568
  },
569
  {
570
   "cell_type": "code",
571
   "id": "5ec6d393",
572
   "metadata": {},
573
   "source": "event_type = \"admission_info\"",
574
   "outputs": [],
575
   "execution_count": null
576
  },
577
  {
578
   "cell_type": "code",
579
   "id": "73f5aa52",
580
   "metadata": {},
581
   "source": [
582
    "def generate_event_value(x):\n",
    "    \"\"\"Format one cohort row as the admission_info event text.\n",
    "\n",
    "    Admission type and location are always included; the chief complaint is\n",
    "    appended only when it is not missing.\n",
    "    \"\"\"\n",
    "    # NOTE: this replaces the earlier generate_event_value defined for\n",
    "    # patient_demographics in the same notebook.\n",
    "    text = f\"type: {x.admission_type}, location: {x.admission_location}\"\n",
    "    if pd.isna(x.chief_complaint):\n",
    "        return text\n",
    "    return text + f\", chief complaint: {x.chief_complaint}\""
587
   ],
588
   "outputs": [],
589
   "execution_count": null
590
  },
591
  {
592
   "cell_type": "code",
593
   "id": "8c1d9ea5",
594
   "metadata": {},
595
   "source": [
596
    "source_cols = [\"admission_type\", \"admission_location\", \"chief_complaint\"]\n",
    "meta_cols = [\"meta_\" + name for name in source_cols]\n",
    "# Copy each attribute into a meta_-prefixed column of the cohort frame.\n",
    "for name, meta_name in zip(source_cols, meta_cols):\n",
    "    cohort[meta_name] = cohort[name]"
600
   ],
601
   "outputs": [],
602
   "execution_count": null
603
  },
604
  {
605
   "cell_type": "code",
606
   "id": "814a4d14",
607
   "metadata": {},
608
   "source": [
609
    "print(generate_event_value(cohort.iloc[5]))"
610
   ],
611
   "outputs": [],
612
   "execution_count": null
613
  },
614
  {
615
   "cell_type": "code",
616
   "id": "a3041145",
617
   "metadata": {},
618
   "source": [
619
    "print(generate_event_value(cohort.iloc[520]))"
620
   ],
621
   "outputs": [],
622
   "execution_count": null
623
  },
624
  {
625
   "cell_type": "code",
626
   "id": "cf03657c",
627
   "metadata": {},
628
   "source": [
629
    "cohort[\"event_type\"] = event_type\n",
630
    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
631
   ],
632
   "outputs": [],
633
   "execution_count": null
634
  },
635
  {
636
   "cell_type": "code",
637
   "id": "3eeaa8ae",
638
   "metadata": {},
639
   "source": [
640
    "cohort[cohort.hadm_id == 29079034]"
641
   ],
642
   "outputs": [],
643
   "execution_count": null
644
  },
645
  {
646
   "cell_type": "code",
647
   "id": "6af53072",
648
   "metadata": {},
649
   "source": [
650
    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
651
   ],
652
   "outputs": [],
653
   "execution_count": null
654
  },
655
  {
656
   "cell_type": "code",
657
   "id": "2aa672f8",
658
   "metadata": {},
659
   "source": [
660
    "# -f keeps the cleanup quiet when the directory does not exist yet (first run).\n",
    "!rm -rf {output_path}/'event_{event_type}'"
661
   ],
662
   "outputs": [],
663
   "execution_count": null
664
  },
665
  {
666
   "cell_type": "code",
667
   "id": "161a75a7",
668
   "metadata": {},
669
   "source": [
670
    "create_directory(f\"{output_path}/event_{event_type}\")"
671
   ],
672
   "outputs": [],
673
   "execution_count": null
674
  },
675
  {
676
   "cell_type": "code",
677
   "id": "334a9676",
678
   "metadata": {},
679
   "source": [
680
    "groups = cohort.groupby(\"hadm_id\")\n",
    "export_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
    "\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    # Keep every future so a failed write is not silently discarded.\n",
    "    futures = [\n",
    "        executor.submit(save_group, group_df[export_cols], hadm_id, event_type)\n",
    "        for hadm_id, group_df in tqdm(groups, total=groups.ngroups)\n",
    "    ]\n",
    "    # result() re-raises any exception raised inside save_group.\n",
    "    for future in futures:\n",
    "        future.result()"
690
   ],
691
   "outputs": [],
692
   "execution_count": null
693
  },
694
  {
695
   "cell_type": "code",
696
   "id": "318c6395",
697
   "metadata": {},
698
   "source": [
699
    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
700
   ],
701
   "outputs": [],
702
   "execution_count": null
703
  },
704
  {
705
   "cell_type": "code",
706
   "id": "a8e66a9f",
707
   "metadata": {},
708
   "source": [],
709
   "outputs": [],
710
   "execution_count": null
711
  }
712
 ],
713
 "metadata": {
714
  "kernelspec": {
715
   "display_name": "pytorch20",
716
   "language": "python",
717
   "name": "pytorch20"
718
  },
719
  "language_info": {
720
   "codemirror_mode": {
721
    "name": "ipython",
722
    "version": 3
723
   },
724
   "file_extension": ".py",
725
   "mimetype": "text/x-python",
726
   "name": "python",
727
   "nbconvert_exporter": "python",
728
   "pygments_lexer": "ipython3",
729
   "version": "3.9.19"
730
  }
731
 },
732
 "nbformat": 4,
733
 "nbformat_minor": 5
734
}