Switch to unified view

a b/analysis/data_analysis.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "# Dataset Preparation for Prediction of Imminent ICU Admission and Prolonged Stay"
8
   ]
9
  },
10
  {
11
   "cell_type": "markdown",
12
   "metadata": {},
13
   "source": [
14
    "## Imports & Inits"
15
   ]
16
  },
17
  {
18
   "cell_type": "code",
19
   "execution_count": 4,
20
   "metadata": {
21
    "ExecuteTime": {
22
     "end_time": "2019-08-26T18:54:33.858025Z",
23
     "start_time": "2019-08-26T18:54:33.683791Z"
24
    }
25
   },
26
   "outputs": [
27
    {
28
     "name": "stdout",
29
     "output_type": "stream",
30
     "text": [
31
      "The autoreload extension is already loaded. To reload it, use:\n",
32
      "  %reload_ext autoreload\n"
33
     ]
34
    }
35
   ],
36
   "source": [
37
    "%load_ext autoreload\n",
38
    "%autoreload 2"
39
   ]
40
  },
41
  {
42
   "cell_type": "code",
43
   "execution_count": 5,
44
   "metadata": {
45
    "ExecuteTime": {
46
     "end_time": "2019-08-26T18:54:33.919510Z",
47
     "start_time": "2019-08-26T18:54:33.887213Z"
48
    }
49
   },
50
   "outputs": [
51
    {
52
     "data": {
53
      "text/plain": [
54
       "{'workdir': PosixPath('../data/workdir'),\n",
55
       " 'figdir': PosixPath('../data/results/figures'),\n",
56
       " 'resultdir': PosixPath('../data/results'),\n",
57
       " 'dataset_csv': PosixPath('../data/proc_dataset.csv'),\n",
58
       " 'imminent_adm_cols': ['hadm_id', 'processed_note', 'imminent_adm_label'],\n",
59
       " 'prolonged_stay_cols': ['hadm_id', 'processed_note', 'prolonged_stay_label'],\n",
60
       " 'cols': ['hadm_id',\n",
61
       "  'imminent_adm_label',\n",
62
       "  'prolonged_stay_label',\n",
63
       "  'processed_note',\n",
64
       "  'charttime',\n",
65
       "  'intime',\n",
66
       "  'chartinterval'],\n",
67
       " 'dates': ['charttime', 'intime'],\n",
68
       " 'ia_thresh': {'lr': 0.45, 'rf': 0.27, 'gbm': 0.435, 'mlp': 0.2},\n",
69
       " 'ps_thresh': {'lr': 0.39, 'rf': 0.36, 'gbm': 0.324, 'mlp': 0.27}}"
70
      ]
71
     },
72
     "execution_count": 5,
73
     "metadata": {},
74
     "output_type": "execute_result"
75
    }
76
   ],
77
   "source": [
78
    "import sys\n",
79
    "sys.path.append('../')\n",
80
    "\n",
81
    "import math\n",
82
    "import numpy as np\n",
83
    "import pandas as pd\n",
84
    "import spacy\n",
85
    "\n",
86
    "import seaborn as sns\n",
87
    "sns.set(style = 'darkgrid')\n",
88
    "\n",
89
    "import matplotlib.pyplot as plt\n",
90
    "%matplotlib inline\n",
91
    "\n",
92
    "from scipy import stats\n",
93
    "from pathlib import Path\n",
94
    "\n",
95
    "from utils.splits import set_group_splits\n",
96
    "from args import args\n",
97
    "vars(args)"
98
   ]
99
  },
100
  {
101
   "cell_type": "markdown",
102
   "metadata": {
103
    "heading_collapsed": true
104
   },
105
   "source": [
106
    "## Stats"
107
   ]
108
  },
109
  {
110
   "cell_type": "code",
111
   "execution_count": null,
112
   "metadata": {
113
    "ExecuteTime": {
114
     "end_time": "2019-07-17T18:48:22.330446Z",
115
     "start_time": "2019-07-17T18:48:17.056668Z"
116
    },
117
    "hidden": true
118
   },
119
   "outputs": [],
120
   "source": [
121
    "df = pd.read_csv(args.dataset_csv)\n",
122
    "ia_df = df.loc[(df['imminent_adm_label'] != -1)][args.imminent_adm_cols].reset_index(drop=True)\n",
123
    "ps_df = df.loc[(df['chartinterval'] != 0)][args.prolonged_stay_cols].reset_index(drop=True)"
124
   ]
125
  },
126
  {
127
   "cell_type": "code",
128
   "execution_count": null,
129
   "metadata": {
130
    "ExecuteTime": {
131
     "end_time": "2019-07-17T18:48:22.402048Z",
132
     "start_time": "2019-07-17T18:48:22.333813Z"
133
    },
134
    "hidden": true
135
   },
136
   "outputs": [],
137
   "source": [
138
    "df['subject_id'].nunique(), df['hadm_id'].nunique()"
139
   ]
140
  },
141
  {
142
   "cell_type": "code",
143
   "execution_count": null,
144
   "metadata": {
145
    "ExecuteTime": {
146
     "end_time": "2019-07-17T18:48:29.519915Z",
147
     "start_time": "2019-07-17T18:48:29.443575Z"
148
    },
149
    "hidden": true
150
   },
151
   "outputs": [],
152
   "source": [
153
    "ages = df.groupby(['subject_id'])[['admission_age']].first().to_numpy().reshape(-1)\n",
154
    "ages[ages>100] = 100\n",
155
    "print(f\"Median age: {np.median(ages):0.1f}\")\n",
156
    "print(f\"IQR: {np.percentile(ages, 25):0.1f} - {np.percentile(ages, 75):0.1f}\")"
157
   ]
158
  },
159
  {
160
   "cell_type": "code",
161
   "execution_count": null,
162
   "metadata": {
163
    "ExecuteTime": {
164
     "end_time": "2019-07-17T18:48:53.210075Z",
165
     "start_time": "2019-07-17T18:48:53.002656Z"
166
    },
167
    "hidden": true
168
   },
169
   "outputs": [],
170
   "source": [
171
    "df['adm_to_icu_period'].describe().reset_index()"
172
   ]
173
  },
174
  {
175
   "cell_type": "code",
176
   "execution_count": null,
177
   "metadata": {
178
    "ExecuteTime": {
179
     "end_time": "2019-07-17T18:49:21.918238Z",
180
     "start_time": "2019-07-17T18:49:21.653850Z"
181
    },
182
    "hidden": true
183
   },
184
   "outputs": [],
185
   "source": [
186
    "df.groupby(df['admission_type'])['hadm_id'].nunique().reset_index()"
187
   ]
188
  },
189
  {
190
   "cell_type": "code",
191
   "execution_count": null,
192
   "metadata": {
193
    "ExecuteTime": {
194
     "end_time": "2019-07-17T18:49:22.411510Z",
195
     "start_time": "2019-07-17T18:49:22.249868Z"
196
    },
197
    "hidden": true
198
   },
199
   "outputs": [],
200
   "source": [
201
    "df.groupby(df['ethnicity'])['subject_id'].nunique().reset_index()"
202
   ]
203
  },
204
  {
205
   "cell_type": "markdown",
206
   "metadata": {
207
    "hidden": true
208
   },
209
   "source": [
210
    "Make sure the average prevalence of random test sets is approximately the same as the real prevalence"
211
   ]
212
  },
213
  {
214
   "cell_type": "code",
215
   "execution_count": null,
216
   "metadata": {
217
    "ExecuteTime": {
218
     "end_time": "2019-07-17T18:27:25.090684Z",
219
     "start_time": "2019-07-17T18:27:05.095345Z"
220
    },
221
    "hidden": true
222
   },
223
   "outputs": [],
224
   "source": [
225
    "ia_p = []\n",
226
    "ps_p = []\n",
227
    "\n",
228
    "for seed in range(127, 227):\n",
229
    "  sdf = set_group_splits(ia_df.copy(), group_col='hadm_id', seed=seed)\n",
230
    "  test_size = len(sdf.loc[(sdf['split'] == 'test')])\n",
231
    "  test_pos = len(sdf.loc[(sdf['split'] == 'test') & (sdf['imminent_adm_label'] == 1)])\n",
232
    "  ia_p.append(test_pos/test_size)  \n",
233
    "  \n",
234
    "  sdf = set_group_splits(ps_df.copy(), group_col='hadm_id', seed=seed)\n",
235
    "  test_size = len(sdf.loc[(sdf['split'] == 'test')])\n",
236
    "  test_pos = len(sdf.loc[(sdf['split'] == 'test') & (sdf['prolonged_stay_label'] == 1)])\n",
237
    "  ps_p.append(test_pos/test_size)  \n",
238
    "  \n",
239
    "\n",
240
    "ia_p = np.array(ia_p)\n",
241
    "ps_p = np.array(ps_p)\n",
242
    "\n",
243
    "print(f\"Prevalence of Imminent Admission: {(len(ia_df.loc[ia_df['imminent_adm_label'] == 1])/len(ia_df)):0.3f}\")\n",
244
    "print(f\"Average of test set = {(ia_p.mean()):0.3f}, std = {(ia_p.std()):0.3f}\")\n",
245
    "print(f\"Prevalence of Prolonged Stay: {(len(ps_df.loc[ps_df['prolonged_stay_label'] == 1])/len(ps_df)):0.3f}\")\n",
246
    "print(f\"Average of test set = {(ps_p.mean()):0.3f}, std = {(ps_p.std()):0.3f}\")"
247
   ]
248
  },
249
  {
250
   "cell_type": "code",
251
   "execution_count": null,
252
   "metadata": {
253
    "ExecuteTime": {
254
     "end_time": "2019-07-17T18:49:26.743454Z",
255
     "start_time": "2019-07-17T18:49:26.652037Z"
256
    },
257
    "hidden": true
258
   },
259
   "outputs": [],
260
   "source": [
261
    "print(f\"Average number of notes per admission for imminent admission: {ia_df.groupby('hadm_id').size().mean():0.2f}\")\n",
262
    "print(f\"Average number of notes per admission for prolonged stay (and entire dataset): {ps_df.groupby('hadm_id').size().mean():0.2f}\")"
263
   ]
264
  },
265
  {
266
   "cell_type": "code",
267
   "execution_count": null,
268
   "metadata": {
269
    "ExecuteTime": {
270
     "end_time": "2019-07-17T18:49:48.377622Z",
271
     "start_time": "2019-07-17T18:49:48.020027Z"
272
    },
273
    "hidden": true
274
   },
275
   "outputs": [],
276
   "source": [
277
    "df.groupby(df['deathtime'].apply(lambda x: True if pd.notnull(x) else False))['subject_id'].nunique().reset_index()"
278
   ]
279
  },
280
  {
281
   "cell_type": "code",
282
   "execution_count": null,
283
   "metadata": {
284
    "ExecuteTime": {
285
     "end_time": "2019-07-17T18:49:49.030699Z",
286
     "start_time": "2019-07-17T18:49:48.901279Z"
287
    },
288
    "hidden": true
289
   },
290
   "outputs": [],
291
   "source": [
292
    "df.groupby(df['gender'])['subject_id'].nunique().reset_index()"
293
   ]
294
  },
295
  {
296
   "cell_type": "markdown",
297
   "metadata": {
298
    "hidden": true
299
   },
300
   "source": [
301
    "Distribution of notes by category"
302
   ]
303
  },
304
  {
305
   "cell_type": "code",
306
   "execution_count": null,
307
   "metadata": {
308
    "ExecuteTime": {
309
     "end_time": "2019-07-17T18:49:51.015549Z",
310
     "start_time": "2019-07-17T18:49:50.883550Z"
311
    },
312
    "hidden": true
313
   },
314
   "outputs": [],
315
   "source": [
316
    "df.groupby(df['category']).size().reset_index()"
317
   ]
318
  },
319
  {
320
   "cell_type": "markdown",
321
   "metadata": {
322
    "hidden": true
323
   },
324
   "source": [
325
    "Distribution of notes by category for imminent admissions and delayed admissions"
326
   ]
327
  },
328
  {
329
   "cell_type": "code",
330
   "execution_count": null,
331
   "metadata": {
332
    "ExecuteTime": {
333
     "end_time": "2019-07-17T18:40:35.638983Z",
334
     "start_time": "2019-07-17T18:40:35.428118Z"
335
    },
336
    "hidden": true
337
   },
338
   "outputs": [],
339
   "source": [
340
    "df.loc[(df['imminent_adm_label'] == 1)].groupby('category').size().reset_index()"
341
   ]
342
  },
343
  {
344
   "cell_type": "code",
345
   "execution_count": null,
346
   "metadata": {
347
    "ExecuteTime": {
348
     "end_time": "2019-07-17T18:42:29.970948Z",
349
     "start_time": "2019-07-17T18:42:29.704004Z"
350
    },
351
    "hidden": true
352
   },
353
   "outputs": [],
354
   "source": [
355
    "df.loc[(df['imminent_adm_label'] == 0)].groupby('category').size().reset_index()"
356
   ]
357
  },
358
  {
359
   "cell_type": "markdown",
360
   "metadata": {
361
    "hidden": true
362
   },
363
   "source": [
364
    "Distribution of notes by category for prolonged stay and short stay"
365
   ]
366
  },
367
  {
368
   "cell_type": "code",
369
   "execution_count": null,
370
   "metadata": {
371
    "ExecuteTime": {
372
     "end_time": "2019-07-17T18:42:30.199045Z",
373
     "start_time": "2019-07-17T18:42:29.974531Z"
374
    },
375
    "hidden": true
376
   },
377
   "outputs": [],
378
   "source": [
379
    "df.loc[(df['prolonged_stay_label'] == 1)].groupby('category').size().reset_index()"
380
   ]
381
  },
382
  {
383
   "cell_type": "code",
384
   "execution_count": null,
385
   "metadata": {
386
    "ExecuteTime": {
387
     "end_time": "2019-07-17T18:42:30.334632Z",
388
     "start_time": "2019-07-17T18:42:30.202847Z"
389
    },
390
    "hidden": true
391
   },
392
   "outputs": [],
393
   "source": [
394
    "df.loc[(df['prolonged_stay_label'] == 0)].groupby('category').size().reset_index()"
395
   ]
396
  },
397
  {
398
   "cell_type": "code",
399
   "execution_count": null,
400
   "metadata": {
401
    "ExecuteTime": {
402
     "end_time": "2019-07-17T18:49:58.769470Z",
403
     "start_time": "2019-07-17T18:49:58.658678Z"
404
    },
405
    "hidden": true
406
   },
407
   "outputs": [],
408
   "source": [
409
    "df['icu_los'].describe().reset_index()"
410
   ]
411
  },
412
  {
413
   "cell_type": "code",
414
   "execution_count": null,
415
   "metadata": {
416
    "ExecuteTime": {
417
     "end_time": "2019-07-17T18:52:01.571726Z",
418
     "start_time": "2019-07-17T18:52:01.262084Z"
419
    },
420
    "hidden": true
421
   },
422
   "outputs": [],
423
   "source": [
424
    "df['note'].apply(len).describe().reset_index()"
425
   ]
426
  },
427
  {
428
   "cell_type": "code",
429
   "execution_count": null,
430
   "metadata": {
431
    "ExecuteTime": {
432
     "end_time": "2019-07-17T18:42:23.454748Z",
433
     "start_time": "2019-07-17T18:42:23.043855Z"
434
    },
435
    "hidden": true
436
   },
437
   "outputs": [],
438
   "source": [
439
    "df['charttime_to_icu_period'].describe().reset_index()"
440
   ]
441
  },
442
  {
443
   "cell_type": "markdown",
444
   "metadata": {},
445
   "source": [
446
    "## Plots"
447
   ]
448
  },
449
  {
450
   "cell_type": "code",
451
   "execution_count": 7,
452
   "metadata": {
453
    "ExecuteTime": {
454
     "end_time": "2019-08-26T18:54:53.518650Z",
455
     "start_time": "2019-08-26T18:54:50.602313Z"
456
    }
457
   },
458
   "outputs": [
459
    {
460
     "data": {
461
      "text/plain": [
462
       "Index(['subject_id', 'hadm_id', 'icustay_id', 'admission_type', 'admittime',\n",
463
       "       'dischtime', 'intime', 'outtime', 'charttime', 'icu_los', 'deathtime',\n",
464
       "       'adm_to_icu_period', 'charttime_to_icu_period', 'chartinterval',\n",
465
       "       'ethnicity', 'dob', 'gender', 'admission_age', 'category',\n",
466
       "       'imminent_adm_label', 'prolonged_stay_label', 'note', 'processed_note'],\n",
467
       "      dtype='object')"
468
      ]
469
     },
470
     "execution_count": 7,
471
     "metadata": {},
472
     "output_type": "execute_result"
473
    }
474
   ],
475
   "source": [
476
    "df = pd.read_csv(args.dataset_csv)\n",
477
    "df.columns"
478
   ]
479
  },
480
  {
481
   "cell_type": "code",
482
   "execution_count": 11,
483
   "metadata": {
484
    "ExecuteTime": {
485
     "end_time": "2019-08-26T18:56:35.688519Z",
486
     "start_time": "2019-08-26T18:56:35.448942Z"
487
    }
488
   },
489
   "outputs": [
490
    {
491
     "name": "stdout",
492
     "output_type": "stream",
493
     "text": [
494
      "Radiology\n",
495
      "VEN DUP EXTEXT BIL (MAP/DVT)\n",
496
      "[**2162-5-17**] 8:12 AM\n",
497
      " [**Last Name (un) 1296**] DUP EXTEXT BIL (MAP/DVT)                                    Clip # [**Clip Number (Radiology) 18833**]\n",
498
      " Reason: eval for vein harvesting*****OR second case on [**2162-5-17**]******\n",
499
      " Admitting Diagnosis: CORONARY ARTERY DISEASE\n",
500
      " ______________________________________________________________________________\n",
501
      " [**Hospital 2**] MEDICAL CONDITION:\n",
502
      "  60 year old man pre-op for CABG\n",
503
      " REASON FOR THIS EXAMINATION:\n",
504
      "  eval for vein harvesting*****OR second case on [**2162-5-17**]********\n",
505
      " ______________________________________________________________________________\n",
506
      "                                 FINAL REPORT\n",
507
      " HISTORY:  A 60-year-old gentleman, preop for CABG.  Search for conduit.\n",
508
      "\n",
509
      " TECHNIQUE:  Venous mapping of the superficial veins in the lower extremities\n",
510
      " was performed with [**Doctor Last Name 37**]-scale and Doppler ultrasound.\n",
511
      "\n",
512
      " FINDINGS:  Right great saphenous vein is patent and compressible with\n",
513
      " diameters ranging between 0.15 and 0.21 cm.  The right small saphenous vein is\n",
514
      " patent and compressible with diameters ranging between 0.14 and 0.20 cm.\n",
515
      "\n",
516
      " The left great saphenous vein presented with diameters ranging between 0.06\n",
517
      " and 0.49 cm.  It was not visualized below the calf.  The left small saphenous\n",
518
      " vein presented thick walled and calcified.\n",
519
      "\n",
520
      " COMPARISON:  None available.\n",
521
      "\n",
522
      " IMPRESSION:  Patent right great and small saphenous veins, with diameters\n",
523
      " described above.  Left great saphenous vein with small diameters below the mid\n",
524
      " thigh and not visualized below the calf.  The left small saphenous vein\n",
525
      " presented with thick walls and calcifications.\n",
526
      "\n",
527
      "\n",
528
      "\n"
529
     ]
530
    }
531
   ],
532
   "source": [
533
    "print(df.iloc[0]['note'])"
534
   ]
535
  },
536
  {
537
   "cell_type": "code",
538
   "execution_count": 12,
539
   "metadata": {
540
    "ExecuteTime": {
541
     "end_time": "2019-08-26T18:56:41.114437Z",
542
     "start_time": "2019-08-26T18:56:41.086224Z"
543
    }
544
   },
545
   "outputs": [
546
    {
547
     "name": "stdout",
548
     "output_type": "stream",
549
     "text": [
550
      "Radiology \n",
551
      " VEN DUP EXTEXT BIL ( MAP/DVT ) \n",
552
      " [ * * 2162 - 5 - 17 * * ] 8:12 AM \n",
553
      "  [ * * Last Name ( un ) 1296 * * ] DUP EXTEXT BIL ( MAP/DVT )                                     Clip # [ * * Clip Number ( Radiology ) 18833 * * ] \n",
554
      "  Reason : eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * \n",
555
      "  Admitting Diagnosis : CORONARY ARTERY DISEASE \n",
556
      "  _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
557
      "  [ * * Hospital 2 * * ] MEDICAL CONDITION : \n",
558
      "   60 year old man pre-op for CABG \n",
559
      "  REASON FOR THIS EXAMINATION : \n",
560
      "   eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * * * \n",
561
      "  _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
562
      "                                  FINAL REPORT \n",
563
      "  HISTORY :   A 60-year-old gentleman , preop for CABG .   Search for conduit . \n",
564
      "\n",
565
      "  TECHNIQUE :   Venous mapping of the superficial veins in the lower extremities \n",
566
      "  was performed with [ * * Doctor Last Name 37**]-scale and Doppler ultrasound . \n",
567
      "\n",
568
      "  FINDINGS :   Right great saphenous vein is patent and compressible with \n",
569
      "  diameters ranging between 0.15 and 0.21 cm .   The right small saphenous vein is \n",
570
      "  patent and compressible with diameters ranging between 0.14 and 0.20 cm . \n",
571
      "\n",
572
      "  The left great saphenous vein presented with diameters ranging between 0.06 \n",
573
      "  and 0.49 cm .   It was not visualized below the calf .   The left small saphenous \n",
574
      "  vein presented thick walled and calcified . \n",
575
      "\n",
576
      "  COMPARISON :   None available . \n",
577
      "\n",
578
      "  IMPRESSION :   Patent right great and small saphenous veins , with diameters \n",
579
      "  described above .   Left great saphenous vein with small diameters below the mid \n",
580
      "  thigh and not visualized below the calf .   The left small saphenous vein \n",
581
      "  presented with thick walls and calcifications . \n",
582
      "\n",
583
      "\n",
584
      "\n"
585
     ]
586
    }
587
   ],
588
   "source": [
589
    "print(df.iloc[0]['processed_note'])"
590
   ]
591
  },
592
  {
593
   "cell_type": "code",
594
   "execution_count": null,
595
   "metadata": {},
596
   "outputs": [],
597
   "source": []
598
  },
599
  {
600
   "cell_type": "code",
601
   "execution_count": null,
602
   "metadata": {
603
    "ExecuteTime": {
604
     "end_time": "2019-07-17T18:42:36.032162Z",
605
     "start_time": "2019-07-17T18:42:36.008573Z"
606
    }
607
   },
608
   "outputs": [],
609
   "source": [
610
    "intervals = ['-1 ≤ t ≤ 0']\n",
611
    "intervals += [f'-{i+1} ≤ t ≤ -{i}' for i in range(1, 15)]\n",
612
    "intervals.append('t ≤ -15')"
613
   ]
614
  },
615
  {
616
   "cell_type": "markdown",
617
   "metadata": {},
618
   "source": [
619
    "### Bar Plot of Notes Over Days"
620
   ]
621
  },
622
  {
623
   "cell_type": "markdown",
624
   "metadata": {},
625
   "source": [
626
    "#### All Notes"
627
   ]
628
  },
629
  {
630
   "cell_type": "code",
631
   "execution_count": null,
632
   "metadata": {
633
    "ExecuteTime": {
634
     "end_time": "2019-07-17T18:42:38.585605Z",
635
     "start_time": "2019-07-17T18:42:38.475792Z"
636
    }
637
   },
638
   "outputs": [],
639
   "source": [
640
    "plot_df = pd.DataFrame(df.groupby(['chartinterval']).size(), columns=['n_notes'])\n",
641
    "plot_df.reset_index(inplace=True, drop=True)\n",
642
    "plot_df['days'] = intervals"
643
   ]
644
  },
645
  {
646
   "cell_type": "code",
647
   "execution_count": null,
648
   "metadata": {
649
    "ExecuteTime": {
650
     "end_time": "2019-07-17T18:42:41.669474Z",
651
     "start_time": "2019-07-17T18:42:40.376671Z"
652
    }
653
   },
654
   "outputs": [],
655
   "source": [
656
    "fig, ax = plt.subplots(figsize=(15, 8))\n",
657
    "sns.barplot(x='days', y='n_notes', data=plot_df, ax=ax)\n",
658
    "ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha='right')\n",
659
    "ax.set_xlabel('Time to ICU Admission (days)')\n",
660
    "ax.set_ylabel('# notes')\n",
661
    "for index, row in plot_df.iterrows():\n",
662
    "    ax.text(index, row['n_notes'], str(row['n_notes']), color='black', ha='center', va='bottom')"
663
   ]
664
  },
665
  {
666
   "cell_type": "code",
667
   "execution_count": null,
668
   "metadata": {
669
    "ExecuteTime": {
670
     "end_time": "2019-06-27T00:38:13.014421Z",
671
     "start_time": "2019-06-27T00:38:12.991010Z"
672
    }
673
   },
674
   "outputs": [],
675
   "source": [
676
    "# fig.savefig(args.figdir/'note_bp.tif', dpi=300)"
677
   ]
678
  },
679
  {
680
   "cell_type": "markdown",
681
   "metadata": {},
682
   "source": [
683
    "#### By Category"
684
   ]
685
  },
686
  {
687
   "cell_type": "code",
688
   "execution_count": null,
689
   "metadata": {
690
    "ExecuteTime": {
691
     "end_time": "2019-07-17T18:43:10.933200Z",
692
     "start_time": "2019-07-17T18:43:10.805412Z"
693
    }
694
   },
695
   "outputs": [],
696
   "source": [
697
    "def plot_intervals(ax, df, cat):\n",
698
    "  sns.barplot(x='days', y='n_notes', data=df, ax=ax)\n",
699
    "  ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha='right')\n",
700
    "  ax.set_xlabel('')\n",
701
    "  ax.set_ylabel('')\n",
702
    "  ax.set_title(f\"Note Category: {cat}\\n# notes: {df['n_notes'].sum()}\")   \n",
703
    "\n",
704
    "  for index, (_, row) in enumerate(df.iterrows()):\n",
705
    "      ax.text(index, row['n_notes'], str(row['n_notes']), color='black', ha='center', va='bottom')    "
706
   ]
707
  },
708
  {
709
   "cell_type": "code",
710
   "execution_count": null,
711
   "metadata": {
712
    "ExecuteTime": {
713
     "end_time": "2019-07-17T18:43:12.728434Z",
714
     "start_time": "2019-07-17T18:43:12.610095Z"
715
    }
716
   },
717
   "outputs": [],
718
   "source": [
719
    "plot_df = pd.DataFrame(df.groupby(['category', 'chartinterval']).size(), columns=['n_notes'])\n",
720
    "plot_df.reset_index(inplace=True)\n",
721
    "plot_df['days'] = plot_df['chartinterval'].apply(lambda x: intervals[x])\n",
722
    "plot_df.drop(['chartinterval'], inplace=True, axis=1)"
723
   ]
724
  },
725
  {
726
   "cell_type": "code",
727
   "execution_count": null,
728
   "metadata": {
729
    "ExecuteTime": {
730
     "end_time": "2019-07-17T18:43:26.011143Z",
731
     "start_time": "2019-07-17T18:43:15.024678Z"
732
    },
733
    "scrolled": false
734
   },
735
   "outputs": [],
736
   "source": [
737
    "fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
738
    "plot_intervals(ax[0][0], plot_df.loc[plot_df['category'] == 'Case Management ', ['n_notes', 'days']], 'Case Management')\n",
739
    "plot_intervals(ax[0][1], plot_df.loc[plot_df['category'] == 'Consult', ['n_notes', 'days']], 'Consult')\n",
740
    "\n",
741
    "plot_intervals(ax[1][0], plot_df.loc[plot_df['category'] == 'General', ['n_notes', 'days']], 'General')\n",
742
    "plot_intervals(ax[1][1], plot_df.loc[plot_df['category'] == 'Nursing', ['n_notes', 'days']], 'Nursing')\n",
743
    "\n",
744
    "plot_intervals(ax[2][0], plot_df.loc[plot_df['category'] == 'Nursing/other', ['n_notes', 'days']], 'Nursing/other')\n",
745
    "plot_intervals(ax[2][1], plot_df.loc[plot_df['category'] == 'Nutrition', ['n_notes', 'days']], 'Nutrition')\n",
746
    "\n",
747
    "plot_intervals(ax[3][0], plot_df.loc[plot_df['category'] == 'Pharmacy', ['n_notes', 'days']], 'Pharmacy')\n",
748
    "plot_intervals(ax[3][1], plot_df.loc[plot_df['category'] == 'Physician ', ['n_notes', 'days',]], 'Physician')\n",
749
    "\n",
750
    "plot_intervals(ax[4][0], plot_df.loc[plot_df['category'] == 'Radiology', ['n_notes', 'days']], 'Radiology')\n",
751
    "plot_intervals(ax[4][1], plot_df.loc[plot_df['category'] == 'Rehab Services', ['n_notes', 'days']], 'Rehab Services')\n",
752
    "\n",
753
    "plot_intervals(ax[5][0], plot_df.loc[plot_df['category'] == 'Respiratory ', ['n_notes', 'days']], 'Respiratory')\n",
754
    "plot_intervals(ax[5][1], plot_df.loc[plot_df['category'] == 'Social Work', ['n_notes', 'days']], 'Social Work')\n",
755
    "\n",
756
    "fig.text(0.5, 0.1, 'Time to ICU Admission (days)', ha='center')\n",
757
    "fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
758
    "\n",
759
    "plt.subplots_adjust(hspace = 0.3)"
760
   ]
761
  },
762
  {
763
   "cell_type": "code",
764
   "execution_count": null,
765
   "metadata": {
766
    "ExecuteTime": {
767
     "end_time": "2019-06-27T00:42:13.420913Z",
768
     "start_time": "2019-06-27T00:42:13.395654Z"
769
    }
770
   },
771
   "outputs": [],
772
   "source": [
773
    "# cats = sorted(list(df['category'].unique()))\n",
774
    "\n",
775
    "# n = 0\n",
776
    "# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
777
    "# plot_intervals(ax, plot_df.loc[plot_df['category'] == cats[n], ['n_notes', 'days']], cats[n])\n",
778
    "# ax.set_xlabel('Time to ICU Admission (days)')\n",
779
    "# ax.set_ylabel('# notes')"
780
   ]
781
  },
782
  {
783
   "cell_type": "code",
784
   "execution_count": null,
785
   "metadata": {
786
    "ExecuteTime": {
787
     "end_time": "2019-06-26T19:55:34.228896Z",
788
     "start_time": "2019-06-26T19:55:34.204962Z"
789
    },
790
    "scrolled": false
791
   },
792
   "outputs": [],
793
   "source": [
794
    "# fig.savefig(args.figdir/'note_cats_bp.tif', dpi=300)"
795
   ]
796
  },
797
  {
798
   "cell_type": "markdown",
799
   "metadata": {},
800
   "source": [
801
    "### Note Chart Time to ICU Admission Period Histogram"
802
   ]
803
  },
804
  {
805
   "cell_type": "markdown",
806
   "metadata": {},
807
   "source": [
808
    "#### All Notes"
809
   ]
810
  },
811
  {
812
   "cell_type": "code",
813
   "execution_count": null,
814
   "metadata": {
815
    "ExecuteTime": {
816
     "end_time": "2019-07-17T18:43:56.456943Z",
817
     "start_time": "2019-07-17T18:43:56.330361Z"
818
    }
819
   },
820
   "outputs": [],
821
   "source": [
822
    "plot_df = df[['category', 'charttime_to_icu_period']]"
823
   ]
824
  },
825
  {
826
   "cell_type": "code",
827
   "execution_count": null,
828
   "metadata": {
829
    "ExecuteTime": {
830
     "end_time": "2019-07-17T18:44:05.284861Z",
831
     "start_time": "2019-07-17T18:44:03.949948Z"
832
    }
833
   },
834
   "outputs": [],
835
   "source": [
836
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
837
    "\n",
838
    "sns.distplot(plot_df['charttime_to_icu_period'], kde=False, ax=ax, bins=80)\n",
839
    "ax.set_xlabel('Period between Note Chart Time and ICU Admission Time (days)')\n",
840
    "ax.set_ylabel('# notes')\n",
841
    "ax.set_xlim(0, 60)\n",
842
    "\n",
843
    "# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['chart_icu_period'].min()}, Avg: {mdf['chart_icu_period'].mean(): 0.2f}, Max: {mdf['chart_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom')"
844
   ]
845
  },
846
  {
847
   "cell_type": "code",
848
   "execution_count": null,
849
   "metadata": {
850
    "ExecuteTime": {
851
     "end_time": "2019-06-26T19:55:34.712151Z",
852
     "start_time": "2019-06-26T19:55:34.686551Z"
853
    }
854
   },
855
   "outputs": [],
856
   "source": [
857
    "# fig.savefig(args.figdir/'note_icu_period_hist.tif', dpi=300)"
858
   ]
859
  },
860
  {
861
   "cell_type": "markdown",
862
   "metadata": {},
863
   "source": [
864
    "#### By Category"
865
   ]
866
  },
867
  {
868
   "cell_type": "code",
869
   "execution_count": null,
870
   "metadata": {
871
    "ExecuteTime": {
872
     "end_time": "2019-07-17T18:44:13.676427Z",
873
     "start_time": "2019-07-17T18:44:13.571725Z"
874
    }
875
   },
876
   "outputs": [],
877
   "source": [
878
    "def plot_period(ax, df, cat):\n",
879
    "  sns.distplot(df, kde=False, ax=ax, bins=10)\n",
880
    "  ax.set_xlabel('')\n",
881
    "  ax.set_ylabel('')\n",
882
    "  ax.set_title(f\"Note Category: {cat}\")   "
883
   ]
884
  },
885
  {
886
   "cell_type": "code",
887
   "execution_count": null,
888
   "metadata": {
889
    "ExecuteTime": {
890
     "end_time": "2019-07-17T18:45:21.689010Z",
891
     "start_time": "2019-07-17T18:45:12.353337Z"
892
    },
893
    "scrolled": false
894
   },
895
   "outputs": [],
896
   "source": [
897
    "fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
898
    "plot_period(ax[0][0], plot_df.loc[plot_df['category'] == 'Case Management ', ['charttime_to_icu_period']], 'Case Management')\n",
899
    "plot_period(ax[0][1], plot_df.loc[plot_df['category'] == 'Consult', ['charttime_to_icu_period']], 'Consult')\n",
900
    "\n",
901
    "plot_period(ax[1][0], plot_df.loc[plot_df['category'] == 'General', ['charttime_to_icu_period']], 'General')\n",
902
    "plot_period(ax[1][1], plot_df.loc[plot_df['category'] == 'Nursing', ['charttime_to_icu_period']], 'Nursing')\n",
903
    "\n",
904
    "plot_period(ax[2][0], plot_df.loc[plot_df['category'] == 'Nursing/other', ['charttime_to_icu_period']], 'Nursing/other')\n",
905
    "plot_period(ax[2][1], plot_df.loc[plot_df['category'] == 'Nutrition', ['charttime_to_icu_period']], 'Nutrition')\n",
906
    "\n",
907
    "plot_period(ax[3][0], plot_df.loc[plot_df['category'] == 'Pharmacy', ['charttime_to_icu_period']], 'Pharmacy')\n",
908
    "plot_period(ax[3][1], plot_df.loc[plot_df['category'] == 'Physician ', ['charttime_to_icu_period',]], 'Physician')\n",
909
    "\n",
910
    "plot_period(ax[4][0], plot_df.loc[plot_df['category'] == 'Radiology', ['charttime_to_icu_period']], 'Radiology')\n",
911
    "plot_period(ax[4][1], plot_df.loc[plot_df['category'] == 'Rehab Services', ['charttime_to_icu_period']], 'Rehab Services')\n",
912
    "\n",
913
    "plot_period(ax[5][0], plot_df.loc[plot_df['category'] == 'Respiratory ', ['charttime_to_icu_period']], 'Respiratory')\n",
914
    "plot_period(ax[5][1], plot_df.loc[plot_df['category'] == 'Social Work', ['charttime_to_icu_period']], 'Social Work')\n",
915
    "\n",
916
    "fig.text(0.5, 0.11, 'Period between Note Chart Time and ICU Admission Time (days)', ha='center')\n",
917
    "fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
918
    "\n",
919
    "plt.subplots_adjust(hspace = 0.1)"
920
   ]
921
  },
922
  {
923
   "cell_type": "code",
924
   "execution_count": null,
925
   "metadata": {
926
    "ExecuteTime": {
927
     "end_time": "2019-06-27T00:43:24.745337Z",
928
     "start_time": "2019-06-27T00:43:24.720208Z"
929
    }
930
   },
931
   "outputs": [],
932
   "source": [
933
    "# cats = sorted(list(df['category'].unique()))\n",
934
    "\n",
935
    "# n = 0\n",
936
    "# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
937
    "# plot_period(ax, plot_df.loc[plot_df['category'] == cats[n], ['chart_icu_period']], cats[n])\n",
938
    "# ax.set_xlabel('Time to ICU Admission (days)')\n",
939
    "# ax.set_ylabel('# notes')"
940
   ]
941
  },
942
  {
943
   "cell_type": "code",
944
   "execution_count": null,
945
   "metadata": {
946
    "ExecuteTime": {
947
     "end_time": "2019-06-26T19:35:38.476961Z",
948
     "start_time": "2019-06-26T19:35:38.451886Z"
949
    },
950
    "scrolled": false
951
   },
952
   "outputs": [],
953
   "source": [
954
    "# fig.savefig(args.figdir/'note_cat_icu_period_hist.tif', dpi=300)"
955
   ]
956
  },
957
  {
958
   "cell_type": "markdown",
959
   "metadata": {},
960
   "source": [
961
    "### Hospital Admission to ICU Admission Period Histogram"
962
   ]
963
  },
964
  {
965
   "cell_type": "code",
966
   "execution_count": null,
967
   "metadata": {
968
    "ExecuteTime": {
969
     "end_time": "2019-07-17T18:45:44.547021Z",
970
     "start_time": "2019-07-17T18:45:44.519812Z"
971
    }
972
   },
973
   "outputs": [],
974
   "source": [
975
    "plot_df = df[['adm_to_icu_period']]"
976
   ]
977
  },
978
  {
979
   "cell_type": "code",
980
   "execution_count": null,
981
   "metadata": {
982
    "ExecuteTime": {
983
     "end_time": "2019-07-17T18:45:46.580796Z",
984
     "start_time": "2019-07-17T18:45:45.217784Z"
985
    }
986
   },
987
   "outputs": [],
988
   "source": [
989
    "# Histogram of the values held in plot_df (presumably the 'adm_to_icu_period' column -- verify upstream).\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.distplot(plot_df, bins=80, kde=False, ax=ax)\n",
    "\n",
    "ax.set_ylabel('# notes')\n",
    "ax.set_xlabel('Time between hospital admission and ICU admission (days)')\n",
    "# Restrict the visible x-range to [0, 70].\n",
    "ax.set_xlim(0, 70)\n",
    "# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['adm_icu_period'].min()}, Avg: {mdf['adm_icu_period'].mean(): 0.2f}, Max: {mdf['adm_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom')    "
996
   ]
997
  },
998
  {
999
   "cell_type": "code",
1000
   "execution_count": null,
1001
   "metadata": {},
1002
   "outputs": [],
1003
   "source": [
1004
    "# fig.savefig(args.figdir/'adm_icu_period_hist.tif', dpi=300)"
1005
   ]
1006
  },
1007
  {
1008
   "cell_type": "markdown",
1009
   "metadata": {},
1010
   "source": [
1011
    "### Note Length Histogram"
1012
   ]
1013
  },
1014
  {
1015
   "cell_type": "code",
1016
   "execution_count": null,
1017
   "metadata": {
1018
    "ExecuteTime": {
1019
     "end_time": "2019-07-17T18:45:50.860829Z",
1020
     "start_time": "2019-07-17T18:45:49.137114Z"
1021
    }
1022
   },
1023
   "outputs": [],
1024
   "source": [
1025
    "# Histogram of note lengths: len() is applied to every entry of df['note'].\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.distplot(df['note'].map(len), bins=100, kde=False, ax=ax)\n",
    "ax.set_xlabel('Length of Note (characters)')\n",
    "ax.set_ylabel('# notes')"
1029
   ]
1030
  },
1031
  {
1032
   "cell_type": "code",
1033
   "execution_count": null,
1034
   "metadata": {
1035
    "ExecuteTime": {
1036
     "end_time": "2019-06-26T19:59:38.291139Z",
1037
     "start_time": "2019-06-26T19:59:38.267860Z"
1038
    }
1039
   },
1040
   "outputs": [],
1041
   "source": [
1042
    "# fig.savefig(args.figdir/'note_len_hist.tif', dpi=300)"
1043
   ]
1044
  },
1045
  {
1046
   "cell_type": "markdown",
1047
   "metadata": {},
1048
   "source": [
1049
    "### Imminent ICU Prediction Class Distribution"
1050
   ]
1051
  },
1052
  {
1053
   "cell_type": "code",
1054
   "execution_count": null,
1055
   "metadata": {
1056
    "ExecuteTime": {
1057
     "end_time": "2019-07-17T18:53:53.526861Z",
1058
     "start_time": "2019-07-17T18:53:53.429558Z"
1059
    }
1060
   },
1061
   "outputs": [],
1062
   "source": [
1063
    "desc = ['Unused', 'Delayed Admission', 'Imminent Admission']"
1064
   ]
1065
  },
1066
  {
1067
   "cell_type": "markdown",
1068
   "metadata": {},
1069
   "source": [
1070
    "#### Without Admissions"
1071
   ]
1072
  },
1073
  {
1074
   "cell_type": "code",
1075
   "execution_count": null,
1076
   "metadata": {
1077
    "ExecuteTime": {
1078
     "end_time": "2019-07-17T18:53:54.551036Z",
1079
     "start_time": "2019-07-17T18:53:54.423540Z"
1080
    }
1081
   },
1082
   "outputs": [],
1083
   "source": [
1084
    "# Count rows of df per 'imminent_adm_label' value and swap the raw labels for display names.\n",
    "# desc is applied by position, so it must list the classes in the order groupby emits them\n",
    "# (sorted key order by default) and have exactly one entry per class.\n",
    "plot_df = (\n",
    "    df.groupby('imminent_adm_label').size()\n",
    "    .reset_index(name='n_notes')\n",
    "    .assign(imminent_adm_label=desc)\n",
    "    .reindex([2, 1, 0])  # reverse the row order so the last class is listed first\n",
    "    .reset_index(drop=True)\n",
    ")"
1088
   ]
1089
  },
1090
  {
1091
   "cell_type": "code",
1092
   "execution_count": null,
1093
   "metadata": {
1094
    "ExecuteTime": {
1095
     "end_time": "2019-07-17T18:53:55.840770Z",
1096
     "start_time": "2019-07-17T18:53:54.913513Z"
1097
    }
1098
   },
1099
   "outputs": [],
1100
   "source": [
1101
    "# Bar chart of 'n_notes' per 'imminent_adm_label', with each bar annotated by its count.\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.barplot(data=plot_df, x='imminent_adm_label', y='n_notes', ax=ax)\n",
    "ax.set_xlabel('Imminent Class Label')\n",
    "ax.set_ylabel('# notes')\n",
    "\n",
    "# The row's index label is used as the x coordinate of its annotation (shifted by 0.05),\n",
    "# and the text is lifted 50 units above the bar height.\n",
    "for index, n_notes in plot_df['n_notes'].items():\n",
    "    ax.text(index + 0.05, n_notes + 50, str(n_notes), color='black', ha='right', va='bottom')"
1107
   ]
1108
  },
1109
  {
1110
   "cell_type": "code",
1111
   "execution_count": null,
1112
   "metadata": {
1113
    "ExecuteTime": {
1114
     "end_time": "2019-06-27T01:07:18.818779Z",
1115
     "start_time": "2019-06-27T01:07:18.795768Z"
1116
    }
1117
   },
1118
   "outputs": [],
1119
   "source": [
1120
    "# fig.savefig(args.figdir/'imminent_label_bp.tif', dpi=300)"
1121
   ]
1122
  },
1123
  {
1124
   "cell_type": "markdown",
1125
   "metadata": {},
1126
   "source": [
1127
    "#### With Admissions"
1128
   ]
1129
  },
1130
  {
1131
   "cell_type": "code",
1132
   "execution_count": null,
1133
   "metadata": {
1134
    "ExecuteTime": {
1135
     "end_time": "2019-07-17T18:54:20.657113Z",
1136
     "start_time": "2019-07-17T18:54:20.298763Z"
1137
    }
1138
   },
1139
   "outputs": [],
1140
   "source": [
1141
    "# p1: number of rows of df per 'imminent_adm_label' value.\n",
    "p1 = df.groupby('imminent_adm_label').size().reset_index(name='n_notes')\n",
    "# p2: number of distinct 'hadm_id' values per 'imminent_adm_label' value.\n",
    "p2 = df.groupby('imminent_adm_label')['hadm_id'].nunique().reset_index()\n",
    "# p: both counts side by side, joined on the label.\n",
    "p = pd.merge(p1, p2, on='imminent_adm_label')"
1144
   ]
1145
  },
1146
  {
1147
   "cell_type": "code",
1148
   "execution_count": null,
1149
   "metadata": {
1150
    "ExecuteTime": {
1151
     "end_time": "2019-07-17T18:54:20.757964Z",
1152
     "start_time": "2019-07-17T18:54:20.660979Z"
1153
    }
1154
   },
1155
   "outputs": [],
1156
   "source": [
1157
    "p['imminent_adm_label'] = desc"
1158
   ]
1159
  },
1160
  {
1161
   "cell_type": "code",
1162
   "execution_count": null,
1163
   "metadata": {
1164
    "ExecuteTime": {
1165
     "end_time": "2019-07-17T18:54:21.287840Z",
1166
     "start_time": "2019-07-17T18:54:21.204792Z"
1167
    }
1168
   },
1169
   "outputs": [],
1170
   "source": [
1171
    "# Put the rows in order 2, 1, 0 and renumber them from zero, then display the result.\n",
    "p = p.reindex([2, 1, 0]).reset_index(drop=True)\n",
    "p"
1174
   ]
1175
  },
1176
  {
1177
   "cell_type": "code",
1178
   "execution_count": null,
1179
   "metadata": {
1180
    "ExecuteTime": {
1181
     "end_time": "2019-07-17T18:54:29.367296Z",
1182
     "start_time": "2019-07-17T18:54:29.263198Z"
1183
    }
1184
   },
1185
   "outputs": [],
1186
   "source": [
1187
    "# Reshape p to long format for a grouped bar chart: one row per (class, count type).\n",
    "# rename() returns a new frame, so p itself is left untouched.\n",
    "plot_df = (\n",
    "    p.rename(columns={'hadm_id':'# Admissions', 'n_notes':'# Notes'})\n",
    "    .melt(id_vars='imminent_adm_label', var_name='Legend', value_name='counts')\n",
    ")"
1190
   ]
1191
  },
1192
  {
1193
   "cell_type": "code",
1194
   "execution_count": null,
1195
   "metadata": {
1196
    "ExecuteTime": {
1197
     "end_time": "2019-07-17T18:54:35.592328Z",
1198
     "start_time": "2019-07-17T18:54:34.576044Z"
1199
    }
1200
   },
1201
   "outputs": [],
1202
   "source": [
1203
    "# Grouped bar chart of 'counts' per 'imminent_adm_label', split into one bar per 'Legend' value.\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "sns.barplot(x='imminent_adm_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
    "ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
    "ax.set_xlabel('Imminent Class Label')\n",
    "# The y-axis is shared by every 'Legend' series, not only the note counts, so it gets a neutral label.\n",
    "ax.set_ylabel('count')\n",
    "\n",
    "# Rows in the first half of plot_df are annotated to the left of the tick position, rows in the\n",
    "# second half to the right. Presumably plot_df is the long-format output of pd.melt with a\n",
    "# 0..n-1 index, so the halves line up with the two hue levels -- TODO confirm upstream.\n",
    "n_classes = len(plot_df) // 2\n",
    "for index, row in plot_df.iterrows():\n",
    "    x_pos = index - 0.13 if index < n_classes else index % n_classes + 0.25\n",
    "    ax.text(x_pos, row['counts'] + 50, str(row['counts']), color='black', ha='right', va='bottom')"
1215
   ]
1216
  },
1217
  {
1218
   "cell_type": "code",
1219
   "execution_count": null,
1220
   "metadata": {},
1221
   "outputs": [],
1222
   "source": [
1223
    "# fig.savefig(args.figdir/'imminent_label_adms_bp.tif', dpi=300)"
1224
   ]
1225
  },
1226
  {
1227
   "cell_type": "markdown",
1228
   "metadata": {},
1229
   "source": [
1230
    "### Prolonged Stay Class Distribution"
1231
   ]
1232
  },
1233
  {
1234
   "cell_type": "code",
1235
   "execution_count": null,
1236
   "metadata": {
1237
    "ExecuteTime": {
1238
     "end_time": "2019-07-17T19:00:53.843117Z",
1239
     "start_time": "2019-07-17T19:00:53.541066Z"
1240
    }
1241
   },
1242
   "outputs": [],
1243
   "source": [
1244
    "desc = ['Short Stay', 'Prolonged Stay']"
1245
   ]
1246
  },
1247
  {
1248
   "cell_type": "markdown",
1249
   "metadata": {},
1250
   "source": [
1251
    "#### Without Admissions"
1252
   ]
1253
  },
1254
  {
1255
   "cell_type": "code",
1256
   "execution_count": null,
1257
   "metadata": {
1258
    "ExecuteTime": {
1259
     "end_time": "2019-07-17T19:01:08.738416Z",
1260
     "start_time": "2019-07-17T19:01:08.586921Z"
1261
    }
1262
   },
1263
   "outputs": [],
1264
   "source": [
1265
    "# Count rows of df per 'prolonged_stay_label' value and swap the raw labels for display names.\n",
    "# desc is applied by position, so it must list the classes in the order groupby emits them\n",
    "# (sorted key order by default) and have exactly one entry per class.\n",
    "plot_df = (\n",
    "    df.groupby('prolonged_stay_label').size()\n",
    "    .reset_index(name='n_notes')\n",
    "    .assign(prolonged_stay_label=desc)\n",
    "    .reindex([1, 0])  # swap the two rows\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "plot_df"
1270
   ]
1271
  },
1272
  {
1273
   "cell_type": "code",
1274
   "execution_count": null,
1275
   "metadata": {
1276
    "ExecuteTime": {
1277
     "end_time": "2019-07-17T19:01:18.296482Z",
1278
     "start_time": "2019-07-17T19:01:17.775519Z"
1279
    }
1280
   },
1281
   "outputs": [],
1282
   "source": [
1283
    "# Bar chart of 'n_notes' per 'prolonged_stay_label', with each bar annotated by its count.\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.barplot(data=plot_df, x='prolonged_stay_label', y='n_notes', ax=ax)\n",
    "ax.set_xlabel('5 Day Discharge Class Label')\n",
    "ax.set_ylabel('# notes')\n",
    "\n",
    "# The row's index label is used as the x coordinate of its annotation (shifted by 0.05),\n",
    "# and the text is lifted 50 units above the bar height.\n",
    "for index, n_notes in plot_df['n_notes'].items():\n",
    "    ax.text(index + 0.05, n_notes + 50, str(n_notes), color='black', ha='right', va='bottom')"
1289
   ]
1290
  },
1291
  {
1292
   "cell_type": "code",
1293
   "execution_count": null,
1294
   "metadata": {
1295
    "ExecuteTime": {
1296
     "end_time": "2019-06-30T21:09:10.237355Z",
1297
     "start_time": "2019-06-30T21:09:10.163Z"
1298
    }
1299
   },
1300
   "outputs": [],
1301
   "source": [
1302
    "# fig.savefig(args.figdir/'discharge_label_bp.tif', dpi=300)"
1303
   ]
1304
  },
1305
  {
1306
   "cell_type": "markdown",
1307
   "metadata": {},
1308
   "source": [
1309
    "#### With Admissions"
1310
   ]
1311
  },
1312
  {
1313
   "cell_type": "code",
1314
   "execution_count": null,
1315
   "metadata": {
1316
    "ExecuteTime": {
1317
     "end_time": "2019-07-17T19:01:34.791633Z",
1318
     "start_time": "2019-07-17T19:01:34.568783Z"
1319
    }
1320
   },
1321
   "outputs": [],
1322
   "source": [
1323
    "# p1: number of rows of df per 'prolonged_stay_label' value.\n",
    "p1 = df.groupby('prolonged_stay_label').size().reset_index(name='n_notes')\n",
    "# p2: number of distinct 'hadm_id' values per 'prolonged_stay_label' value.\n",
    "p2 = df.groupby('prolonged_stay_label')['hadm_id'].nunique().reset_index()\n",
    "\n",
    "# Join both counts on the label, replace the raw labels with the display names in desc\n",
    "# (applied by position), then swap the two rows and renumber them from zero.\n",
    "p = (\n",
    "    pd.merge(p1, p2, on='prolonged_stay_label')\n",
    "    .assign(prolonged_stay_label=desc)\n",
    "    .reindex([1, 0])\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "p"
1330
   ]
1331
  },
1332
  {
1333
   "cell_type": "code",
1334
   "execution_count": null,
1335
   "metadata": {
1336
    "ExecuteTime": {
1337
     "end_time": "2019-07-17T19:01:42.249351Z",
1338
     "start_time": "2019-07-17T19:01:42.137270Z"
1339
    }
1340
   },
1341
   "outputs": [],
1342
   "source": [
1343
    "# Reshape p to long format for a grouped bar chart: one row per (class, count type).\n",
    "# rename() returns a new frame, so p itself is left untouched.\n",
    "plot_df = (\n",
    "    p.rename(columns={'hadm_id':'# Admissions', 'n_notes':'# Notes'})\n",
    "    .melt(id_vars='prolonged_stay_label', var_name='Legend', value_name='counts')\n",
    ")"
1346
   ]
1347
  },
1348
  {
1349
   "cell_type": "code",
1350
   "execution_count": null,
1351
   "metadata": {
1352
    "ExecuteTime": {
1353
     "end_time": "2019-07-17T19:01:47.756030Z",
1354
     "start_time": "2019-07-17T19:01:47.553253Z"
1355
    }
1356
   },
1357
   "outputs": [],
1358
   "source": [
1359
    "# Grouped bar chart of 'counts' per 'prolonged_stay_label', split into one bar per 'Legend' value.\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "sns.barplot(x='prolonged_stay_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
    "ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
    "ax.set_xlabel('5 Day Discharge Class Label')\n",
    "# The y-axis is shared by every 'Legend' series, not only the note counts, so it gets a neutral label.\n",
    "ax.set_ylabel('count')\n",
    "\n",
    "# Rows in the first half of plot_df are annotated to the left of the tick position, rows in the\n",
    "# second half to the right. Presumably plot_df is the long-format output of pd.melt with a\n",
    "# 0..n-1 index, so the halves line up with the two hue levels -- TODO confirm upstream.\n",
    "n_classes = len(plot_df) // 2\n",
    "for index, row in plot_df.iterrows():\n",
    "    x_pos = index - 0.13 if index < n_classes else index % n_classes + 0.25\n",
    "    ax.text(x_pos, row['counts'] + 50, str(row['counts']), color='black', ha='right', va='bottom')"
1371
   ]
1372
  },
1373
  {
1374
   "cell_type": "code",
1375
   "execution_count": null,
1376
   "metadata": {},
1377
   "outputs": [],
1378
   "source": [
1379
    "# fig.savefig(args.figdir/'discharge_label_adms_bp.tif', dpi=300)"
1380
   ]
1381
  }
1382
 ],
1383
 "metadata": {
1384
  "kernelspec": {
1385
   "display_name": "Python 3",
1386
   "language": "python",
1387
   "name": "python3"
1388
  },
1389
  "language_info": {
1390
   "codemirror_mode": {
1391
    "name": "ipython",
1392
    "version": 3
1393
   },
1394
   "file_extension": ".py",
1395
   "mimetype": "text/x-python",
1396
   "name": "python",
1397
   "nbconvert_exporter": "python",
1398
   "pygments_lexer": "ipython3",
1399
   "version": "3.7.4"
1400
  },
1401
  "toc": {
1402
   "base_numbering": 1,
1403
   "nav_menu": {},
1404
   "number_sections": true,
1405
   "sideBar": true,
1406
   "skip_h1_title": true,
1407
   "title_cell": "Table of Contents",
1408
   "title_sidebar": "Contents",
1409
   "toc_cell": false,
1410
   "toc_position": {
1411
    "height": "calc(100% - 180px)",
1412
    "left": "10px",
1413
    "top": "150px",
1414
    "width": "165px"
1415
   },
1416
   "toc_section_display": true,
1417
   "toc_window_display": false
1418
  }
1419
 },
1420
 "nbformat": 4,
1421
 "nbformat_minor": 2
1422
}