--- a
+++ b/analysis/data_analysis.ipynb
@@ -0,0 +1,1422 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dataset Preparation for Prediction of Imminent ICU Admission and Prolonged Stay"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports & Inits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-08-26T18:54:33.858025Z",
+     "start_time": "2019-08-26T18:54:33.683791Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-08-26T18:54:33.919510Z",
+     "start_time": "2019-08-26T18:54:33.887213Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'workdir': PosixPath('../data/workdir'),\n",
+       " 'figdir': PosixPath('../data/results/figures'),\n",
+       " 'resultdir': PosixPath('../data/results'),\n",
+       " 'dataset_csv': PosixPath('../data/proc_dataset.csv'),\n",
+       " 'imminent_adm_cols': ['hadm_id', 'processed_note', 'imminent_adm_label'],\n",
+       " 'prolonged_stay_cols': ['hadm_id', 'processed_note', 'prolonged_stay_label'],\n",
+       " 'cols': ['hadm_id',\n",
+       "  'imminent_adm_label',\n",
+       "  'prolonged_stay_label',\n",
+       "  'processed_note',\n",
+       "  'charttime',\n",
+       "  'intime',\n",
+       "  'chartinterval'],\n",
+       " 'dates': ['charttime', 'intime'],\n",
+       " 'ia_thresh': {'lr': 0.45, 'rf': 0.27, 'gbm': 0.435, 'mlp': 0.2},\n",
+       " 'ps_thresh': {'lr': 0.39, 'rf': 0.36, 'gbm': 0.324, 'mlp': 0.27}}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "\n",
+    "import math\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import spacy\n",
+    "\n",
+    "import seaborn as sns\n",
+    "sns.set(style = 'darkgrid')\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "from scipy import stats\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from utils.splits import set_group_splits\n",
+    "from args import args\n",
+    "vars(args)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "heading_collapsed": true
+   },
+   "source": [
+    "## Stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:48:22.330446Z",
+     "start_time": "2019-07-17T18:48:17.056668Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "# Load the processed notes and derive one frame per prediction task.\n",
+    "df = pd.read_csv(args.dataset_csv)\n",
+    "# Imminent admission: drop notes whose label is -1.\n",
+    "ia_df = df.loc[df['imminent_adm_label'] != -1, args.imminent_adm_cols].reset_index(drop=True)\n",
+    "# Prolonged stay: drop notes with chartinterval 0 (filters `df`, the frame loaded above).\n",
+    "ps_df = df.loc[df['chartinterval'] != 0, args.prolonged_stay_cols].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:48:22.402048Z",
+     "start_time": "2019-07-17T18:48:22.333813Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df['subject_id'].nunique(), df['hadm_id'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:48:29.519915Z",
+     "start_time": "2019-07-17T18:48:29.443575Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "# One age per patient (first non-null admission_age per subject_id), capped at 100.\n",
+    "ages = df.groupby('subject_id')['admission_age'].first().clip(upper=100).to_numpy()\n",
+    "q1, q3 = np.percentile(ages, [25, 75])\n",
+    "# Print the median (not the mean) so the value matches its label and the IQR next to it.\n",
+    "print(f\"Median age: {np.median(ages):0.1f}\")\n",
+    "print(f\"IQR: {q1:0.1f} - {q3:0.1f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:48:53.210075Z",
+     "start_time": "2019-07-17T18:48:53.002656Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df['adm_to_icu_period'].describe().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:21.918238Z",
+     "start_time": "2019-07-17T18:49:21.653850Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.groupby(df['admission_type'])['hadm_id'].nunique().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:22.411510Z",
+     "start_time": "2019-07-17T18:49:22.249868Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.groupby(df['ethnicity'])['subject_id'].nunique().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "hidden": true
+   },
+   "source": [
+    "Check that the average prevalence across random test splits is approximately the same as the true prevalence."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:27:25.090684Z",
+     "start_time": "2019-07-17T18:27:05.095345Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "def split_prevalence(task_df, label_col, seed):\n",
+    "  \"\"\"Fraction of test-split notes whose `label_col` equals 1 for one `set_group_splits` seed.\"\"\"\n",
+    "  sdf = set_group_splits(task_df.copy(), group_col='hadm_id', seed=seed)\n",
+    "  test_labels = sdf.loc[sdf['split'] == 'test', label_col]\n",
+    "  return (test_labels == 1).mean()\n",
+    "\n",
+    "ia_p = []\n",
+    "ps_p = []\n",
+    "\n",
+    "# Seeds 127..226 (100 seeds); both tasks are split with the same seed on each pass.\n",
+    "for seed in range(127, 227):\n",
+    "  ia_p.append(split_prevalence(ia_df, 'imminent_adm_label', seed))\n",
+    "  ps_p.append(split_prevalence(ps_df, 'prolonged_stay_label', seed))\n",
+    "\n",
+    "ia_p = np.array(ia_p)\n",
+    "ps_p = np.array(ps_p)\n",
+    "\n",
+    "print(f\"Prevalence of Imminent Admission: {(len(ia_df.loc[ia_df['imminent_adm_label'] == 1])/len(ia_df)):0.3f}\")\n",
+    "print(f\"Average of test set = {(ia_p.mean()):0.3f}, std = {(ia_p.std()):0.3f}\")\n",
+    "print(f\"Prevalence of Prolonged Stay: {(len(ps_df.loc[ps_df['prolonged_stay_label'] == 1])/len(ps_df)):0.3f}\")\n",
+    "print(f\"Average of test set = {(ps_p.mean()):0.3f}, std = {(ps_p.std()):0.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:26.743454Z",
+     "start_time": "2019-07-17T18:49:26.652037Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "print(f\"Average number of notes per admission for imminent admission: {ia_df.groupby('hadm_id').size().mean():0.2f}\")\n",
+    "print(f\"Average number of notes per admission for prolonged stay (and entire dataset): {ps_df.groupby('hadm_id').size().mean():0.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:48.377622Z",
+     "start_time": "2019-07-17T18:49:48.020027Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "# Unique patients grouped by whether `deathtime` is populated.\n",
+    "df.groupby(df['deathtime'].notnull())['subject_id'].nunique().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:49.030699Z",
+     "start_time": "2019-07-17T18:49:48.901279Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.groupby(df['gender'])['subject_id'].nunique().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "hidden": true
+   },
+   "source": [
+    "Distribution of notes by category"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:51.015549Z",
+     "start_time": "2019-07-17T18:49:50.883550Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.groupby(df['category']).size().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "hidden": true
+   },
+   "source": [
+    "Distribution of notes by category for imminent admissions and delayed admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:40:35.638983Z",
+     "start_time": "2019-07-17T18:40:35.428118Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.loc[(df['imminent_adm_label'] == 1)].groupby('category').size().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:29.970948Z",
+     "start_time": "2019-07-17T18:42:29.704004Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.loc[(df['imminent_adm_label'] == 0)].groupby('category').size().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "hidden": true
+   },
+   "source": [
+    "Distribution of notes by category for prolonged stay and short stay"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:30.199045Z",
+     "start_time": "2019-07-17T18:42:29.974531Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.loc[(df['prolonged_stay_label'] == 1)].groupby('category').size().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:30.334632Z",
+     "start_time": "2019-07-17T18:42:30.202847Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df.loc[(df['prolonged_stay_label'] == 0)].groupby('category').size().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:49:58.769470Z",
+     "start_time": "2019-07-17T18:49:58.658678Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df['icu_los'].describe().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:52:01.571726Z",
+     "start_time": "2019-07-17T18:52:01.262084Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df['note'].apply(len).describe().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:23.454748Z",
+     "start_time": "2019-07-17T18:42:23.043855Z"
+    },
+    "hidden": true
+   },
+   "outputs": [],
+   "source": [
+    "df['charttime_to_icu_period'].describe().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Plots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-08-26T18:54:53.518650Z",
+     "start_time": "2019-08-26T18:54:50.602313Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['subject_id', 'hadm_id', 'icustay_id', 'admission_type', 'admittime',\n",
+       "       'dischtime', 'intime', 'outtime', 'charttime', 'icu_los', 'deathtime',\n",
+       "       'adm_to_icu_period', 'charttime_to_icu_period', 'chartinterval',\n",
+       "       'ethnicity', 'dob', 'gender', 'admission_age', 'category',\n",
+       "       'imminent_adm_label', 'prolonged_stay_label', 'note', 'processed_note'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(args.dataset_csv)\n",
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-08-26T18:56:35.688519Z",
+     "start_time": "2019-08-26T18:56:35.448942Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Radiology\n",
+      "VEN DUP EXTEXT BIL (MAP/DVT)\n",
+      "[**2162-5-17**] 8:12 AM\n",
+      " [**Last Name (un) 1296**] DUP EXTEXT BIL (MAP/DVT)                                    Clip # [**Clip Number (Radiology) 18833**]\n",
+      " Reason: eval for vein harvesting*****OR second case on [**2162-5-17**]******\n",
+      " Admitting Diagnosis: CORONARY ARTERY DISEASE\n",
+      " ______________________________________________________________________________\n",
+      " [**Hospital 2**] MEDICAL CONDITION:\n",
+      "  60 year old man pre-op for CABG\n",
+      " REASON FOR THIS EXAMINATION:\n",
+      "  eval for vein harvesting*****OR second case on [**2162-5-17**]********\n",
+      " ______________________________________________________________________________\n",
+      "                                 FINAL REPORT\n",
+      " HISTORY:  A 60-year-old gentleman, preop for CABG.  Search for conduit.\n",
+      "\n",
+      " TECHNIQUE:  Venous mapping of the superficial veins in the lower extremities\n",
+      " was performed with [**Doctor Last Name 37**]-scale and Doppler ultrasound.\n",
+      "\n",
+      " FINDINGS:  Right great saphenous vein is patent and compressible with\n",
+      " diameters ranging between 0.15 and 0.21 cm.  The right small saphenous vein is\n",
+      " patent and compressible with diameters ranging between 0.14 and 0.20 cm.\n",
+      "\n",
+      " The left great saphenous vein presented with diameters ranging between 0.06\n",
+      " and 0.49 cm.  It was not visualized below the calf.  The left small saphenous\n",
+      " vein presented thick walled and calcified.\n",
+      "\n",
+      " COMPARISON:  None available.\n",
+      "\n",
+      " IMPRESSION:  Patent right great and small saphenous veins, with diameters\n",
+      " described above.  Left great saphenous vein with small diameters below the mid\n",
+      " thigh and not visualized below the calf.  The left small saphenous vein\n",
+      " presented with thick walls and calcifications.\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.iloc[0]['note'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-08-26T18:56:41.114437Z",
+     "start_time": "2019-08-26T18:56:41.086224Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Radiology \n",
+      " VEN DUP EXTEXT BIL ( MAP/DVT ) \n",
+      " [ * * 2162 - 5 - 17 * * ] 8:12 AM \n",
+      "  [ * * Last Name ( un ) 1296 * * ] DUP EXTEXT BIL ( MAP/DVT )                                     Clip # [ * * Clip Number ( Radiology ) 18833 * * ] \n",
+      "  Reason : eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * \n",
+      "  Admitting Diagnosis : CORONARY ARTERY DISEASE \n",
+      "  _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
+      "  [ * * Hospital 2 * * ] MEDICAL CONDITION : \n",
+      "   60 year old man pre-op for CABG \n",
+      "  REASON FOR THIS EXAMINATION : \n",
+      "   eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * * * \n",
+      "  _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
+      "                                  FINAL REPORT \n",
+      "  HISTORY :   A 60-year-old gentleman , preop for CABG .   Search for conduit . \n",
+      "\n",
+      "  TECHNIQUE :   Venous mapping of the superficial veins in the lower extremities \n",
+      "  was performed with [ * * Doctor Last Name 37**]-scale and Doppler ultrasound . \n",
+      "\n",
+      "  FINDINGS :   Right great saphenous vein is patent and compressible with \n",
+      "  diameters ranging between 0.15 and 0.21 cm .   The right small saphenous vein is \n",
+      "  patent and compressible with diameters ranging between 0.14 and 0.20 cm . \n",
+      "\n",
+      "  The left great saphenous vein presented with diameters ranging between 0.06 \n",
+      "  and 0.49 cm .   It was not visualized below the calf .   The left small saphenous \n",
+      "  vein presented thick walled and calcified . \n",
+      "\n",
+      "  COMPARISON :   None available . \n",
+      "\n",
+      "  IMPRESSION :   Patent right great and small saphenous veins , with diameters \n",
+      "  described above .   Left great saphenous vein with small diameters below the mid \n",
+      "  thigh and not visualized below the calf .   The left small saphenous vein \n",
+      "  presented with thick walls and calcifications . \n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.iloc[0]['processed_note'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:36.032162Z",
+     "start_time": "2019-07-17T18:42:36.008573Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "intervals = ['-1 ≤ t ≤ 0']\n",
+    "intervals += [f'-{i+1} ≤ t ≤ -{i}' for i in range(1, 15)]\n",
+    "intervals.append('t ≤ -15')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Bar Plot of Notes Over Days"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### All Notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:38.585605Z",
+     "start_time": "2019-07-17T18:42:38.475792Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Count notes per chartinterval bin and label each bin through `intervals` by its bin value,\n",
+    "# so an absent bin cannot shift or break the label assignment.\n",
+    "plot_df = df.groupby('chartinterval').size().rename('n_notes').reset_index()\n",
+    "plot_df['days'] = plot_df['chartinterval'].apply(lambda x: intervals[x])\n",
+    "plot_df = plot_df.drop(columns='chartinterval')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:42:41.669474Z",
+     "start_time": "2019-07-17T18:42:40.376671Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(15, 8))\n",
+    "sns.barplot(x='days', y='n_notes', data=plot_df, ax=ax)\n",
+    "ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha='right')\n",
+    "ax.set_xlabel('Time to ICU Admission (days)')\n",
+    "ax.set_ylabel('# notes')\n",
+    "for index, row in plot_df.iterrows():\n",
+    "    ax.text(index, row['n_notes'], str(row['n_notes']), color='black', ha='center', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-27T00:38:13.014421Z",
+     "start_time": "2019-06-27T00:38:12.991010Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'note_bp.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### By Category"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:43:10.933200Z",
+     "start_time": "2019-07-17T18:43:10.805412Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_intervals(ax, df, cat):\n",
+    "  \"\"\"Draw a bar chart of note counts per interval for one note category.\n",
+    "\n",
+    "  ax: matplotlib Axes to draw on.\n",
+    "  df: frame with a `days` column (x labels) and an `n_notes` column (bar heights).\n",
+    "  cat: category name shown in the panel title.\n",
+    "  \"\"\"\n",
+    "  sns.barplot(x='days', y='n_notes', data=df, ax=ax)\n",
+    "  ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')\n",
+    "  ax.set(xlabel='', ylabel='', title=f\"Note Category: {cat}\\n# notes: {df['n_notes'].sum()}\")\n",
+    "\n",
+    "  # Annotate each bar with its count; bars are addressed by position, not by frame index.\n",
+    "  for bar_pos, n_notes in enumerate(df['n_notes']):\n",
+    "    ax.text(bar_pos, n_notes, str(n_notes), color='black', ha='center', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:43:12.728434Z",
+     "start_time": "2019-07-17T18:43:12.610095Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plot_df = pd.DataFrame(df.groupby(['category', 'chartinterval']).size(), columns=['n_notes'])\n",
+    "plot_df.reset_index(inplace=True)\n",
+    "plot_df['days'] = plot_df['chartinterval'].apply(lambda x: intervals[x])\n",
+    "plot_df.drop(['chartinterval'], inplace=True, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:43:26.011143Z",
+     "start_time": "2019-07-17T18:43:15.024678Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Category values to match; some are matched with a trailing space, which is stripped for the panel title.\n",
+    "categories = [\n",
+    "  'Case Management ', 'Consult', 'General', 'Nursing',\n",
+    "  'Nursing/other', 'Nutrition', 'Pharmacy', 'Physician ',\n",
+    "  'Radiology', 'Rehab Services', 'Respiratory ', 'Social Work',\n",
+    "]\n",
+    "\n",
+    "fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
+    "# ax.flat walks the 6x2 grid row by row: left panel, then right panel, top to bottom.\n",
+    "for panel, category in zip(ax.flat, categories):\n",
+    "  plot_intervals(panel, plot_df.loc[plot_df['category'] == category, ['n_notes', 'days']], category.strip())\n",
+    "\n",
+    "# Shared axis captions for the whole grid.\n",
+    "fig.text(0.5, 0.1, 'Time to ICU Admission (days)', ha='center')\n",
+    "fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
+    "\n",
+    "plt.subplots_adjust(hspace = 0.3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-27T00:42:13.420913Z",
+     "start_time": "2019-06-27T00:42:13.395654Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# cats = sorted(list(df['category'].unique()))\n",
+    "\n",
+    "# n = 0\n",
+    "# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
+    "# plot_intervals(ax, plot_df.loc[plot_df['category'] == cats[n], ['n_notes', 'days']], cats[n])\n",
+    "# ax.set_xlabel('Time to ICU Admission (days)')\n",
+    "# ax.set_ylabel('# notes')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-26T19:55:34.228896Z",
+     "start_time": "2019-06-26T19:55:34.204962Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'note_cats_bp.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Note Chart Time to ICU Admission Period Histogram"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### All Notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:43:56.456943Z",
+     "start_time": "2019-07-17T18:43:56.330361Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plot_df = df[['category', 'charttime_to_icu_period']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:44:05.284861Z",
+     "start_time": "2019-07-17T18:44:03.949948Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "\n",
+    "sns.distplot(plot_df['charttime_to_icu_period'], kde=False, ax=ax, bins=80)\n",
+    "ax.set_xlabel('Period between Note Chart Time and ICU Admission Time (days)')\n",
+    "ax.set_ylabel('# notes')\n",
+    "ax.set_xlim(0, 60)\n",
+    "\n",
+    "# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['chart_icu_period'].min()}, Avg: {mdf['chart_icu_period'].mean(): 0.2f}, Max: {mdf['chart_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-26T19:55:34.712151Z",
+     "start_time": "2019-06-26T19:55:34.686551Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'note_icu_period_hist.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### By Category"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:44:13.676427Z",
+     "start_time": "2019-07-17T18:44:13.571725Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def plot_period(ax, df, cat):\n",
+    "  \"\"\"Draw a 10-bin histogram (no KDE) of `df` on `ax`, titled with the note category `cat`.\"\"\"\n",
+    "  sns.distplot(df, kde=False, ax=ax, bins=10)\n",
+    "  ax.set(xlabel='', ylabel='', title=f\"Note Category: {cat}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:45:21.689010Z",
+     "start_time": "2019-07-17T18:45:12.353337Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Category values to match; some are matched with a trailing space, which is stripped for the panel title.\n",
+    "categories = [\n",
+    "  'Case Management ', 'Consult', 'General', 'Nursing',\n",
+    "  'Nursing/other', 'Nutrition', 'Pharmacy', 'Physician ',\n",
+    "  'Radiology', 'Rehab Services', 'Respiratory ', 'Social Work',\n",
+    "]\n",
+    "\n",
+    "fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
+    "# ax.flat walks the 6x2 grid row by row: left panel, then right panel, top to bottom.\n",
+    "for panel, category in zip(ax.flat, categories):\n",
+    "  plot_period(panel, plot_df.loc[plot_df['category'] == category, ['charttime_to_icu_period']], category.strip())\n",
+    "\n",
+    "# Shared axis captions for the whole grid.\n",
+    "fig.text(0.5, 0.11, 'Period between Note Chart Time and ICU Admission Time (days)', ha='center')\n",
+    "fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
+    "\n",
+    "plt.subplots_adjust(hspace = 0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-27T00:43:24.745337Z",
+     "start_time": "2019-06-27T00:43:24.720208Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# cats = sorted(list(df['category'].unique()))\n",
+    "\n",
+    "# n = 0\n",
+    "# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
+    "# plot_period(ax, plot_df.loc[plot_df['category'] == cats[n], ['chart_icu_period']], cats[n])\n",
+    "# ax.set_xlabel('Time to ICU Admission (days)')\n",
+    "# ax.set_ylabel('# notes')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-26T19:35:38.476961Z",
+     "start_time": "2019-06-26T19:35:38.451886Z"
+    },
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'note_cat_icu_period_hist.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Hospital Admission to ICU Admission Period Histogram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:45:44.547021Z",
+     "start_time": "2019-07-17T18:45:44.519812Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plot_df = df[['adm_to_icu_period']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:45:46.580796Z",
+     "start_time": "2019-07-17T18:45:45.217784Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "\n",
+    "sns.distplot(plot_df, kde=False, ax=ax, bins=80)\n",
+    "ax.set_xlabel('Time between hospital admission and ICU admission (days)')\n",
+    "ax.set_ylabel('# notes')\n",
+    "ax.set_xlim(0, 70)\n",
+    "# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['adm_icu_period'].min()}, Avg: {mdf['adm_icu_period'].mean(): 0.2f}, Max: {mdf['adm_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom')    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'adm_icu_period_hist.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Note Length Histogram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:45:50.860829Z",
+     "start_time": "2019-07-17T18:45:49.137114Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "sns.distplot(df['note'].apply(len), kde=False, ax=ax, bins=100)\n",
+    "ax.set_xlabel('Length of Note (characters)')\n",
+    "ax.set_ylabel('# notes')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-26T19:59:38.291139Z",
+     "start_time": "2019-06-26T19:59:38.267860Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'note_len_hist.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Imminent ICU Prediction Class Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:53:53.526861Z",
+     "start_time": "2019-07-17T18:53:53.429558Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Display names for the imminent-admission classes. They are assigned positionally\n",
+    "# to the grouped label rows in the cells that follow, so the order matters\n",
+    "# (presumably ascending label value -- confirm against the labelling code).\n",
+    "desc = [\n",
+    "    'Unused',\n",
+    "    'Delayed Admission',\n",
+    "    'Imminent Admission',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Without Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:53:54.551036Z",
+     "start_time": "2019-07-17T18:53:54.423540Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Notes per imminent-admission class, relabelled with `desc`, then reordered to\n",
+    "# rows 2, 1, 0 and renumbered from zero.\n",
+    "plot_df = (\n",
+    "    df.groupby(['imminent_adm_label'])\n",
+    "    .size()\n",
+    "    .reset_index(name='n_notes')\n",
+    "    .assign(imminent_adm_label=desc)\n",
+    "    .reindex([2, 1, 0])\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:53:55.840770Z",
+     "start_time": "2019-07-17T18:53:54.913513Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Bar chart of note counts per imminent-admission class, each bar annotated\n",
+    "# with its count.\n",
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "sns.barplot(data=plot_df, x='imminent_adm_label', y='n_notes', ax=ax)\n",
+    "ax.set_xlabel('Imminent Class Label')\n",
+    "ax.set_ylabel('# notes')\n",
+    "\n",
+    "for bar_pos, n_notes in plot_df['n_notes'].items():\n",
+    "    # The text's right edge sits at bar_pos + 0.05, 50 units above the count value.\n",
+    "    ax.text(bar_pos + 0.05, n_notes + 50, str(n_notes),\n",
+    "            color='black', ha='right', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-27T01:07:18.818779Z",
+     "start_time": "2019-06-27T01:07:18.795768Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'imminent_label_bp.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### With Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:54:20.657113Z",
+     "start_time": "2019-07-17T18:54:20.298763Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Per-class note counts (p1) and distinct hadm_id counts (p2), joined on the label.\n",
+    "by_label = df.groupby(['imminent_adm_label'])\n",
+    "p1 = by_label.size().reset_index(name='n_notes')\n",
+    "p2 = by_label['hadm_id'].nunique().reset_index()\n",
+    "p = pd.merge(p1, p2, on=['imminent_adm_label'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:54:20.757964Z",
+     "start_time": "2019-07-17T18:54:20.660979Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Swap the raw label values for their display names (positional, one per row).\n",
+    "p = p.assign(imminent_adm_label=desc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:54:21.287840Z",
+     "start_time": "2019-07-17T18:54:21.204792Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Reorder the rows to 2, 1, 0, renumber them from zero, and display the table.\n",
+    "p = p.reindex([2, 1, 0]).reset_index(drop=True)\n",
+    "p"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:54:29.367296Z",
+     "start_time": "2019-07-17T18:54:29.263198Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Long format for the grouped bar chart: one row per (class, measure) pair.\n",
+    "plot_df = p.rename(columns={'hadm_id': '# Admissions', 'n_notes': '# Notes'}).melt(\n",
+    "    id_vars='imminent_adm_label', var_name='Legend', value_name='counts'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T18:54:35.592328Z",
+     "start_time": "2019-07-17T18:54:34.576044Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Grouped bars: one bar per 'Legend' level for each imminent-admission class.\n",
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "\n",
+    "sns.barplot(x='imminent_adm_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
+    "ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
+    "ax.set_xlabel('Imminent Class Label')\n",
+    "# The bars mix several measures (distinguished by the legend), so the axis\n",
+    "# carries a neutral label rather than naming only one of them.\n",
+    "ax.set_ylabel('Count')\n",
+    "\n",
+    "# plot_df is in long format: rows in the first half are annotated left of their\n",
+    "# tick, rows in the second half wrap back to the same ticks and go to the right.\n",
+    "n_classes = len(plot_df) // 2\n",
+    "for index, row in plot_df.iterrows():\n",
+    "    x_pos = index - 0.13 if index < n_classes else index % n_classes + 0.25\n",
+    "    ax.text(x_pos, row['counts'] + 50, str(row['counts']), color='black', ha='right', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'imminent_label_adms_bp.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prolonged Stay Class Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:00:53.843117Z",
+     "start_time": "2019-07-17T19:00:53.541066Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Display names for the prolonged-stay classes. They are assigned positionally to\n",
+    "# the grouped label rows in the cells that follow, so the order matters\n",
+    "# (presumably ascending label value -- confirm against the labelling code).\n",
+    "desc = [\n",
+    "    'Short Stay',\n",
+    "    'Prolonged Stay',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Without Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:01:08.738416Z",
+     "start_time": "2019-07-17T19:01:08.586921Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Notes per prolonged-stay class, relabelled with `desc`, then reordered to\n",
+    "# rows 1, 0 and renumbered from zero.\n",
+    "plot_df = (\n",
+    "    df.groupby(['prolonged_stay_label'])\n",
+    "    .size()\n",
+    "    .reset_index(name='n_notes')\n",
+    "    .assign(prolonged_stay_label=desc)\n",
+    "    .reindex([1, 0])\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "plot_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:01:18.296482Z",
+     "start_time": "2019-07-17T19:01:17.775519Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Bar chart of note counts per prolonged-stay class, each bar annotated\n",
+    "# with its count.\n",
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "sns.barplot(data=plot_df, x='prolonged_stay_label', y='n_notes', ax=ax)\n",
+    "ax.set_xlabel('5 Day Discharge Class Label')\n",
+    "ax.set_ylabel('# notes')\n",
+    "\n",
+    "for bar_pos, n_notes in plot_df['n_notes'].items():\n",
+    "    # The text's right edge sits at bar_pos + 0.05, 50 units above the count value.\n",
+    "    ax.text(bar_pos + 0.05, n_notes + 50, str(n_notes),\n",
+    "            color='black', ha='right', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-06-30T21:09:10.237355Z",
+     "start_time": "2019-06-30T21:09:10.163Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'discharge_label_bp.tif', dpi=300)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### With Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:01:34.791633Z",
+     "start_time": "2019-07-17T19:01:34.568783Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Per-class note counts (p1) and distinct hadm_id counts (p2), joined on the label,\n",
+    "# relabelled with `desc`, reordered to rows 1, 0 and renumbered from zero.\n",
+    "by_label = df.groupby(['prolonged_stay_label'])\n",
+    "p1 = by_label.size().reset_index(name='n_notes')\n",
+    "p2 = by_label['hadm_id'].nunique().reset_index()\n",
+    "p = (\n",
+    "    pd.merge(p1, p2, on=['prolonged_stay_label'])\n",
+    "    .assign(prolonged_stay_label=desc)\n",
+    "    .reindex([1, 0])\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "p"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:01:42.249351Z",
+     "start_time": "2019-07-17T19:01:42.137270Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Long format for the grouped bar chart: one row per (class, measure) pair.\n",
+    "plot_df = p.rename(columns={'hadm_id': '# Admissions', 'n_notes': '# Notes'}).melt(\n",
+    "    id_vars='prolonged_stay_label', var_name='Legend', value_name='counts'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2019-07-17T19:01:47.756030Z",
+     "start_time": "2019-07-17T19:01:47.553253Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Grouped bars: one bar per 'Legend' level for each prolonged-stay class.\n",
+    "fig, ax = plt.subplots(figsize=(10, 8))\n",
+    "\n",
+    "sns.barplot(x='prolonged_stay_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
+    "ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
+    "ax.set_xlabel('5 Day Discharge Class Label')\n",
+    "# The bars mix several measures (distinguished by the legend), so the axis\n",
+    "# carries a neutral label rather than naming only one of them.\n",
+    "ax.set_ylabel('Count')\n",
+    "\n",
+    "# plot_df is in long format: rows in the first half are annotated left of their\n",
+    "# tick, rows in the second half wrap back to the same ticks and go to the right.\n",
+    "n_classes = len(plot_df) // 2\n",
+    "for index, row in plot_df.iterrows():\n",
+    "    x_pos = index - 0.13 if index < n_classes else index % n_classes + 0.25\n",
+    "    ax.text(x_pos, row['counts'] + 50, str(row['counts']), color='black', ha='right', va='bottom')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fig.savefig(args.figdir/'discharge_label_adms_bp.tif', dpi=300)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": true,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {
+    "height": "calc(100% - 180px)",
+    "left": "10px",
+    "top": "150px",
+    "width": "165px"
+   },
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}