1423 lines (1422 with data), 41.0 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset Preparation for Prediction of Imminent ICU Admission and Prolonged Stay"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports & Inits"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-08-26T18:54:33.858025Z",
"start_time": "2019-08-26T18:54:33.683791Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-08-26T18:54:33.919510Z",
"start_time": "2019-08-26T18:54:33.887213Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'workdir': PosixPath('../data/workdir'),\n",
" 'figdir': PosixPath('../data/results/figures'),\n",
" 'resultdir': PosixPath('../data/results'),\n",
" 'dataset_csv': PosixPath('../data/proc_dataset.csv'),\n",
" 'imminent_adm_cols': ['hadm_id', 'processed_note', 'imminent_adm_label'],\n",
" 'prolonged_stay_cols': ['hadm_id', 'processed_note', 'prolonged_stay_label'],\n",
" 'cols': ['hadm_id',\n",
" 'imminent_adm_label',\n",
" 'prolonged_stay_label',\n",
" 'processed_note',\n",
" 'charttime',\n",
" 'intime',\n",
" 'chartinterval'],\n",
" 'dates': ['charttime', 'intime'],\n",
" 'ia_thresh': {'lr': 0.45, 'rf': 0.27, 'gbm': 0.435, 'mlp': 0.2},\n",
" 'ps_thresh': {'lr': 0.39, 'rf': 0.36, 'gbm': 0.324, 'mlp': 0.27}}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"sys.path.append('../')\n",
"\n",
"import math\n",
"import numpy as np\n",
"import pandas as pd\n",
"import spacy\n",
"\n",
"import seaborn as sns\n",
"sns.set(style = 'darkgrid')\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"from scipy import stats\n",
"from pathlib import Path\n",
"\n",
"from utils.splits import set_group_splits\n",
"from args import args\n",
"vars(args)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"## Stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:48:22.330446Z",
"start_time": "2019-07-17T18:48:17.056668Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Load the processed notes dataset once; both task-specific frames are derived from it.\n",
"df = pd.read_csv(args.dataset_csv)\n",
"\n",
"# Imminent-admission task: keep only the notes whose label is not -1.\n",
"ia_df = df.loc[df['imminent_adm_label'] != -1, args.imminent_adm_cols].reset_index(drop=True)\n",
"\n",
"# Prolonged-stay task: keep only the notes whose chartinterval is not 0.\n",
"# This line previously read from `ori_df`, a name that is never defined in this notebook,\n",
"# so the cell raised NameError on a fresh kernel; `df` is the frame loaded above.\n",
"ps_df = df.loc[df['chartinterval'] != 0, args.prolonged_stay_cols].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:48:22.402048Z",
"start_time": "2019-07-17T18:48:22.333813Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df['subject_id'].nunique(), df['hadm_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:48:29.519915Z",
"start_time": "2019-07-17T18:48:29.443575Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# One age per patient: the first admission_age recorded for each subject_id.\n",
"ages = df.groupby('subject_id')['admission_age'].first().to_numpy()\n",
"# Cap ages above 100 at 100 before summarising.\n",
"ages = np.minimum(ages, 100)\n",
"\n",
"# Report the median together with its IQR (the 'Median age' line used to print the mean).\n",
"q1, q3 = np.percentile(ages, [25, 75])\n",
"print(f\"Median age: {np.median(ages):0.1f}\")\n",
"print(f\"IQR: {q1:0.1f} - {q3:0.1f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:48:53.210075Z",
"start_time": "2019-07-17T18:48:53.002656Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df['adm_to_icu_period'].describe().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:21.918238Z",
"start_time": "2019-07-17T18:49:21.653850Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.groupby(df['admission_type'])['hadm_id'].nunique().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:22.411510Z",
"start_time": "2019-07-17T18:49:22.249868Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.groupby(df['ethnicity'])['subject_id'].nunique().reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"Check that the average prevalence across the random test splits is approximately the same as the prevalence in the full dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:27:25.090684Z",
"start_time": "2019-07-17T18:27:05.095345Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Label column -> task frame; each task gets its own list of per-seed test-set prevalences.\n",
"task_frames = {'imminent_adm_label': ia_df, 'prolonged_stay_label': ps_df}\n",
"test_prevalence = {label_col: [] for label_col in task_frames}\n",
"\n",
"for seed in range(127, 227):\n",
"    for label_col, task_df in task_frames.items():\n",
"        # Split a copy so the task frame itself is never modified by the splitter.\n",
"        sdf = set_group_splits(task_df.copy(), group_col='hadm_id', seed=seed)\n",
"        in_test = sdf['split'] == 'test'\n",
"        n_test = len(sdf.loc[in_test])\n",
"        n_test_pos = len(sdf.loc[in_test & (sdf[label_col] == 1)])\n",
"        test_prevalence[label_col].append(n_test_pos / n_test)\n",
"\n",
"ia_p = np.array(test_prevalence['imminent_adm_label'])\n",
"ps_p = np.array(test_prevalence['prolonged_stay_label'])\n",
"\n",
"# Prevalence of the positive class in each full task frame, for comparison with the splits.\n",
"ia_prevalence = len(ia_df.loc[ia_df['imminent_adm_label'] == 1]) / len(ia_df)\n",
"ps_prevalence = len(ps_df.loc[ps_df['prolonged_stay_label'] == 1]) / len(ps_df)\n",
"\n",
"print(f\"Prevalence of Imminent Admission: {ia_prevalence:0.3f}\")\n",
"print(f\"Average of test set = {ia_p.mean():0.3f}, std = {ia_p.std():0.3f}\")\n",
"print(f\"Prevalence of Prolonged Stay: {ps_prevalence:0.3f}\")\n",
"print(f\"Average of test set = {ps_p.mean():0.3f}, std = {ps_p.std():0.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:26.743454Z",
"start_time": "2019-07-17T18:49:26.652037Z"
},
"hidden": true
},
"outputs": [],
"source": [
"print(f\"Average number of notes per admission for imminent admission: {ia_df.groupby('hadm_id').size().mean():0.2f}\")\n",
"print(f\"Average number of notes per admission for prolonged stay (and entire dataset): {ps_df.groupby('hadm_id').size().mean():0.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:48.377622Z",
"start_time": "2019-07-17T18:49:48.020027Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Distinct patients, grouped by whether a deathtime is present (True) or missing (False).\n",
"has_deathtime = df['deathtime'].notnull()\n",
"df.groupby(has_deathtime)['subject_id'].nunique().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:49.030699Z",
"start_time": "2019-07-17T18:49:48.901279Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.groupby(df['gender'])['subject_id'].nunique().reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"Distribution of notes by category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:51.015549Z",
"start_time": "2019-07-17T18:49:50.883550Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.groupby(df['category']).size().reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"Distribution of notes by category for imminent admissions and delayed admissions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:40:35.638983Z",
"start_time": "2019-07-17T18:40:35.428118Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.loc[(df['imminent_adm_label'] == 1)].groupby('category').size().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:29.970948Z",
"start_time": "2019-07-17T18:42:29.704004Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.loc[(df['imminent_adm_label'] == 0)].groupby('category').size().reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"Distribution of notes by category for prolonged stay and short stay"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:30.199045Z",
"start_time": "2019-07-17T18:42:29.974531Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.loc[(df['prolonged_stay_label'] == 1)].groupby('category').size().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:30.334632Z",
"start_time": "2019-07-17T18:42:30.202847Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df.loc[(df['prolonged_stay_label'] == 0)].groupby('category').size().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:49:58.769470Z",
"start_time": "2019-07-17T18:49:58.658678Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df['icu_los'].describe().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:52:01.571726Z",
"start_time": "2019-07-17T18:52:01.262084Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df['note'].apply(len).describe().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:23.454748Z",
"start_time": "2019-07-17T18:42:23.043855Z"
},
"hidden": true
},
"outputs": [],
"source": [
"df['charttime_to_icu_period'].describe().reset_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plots"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-08-26T18:54:53.518650Z",
"start_time": "2019-08-26T18:54:50.602313Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['subject_id', 'hadm_id', 'icustay_id', 'admission_type', 'admittime',\n",
" 'dischtime', 'intime', 'outtime', 'charttime', 'icu_los', 'deathtime',\n",
" 'adm_to_icu_period', 'charttime_to_icu_period', 'chartinterval',\n",
" 'ethnicity', 'dob', 'gender', 'admission_age', 'category',\n",
" 'imminent_adm_label', 'prolonged_stay_label', 'note', 'processed_note'],\n",
" dtype='object')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(args.dataset_csv)\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-08-26T18:56:35.688519Z",
"start_time": "2019-08-26T18:56:35.448942Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Radiology\n",
"VEN DUP EXTEXT BIL (MAP/DVT)\n",
"[**2162-5-17**] 8:12 AM\n",
" [**Last Name (un) 1296**] DUP EXTEXT BIL (MAP/DVT) Clip # [**Clip Number (Radiology) 18833**]\n",
" Reason: eval for vein harvesting*****OR second case on [**2162-5-17**]******\n",
" Admitting Diagnosis: CORONARY ARTERY DISEASE\n",
" ______________________________________________________________________________\n",
" [**Hospital 2**] MEDICAL CONDITION:\n",
" 60 year old man pre-op for CABG\n",
" REASON FOR THIS EXAMINATION:\n",
" eval for vein harvesting*****OR second case on [**2162-5-17**]********\n",
" ______________________________________________________________________________\n",
" FINAL REPORT\n",
" HISTORY: A 60-year-old gentleman, preop for CABG. Search for conduit.\n",
"\n",
" TECHNIQUE: Venous mapping of the superficial veins in the lower extremities\n",
" was performed with [**Doctor Last Name 37**]-scale and Doppler ultrasound.\n",
"\n",
" FINDINGS: Right great saphenous vein is patent and compressible with\n",
" diameters ranging between 0.15 and 0.21 cm. The right small saphenous vein is\n",
" patent and compressible with diameters ranging between 0.14 and 0.20 cm.\n",
"\n",
" The left great saphenous vein presented with diameters ranging between 0.06\n",
" and 0.49 cm. It was not visualized below the calf. The left small saphenous\n",
" vein presented thick walled and calcified.\n",
"\n",
" COMPARISON: None available.\n",
"\n",
" IMPRESSION: Patent right great and small saphenous veins, with diameters\n",
" described above. Left great saphenous vein with small diameters below the mid\n",
" thigh and not visualized below the calf. The left small saphenous vein\n",
" presented with thick walls and calcifications.\n",
"\n",
"\n",
"\n"
]
}
],
"source": [
"print(df.iloc[0]['note'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2019-08-26T18:56:41.114437Z",
"start_time": "2019-08-26T18:56:41.086224Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Radiology \n",
" VEN DUP EXTEXT BIL ( MAP/DVT ) \n",
" [ * * 2162 - 5 - 17 * * ] 8:12 AM \n",
" [ * * Last Name ( un ) 1296 * * ] DUP EXTEXT BIL ( MAP/DVT ) Clip # [ * * Clip Number ( Radiology ) 18833 * * ] \n",
" Reason : eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * \n",
" Admitting Diagnosis : CORONARY ARTERY DISEASE \n",
" _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
" [ * * Hospital 2 * * ] MEDICAL CONDITION : \n",
" 60 year old man pre-op for CABG \n",
" REASON FOR THIS EXAMINATION : \n",
" eval for vein harvesting*****OR second case on [ * * 2162 - 5 - 17 * * ] * * * * * * * * \n",
" _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n",
" FINAL REPORT \n",
" HISTORY : A 60-year-old gentleman , preop for CABG . Search for conduit . \n",
"\n",
" TECHNIQUE : Venous mapping of the superficial veins in the lower extremities \n",
" was performed with [ * * Doctor Last Name 37**]-scale and Doppler ultrasound . \n",
"\n",
" FINDINGS : Right great saphenous vein is patent and compressible with \n",
" diameters ranging between 0.15 and 0.21 cm . The right small saphenous vein is \n",
" patent and compressible with diameters ranging between 0.14 and 0.20 cm . \n",
"\n",
" The left great saphenous vein presented with diameters ranging between 0.06 \n",
" and 0.49 cm . It was not visualized below the calf . The left small saphenous \n",
" vein presented thick walled and calcified . \n",
"\n",
" COMPARISON : None available . \n",
"\n",
" IMPRESSION : Patent right great and small saphenous veins , with diameters \n",
" described above . Left great saphenous vein with small diameters below the mid \n",
" thigh and not visualized below the calf . The left small saphenous vein \n",
" presented with thick walls and calcifications . \n",
"\n",
"\n",
"\n"
]
}
],
"source": [
"print(df.iloc[0]['processed_note'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:36.032162Z",
"start_time": "2019-07-17T18:42:36.008573Z"
}
},
"outputs": [],
"source": [
"# x-axis labels for the 16 chart-interval bins; a bin's integer value is its position in this list.\n",
"intervals = [\n",
"    '-1 ≤ t ≤ 0',\n",
"    *(f'-{day + 1} ≤ t ≤ -{day}' for day in range(1, 15)),\n",
"    't ≤ -15',\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bar Plot of Notes Over Days"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### All Notes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:38.585605Z",
"start_time": "2019-07-17T18:42:38.475792Z"
}
},
"outputs": [],
"source": [
"# Number of notes in each chart interval.\n",
"plot_df = df.groupby('chartinterval').size().reset_index(name='n_notes')\n",
"\n",
"# Look each label up from the interval value itself (the per-category table does the same)\n",
"# instead of assigning the whole list by position, which fails with a length mismatch\n",
"# whenever some interval has no notes.\n",
"plot_df['days'] = plot_df['chartinterval'].map(lambda interval: intervals[interval])\n",
"plot_df = plot_df.drop(columns='chartinterval')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:42:41.669474Z",
"start_time": "2019-07-17T18:42:40.376671Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(15, 8))\n",
"sns.barplot(x='days', y='n_notes', data=plot_df, ax=ax)\n",
"ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')\n",
"ax.set(xlabel='Time to ICU Admission (days)', ylabel='# notes')\n",
"\n",
"# Write each count just above its bar; the row label of plot_df is used as the bar position.\n",
"for bar_pos, n_notes in plot_df['n_notes'].items():\n",
"    ax.text(bar_pos, n_notes, str(n_notes), color='black', ha='center', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-27T00:38:13.014421Z",
"start_time": "2019-06-27T00:38:12.991010Z"
}
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'note_bp.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### By Category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:43:10.933200Z",
"start_time": "2019-07-17T18:43:10.805412Z"
}
},
"outputs": [],
"source": [
"def plot_intervals(ax, df, cat):\n",
"    \"\"\"Draw a bar chart of note counts per interval for one note category.\n",
"\n",
"    Args:\n",
"        ax: Matplotlib axes to draw on.\n",
"        df: Frame with a 'days' column (x-axis labels) and an 'n_notes' column (bar heights).\n",
"        cat: Category name shown in the subplot title.\n",
"    \"\"\"\n",
"    sns.barplot(x='days', y='n_notes', data=df, ax=ax)\n",
"    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')\n",
"    # Per-axes labels are blanked out.\n",
"    ax.set_xlabel('')\n",
"    ax.set_ylabel('')\n",
"    # The newline is escaped so it stays inside the f-string once the notebook JSON is decoded.\n",
"    ax.set_title(f\"Note Category: {cat}\\n# notes: {df['n_notes'].sum()}\")\n",
"\n",
"    # Annotate by bar position rather than by frame index: `df` is usually a filtered\n",
"    # slice whose index does not start at 0.\n",
"    for bar_pos, n_notes in enumerate(df['n_notes']):\n",
"        ax.text(bar_pos, n_notes, str(n_notes), color='black', ha='center', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:43:12.728434Z",
"start_time": "2019-07-17T18:43:12.610095Z"
}
},
"outputs": [],
"source": [
"# Number of notes for every (category, chart interval) pair.\n",
"plot_df = df.groupby(['category', 'chartinterval']).size().reset_index(name='n_notes')\n",
"\n",
"# Swap the integer interval for its display label.\n",
"plot_df['days'] = plot_df['chartinterval'].map(lambda interval: intervals[interval])\n",
"plot_df = plot_df.drop(columns='chartinterval')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:43:26.011143Z",
"start_time": "2019-07-17T18:43:15.024678Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"# (value matched in the 'category' column, subplot title). Some match values end with a space.\n",
"category_titles = [\n",
"    ('Case Management ', 'Case Management'),\n",
"    ('Consult', 'Consult'),\n",
"    ('General', 'General'),\n",
"    ('Nursing', 'Nursing'),\n",
"    ('Nursing/other', 'Nursing/other'),\n",
"    ('Nutrition', 'Nutrition'),\n",
"    ('Pharmacy', 'Pharmacy'),\n",
"    ('Physician ', 'Physician'),\n",
"    ('Radiology', 'Radiology'),\n",
"    ('Rehab Services', 'Rehab Services'),\n",
"    ('Respiratory ', 'Respiratory'),\n",
"    ('Social Work', 'Social Work'),\n",
"]\n",
"\n",
"fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
"\n",
"# ax.flat walks the 6x2 grid row by row, so panels are filled in the order listed above.\n",
"for panel, (category, title) in zip(ax.flat, category_titles):\n",
"    plot_intervals(panel, plot_df.loc[plot_df['category'] == category, ['n_notes', 'days']], title)\n",
"\n",
"# Shared axis captions for the whole figure.\n",
"fig.text(0.5, 0.1, 'Time to ICU Admission (days)', ha='center')\n",
"fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
"\n",
"plt.subplots_adjust(hspace=0.3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-27T00:42:13.420913Z",
"start_time": "2019-06-27T00:42:13.395654Z"
}
},
"outputs": [],
"source": [
"# cats = sorted(list(df['category'].unique()))\n",
"\n",
"# n = 0\n",
"# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
"# plot_intervals(ax, plot_df.loc[plot_df['category'] == cats[n], ['n_notes', 'days']], cats[n])\n",
"# ax.set_xlabel('Time to ICU Admission (days)')\n",
"# ax.set_ylabel('# notes')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-26T19:55:34.228896Z",
"start_time": "2019-06-26T19:55:34.204962Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'note_cats_bp.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Note Chart Time to ICU Admission Period Histogram"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### All Notes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:43:56.456943Z",
"start_time": "2019-07-17T18:43:56.330361Z"
}
},
"outputs": [],
"source": [
"plot_df = df[['category', 'charttime_to_icu_period']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:44:05.284861Z",
"start_time": "2019-07-17T18:44:03.949948Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"\n",
"sns.distplot(plot_df['charttime_to_icu_period'], kde=False, ax=ax, bins=80)\n",
"ax.set_xlabel('Period between Note Chart Time and ICU Admission Time (days)')\n",
"ax.set_ylabel('# notes')\n",
"ax.set_xlim(0, 60)\n",
"\n",
"# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['chart_icu_period'].min()}, Avg: {mdf['chart_icu_period'].mean(): 0.2f}, Max: {mdf['chart_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-26T19:55:34.712151Z",
"start_time": "2019-06-26T19:55:34.686551Z"
}
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'note_icu_period_hist.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### By Category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:44:13.676427Z",
"start_time": "2019-07-17T18:44:13.571725Z"
}
},
"outputs": [],
"source": [
"def plot_period(ax, df, cat):\n",
"    \"\"\"Draw a 10-bin count histogram of `df` on `ax`, titled with the note category `cat`.\n",
"\n",
"    The x and y axis labels are set to empty strings.\n",
"    \"\"\"\n",
"    sns.distplot(df, kde=False, bins=10, ax=ax)\n",
"    ax.set(xlabel='', ylabel='', title=f\"Note Category: {cat}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:45:21.689010Z",
"start_time": "2019-07-17T18:45:12.353337Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"# (value matched in the 'category' column, subplot title). Some match values end with a space.\n",
"category_titles = [\n",
"    ('Case Management ', 'Case Management'),\n",
"    ('Consult', 'Consult'),\n",
"    ('General', 'General'),\n",
"    ('Nursing', 'Nursing'),\n",
"    ('Nursing/other', 'Nursing/other'),\n",
"    ('Nutrition', 'Nutrition'),\n",
"    ('Pharmacy', 'Pharmacy'),\n",
"    ('Physician ', 'Physician'),\n",
"    ('Radiology', 'Radiology'),\n",
"    ('Rehab Services', 'Rehab Services'),\n",
"    ('Respiratory ', 'Respiratory'),\n",
"    ('Social Work', 'Social Work'),\n",
"]\n",
"\n",
"fig, ax = plt.subplots(6, 2, figsize=(20, 50))\n",
"\n",
"# ax.flat walks the 6x2 grid row by row, so panels are filled in the order listed above.\n",
"for panel, (category, title) in zip(ax.flat, category_titles):\n",
"    plot_period(panel, plot_df.loc[plot_df['category'] == category, ['charttime_to_icu_period']], title)\n",
"\n",
"# Shared axis captions for the whole figure.\n",
"fig.text(0.5, 0.11, 'Period between Note Chart Time and ICU Admission Time (days)', ha='center')\n",
"fig.text(0.08, 0.5, '# notes', va='center', rotation='vertical')\n",
"\n",
"plt.subplots_adjust(hspace=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-27T00:43:24.745337Z",
"start_time": "2019-06-27T00:43:24.720208Z"
}
},
"outputs": [],
"source": [
"# cats = sorted(list(df['category'].unique()))\n",
"\n",
"# n = 0\n",
"# fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n",
"# plot_period(ax, plot_df.loc[plot_df['category'] == cats[n], ['chart_icu_period']], cats[n])\n",
"# ax.set_xlabel('Time to ICU Admission (days)')\n",
"# ax.set_ylabel('# notes')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-26T19:35:38.476961Z",
"start_time": "2019-06-26T19:35:38.451886Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'note_cat_icu_period_hist.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Hospital Admission to ICU Admission Period Histogram"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:45:44.547021Z",
"start_time": "2019-07-17T18:45:44.519812Z"
}
},
"outputs": [],
"source": [
"plot_df = df[['adm_to_icu_period']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:45:46.580796Z",
"start_time": "2019-07-17T18:45:45.217784Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"\n",
"sns.distplot(plot_df, kde=False, ax=ax, bins=80)\n",
"ax.set_xlabel('Time between hospital admission and ICU admission (days)')\n",
"ax.set_ylabel('# notes')\n",
"ax.set_xlim(0, 70)\n",
"# ax.text(ax.get_xlim()[1]*0.50, ax.get_ylim()[1]*0.80, f\"Min: {mdf['adm_icu_period'].min()}, Avg: {mdf['adm_icu_period'].mean(): 0.2f}, Max: {mdf['adm_icu_period'].max()}\", fontweight='bold', fontsize=15, ha='center', va='bottom') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'adm_icu_period_hist.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Note Length Histogram"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:45:50.860829Z",
"start_time": "2019-07-17T18:45:49.137114Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"sns.distplot(df['note'].apply(len), kde=False, ax=ax, bins=100)\n",
"ax.set_xlabel('Length of Note (characters)')\n",
"ax.set_ylabel('# notes')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-26T19:59:38.291139Z",
"start_time": "2019-06-26T19:59:38.267860Z"
}
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'note_len_hist.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Imminent ICU Prediction Class Distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:53:53.526861Z",
"start_time": "2019-07-17T18:53:53.429558Z"
}
},
"outputs": [],
"source": [
"desc = ['Unused', 'Delayed Admission', 'Imminent Admission']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Without Admissions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:53:54.551036Z",
"start_time": "2019-07-17T18:53:54.423540Z"
}
},
"outputs": [],
"source": [
"# Notes per imminent-admission label; groupby returns the labels in sorted order and\n",
"# `desc` supplies their display names in that same order.\n",
"plot_df = df.groupby('imminent_adm_label').size().reset_index(name='n_notes')\n",
"plot_df['imminent_adm_label'] = desc\n",
"\n",
"# Reverse the three rows for plotting and renumber them from 0.\n",
"plot_df = plot_df.iloc[::-1].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:53:55.840770Z",
"start_time": "2019-07-17T18:53:54.913513Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"sns.barplot(x='imminent_adm_label', y='n_notes', data=plot_df, ax=ax)\n",
"ax.set(xlabel='Imminent Class Label', ylabel='# notes')\n",
"\n",
"# Print each count slightly above its bar; the row label of plot_df is the bar position.\n",
"for bar_pos, n_notes in plot_df['n_notes'].items():\n",
"    ax.text(bar_pos + 0.05, n_notes + 50, str(n_notes), color='black', ha='right', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-27T01:07:18.818779Z",
"start_time": "2019-06-27T01:07:18.795768Z"
}
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'imminent_label_bp.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### With Admissions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:54:20.657113Z",
"start_time": "2019-07-17T18:54:20.298763Z"
}
},
"outputs": [],
"source": [
"# Per imminent-admission label: the number of notes and the number of distinct admissions.\n",
"label_groups = df.groupby('imminent_adm_label')\n",
"p1 = label_groups.size().reset_index(name='n_notes')\n",
"p2 = label_groups['hadm_id'].nunique().reset_index()\n",
"p = pd.merge(p1, p2, on='imminent_adm_label')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:54:20.757964Z",
"start_time": "2019-07-17T18:54:20.660979Z"
}
},
"outputs": [],
"source": [
"p['imminent_adm_label'] = desc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:54:21.287840Z",
"start_time": "2019-07-17T18:54:21.204792Z"
}
},
"outputs": [],
"source": [
"# Reverse the three rows and renumber them from 0. Re-running this cell flips the order back.\n",
"p = p.iloc[::-1].reset_index(drop=True)\n",
"p"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:54:29.367296Z",
"start_time": "2019-07-17T18:54:29.263198Z"
}
},
"outputs": [],
"source": [
"# Long format for a grouped bar plot: one row per (label, legend entry) with its count.\n",
"plot_df = (\n",
"    p.rename(columns={'hadm_id': '# Admissions', 'n_notes': '# Notes'})\n",
"     .melt(id_vars='imminent_adm_label', var_name='Legend', value_name='counts')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T18:54:35.592328Z",
"start_time": "2019-07-17T18:54:34.576044Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"\n",
"sns.barplot(x='imminent_adm_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
"ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
"ax.set(xlabel='Imminent Class Label', ylabel='# notes')\n",
"\n",
"# Rows in the first half of plot_df are annotated left of the tick and rows in the second\n",
"# half right of it; the row number modulo the half-length gives the tick position.\n",
"n_groups = len(plot_df) // 2\n",
"for row_pos, count in plot_df['counts'].items():\n",
"    x = row_pos - 0.13 if row_pos < n_groups else row_pos % n_groups + 0.25\n",
"    ax.text(x, count + 50, str(count), color='black', ha='right', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'imminent_label_adms_bp.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prolonged Stay Class Distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:00:53.843117Z",
"start_time": "2019-07-17T19:00:53.541066Z"
}
},
"outputs": [],
"source": [
"desc = ['Short Stay', 'Prolonged Stay']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Without Admissions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:01:08.738416Z",
"start_time": "2019-07-17T19:01:08.586921Z"
}
},
"outputs": [],
"source": [
"# Notes per prolonged-stay label; groupby returns the labels in sorted order and\n",
"# `desc` supplies their display names in that same order.\n",
"plot_df = df.groupby('prolonged_stay_label').size().reset_index(name='n_notes')\n",
"plot_df['prolonged_stay_label'] = desc\n",
"\n",
"# Reverse the two rows for plotting and renumber them from 0.\n",
"plot_df = plot_df.iloc[::-1].reset_index(drop=True)\n",
"plot_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:01:18.296482Z",
"start_time": "2019-07-17T19:01:17.775519Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"sns.barplot(x='prolonged_stay_label', y='n_notes', data=plot_df, ax=ax)\n",
"ax.set(xlabel='5 Day Discharge Class Label', ylabel='# notes')\n",
"\n",
"# Print each count slightly above its bar; the row label of plot_df is the bar position.\n",
"for bar_pos, n_notes in plot_df['n_notes'].items():\n",
"    ax.text(bar_pos + 0.05, n_notes + 50, str(n_notes), color='black', ha='right', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-30T21:09:10.237355Z",
"start_time": "2019-06-30T21:09:10.163Z"
}
},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'discharge_label_bp.tif', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### With Admissions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:01:34.791633Z",
"start_time": "2019-07-17T19:01:34.568783Z"
}
},
"outputs": [],
"source": [
"# Per prolonged-stay label: the number of notes and the number of distinct admissions.\n",
"label_groups = df.groupby('prolonged_stay_label')\n",
"p1 = label_groups.size().reset_index(name='n_notes')\n",
"p2 = label_groups['hadm_id'].nunique().reset_index()\n",
"p = pd.merge(p1, p2, on='prolonged_stay_label')\n",
"\n",
"# Replace the labels with their display names, then reverse the two rows and renumber from 0.\n",
"p['prolonged_stay_label'] = desc\n",
"p = p.iloc[::-1].reset_index(drop=True)\n",
"p"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:01:42.249351Z",
"start_time": "2019-07-17T19:01:42.137270Z"
}
},
"outputs": [],
"source": [
"# Long format for a grouped bar plot: one row per (label, legend entry) with its count.\n",
"plot_df = (\n",
"    p.rename(columns={'hadm_id': '# Admissions', 'n_notes': '# Notes'})\n",
"     .melt(id_vars='prolonged_stay_label', var_name='Legend', value_name='counts')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-17T19:01:47.756030Z",
"start_time": "2019-07-17T19:01:47.553253Z"
}
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10, 8))\n",
"\n",
"sns.barplot(x='prolonged_stay_label', y='counts', hue='Legend', data=plot_df, ax=ax)\n",
"ax.set_xticklabels(ax.get_xticklabels(), ha='right')\n",
"ax.set(xlabel='5 Day Discharge Class Label', ylabel='# notes')\n",
"\n",
"# Rows in the first half of plot_df are annotated left of the tick and rows in the second\n",
"# half right of it; the row number modulo the half-length gives the tick position.\n",
"n_groups = len(plot_df) // 2\n",
"for row_pos, count in plot_df['counts'].items():\n",
"    x = row_pos - 0.13 if row_pos < n_groups else row_pos % n_groups + 0.25\n",
"    ax.text(x, count + 50, str(count), color='black', ha='right', va='bottom')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# fig.savefig(args.figdir/'discharge_label_adms_bp.tif', dpi=300)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "165px"
},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}