1654 lines (1653 with data), 69.2 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading the data and required libraries"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import os\n",
"import re\n",
"import string\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from nltk import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Data directory. Override with the MIMIC_DATA_DIR environment variable; the\n",
"# default is the original local path. os.path.join(..., \"\") guarantees the\n",
"# trailing separator that the `DIR + filename` concatenations rely on.\n",
"DIR = os.path.join(os.environ.get(\"MIMIC_DATA_DIR\", \"E:/Coding/Summer 2023/data/\"), \"\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"notes_df = pd.read_csv(DIR + \"NOTEEVENTS.csv\", low_memory=False, memory_map=True)\n",
"admissions_df = pd.read_csv(DIR + \"ADMISSIONS.csv\", low_memory=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Exploration\n",
"## Admissions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ROW_ID</th>\n",
" <th>SUBJECT_ID</th>\n",
" <th>HADM_ID</th>\n",
" <th>ADMITTIME</th>\n",
" <th>DISCHTIME</th>\n",
" <th>DEATHTIME</th>\n",
" <th>ADMISSION_TYPE</th>\n",
" <th>ADMISSION_LOCATION</th>\n",
" <th>DISCHARGE_LOCATION</th>\n",
" <th>INSURANCE</th>\n",
" <th>LANGUAGE</th>\n",
" <th>RELIGION</th>\n",
" <th>MARITAL_STATUS</th>\n",
" <th>ETHNICITY</th>\n",
" <th>EDREGTIME</th>\n",
" <th>EDOUTTIME</th>\n",
" <th>DIAGNOSIS</th>\n",
" <th>HOSPITAL_EXPIRE_FLAG</th>\n",
" <th>HAS_CHARTEVENTS_DATA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>165315</td>\n",
" <td>2196-04-09 12:26:00</td>\n",
" <td>2196-04-10 15:54:00</td>\n",
" <td>NaN</td>\n",
" <td>EMERGENCY</td>\n",
" <td>EMERGENCY ROOM ADMIT</td>\n",
" <td>DISC-TRAN CANCER/CHLDRN H</td>\n",
" <td>Private</td>\n",
" <td>NaN</td>\n",
" <td>UNOBTAINABLE</td>\n",
" <td>MARRIED</td>\n",
" <td>WHITE</td>\n",
" <td>2196-04-09 10:06:00</td>\n",
" <td>2196-04-09 13:24:00</td>\n",
" <td>BENZODIAZEPINE OVERDOSE</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>152223</td>\n",
" <td>2153-09-03 07:15:00</td>\n",
" <td>2153-09-08 19:10:00</td>\n",
" <td>NaN</td>\n",
" <td>ELECTIVE</td>\n",
" <td>PHYS REFERRAL/NORMAL DELI</td>\n",
" <td>HOME HEALTH CARE</td>\n",
" <td>Medicare</td>\n",
" <td>NaN</td>\n",
" <td>CATHOLIC</td>\n",
" <td>MARRIED</td>\n",
" <td>WHITE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>23</td>\n",
" <td>23</td>\n",
" <td>124321</td>\n",
" <td>2157-10-18 19:34:00</td>\n",
" <td>2157-10-25 14:00:00</td>\n",
" <td>NaN</td>\n",
" <td>EMERGENCY</td>\n",
" <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
" <td>HOME HEALTH CARE</td>\n",
" <td>Medicare</td>\n",
" <td>ENGL</td>\n",
" <td>CATHOLIC</td>\n",
" <td>MARRIED</td>\n",
" <td>WHITE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>BRAIN MASS</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>24</td>\n",
" <td>24</td>\n",
" <td>161859</td>\n",
" <td>2139-06-06 16:14:00</td>\n",
" <td>2139-06-09 12:48:00</td>\n",
" <td>NaN</td>\n",
" <td>EMERGENCY</td>\n",
" <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
" <td>HOME</td>\n",
" <td>Private</td>\n",
" <td>NaN</td>\n",
" <td>PROTESTANT QUAKER</td>\n",
" <td>SINGLE</td>\n",
" <td>WHITE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>INTERIOR MYOCARDIAL INFARCTION</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>25</td>\n",
" <td>25</td>\n",
" <td>129635</td>\n",
" <td>2160-11-02 02:06:00</td>\n",
" <td>2160-11-05 14:55:00</td>\n",
" <td>NaN</td>\n",
" <td>EMERGENCY</td>\n",
" <td>EMERGENCY ROOM ADMIT</td>\n",
" <td>HOME</td>\n",
" <td>Private</td>\n",
" <td>NaN</td>\n",
" <td>UNOBTAINABLE</td>\n",
" <td>MARRIED</td>\n",
" <td>WHITE</td>\n",
" <td>2160-11-02 01:01:00</td>\n",
" <td>2160-11-02 04:27:00</td>\n",
" <td>ACUTE CORONARY SYNDROME</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ROW_ID SUBJECT_ID HADM_ID ADMITTIME DISCHTIME \\\n",
"0 21 22 165315 2196-04-09 12:26:00 2196-04-10 15:54:00 \n",
"1 22 23 152223 2153-09-03 07:15:00 2153-09-08 19:10:00 \n",
"2 23 23 124321 2157-10-18 19:34:00 2157-10-25 14:00:00 \n",
"3 24 24 161859 2139-06-06 16:14:00 2139-06-09 12:48:00 \n",
"4 25 25 129635 2160-11-02 02:06:00 2160-11-05 14:55:00 \n",
"\n",
" DEATHTIME ADMISSION_TYPE ADMISSION_LOCATION \\\n",
"0 NaN EMERGENCY EMERGENCY ROOM ADMIT \n",
"1 NaN ELECTIVE PHYS REFERRAL/NORMAL DELI \n",
"2 NaN EMERGENCY TRANSFER FROM HOSP/EXTRAM \n",
"3 NaN EMERGENCY TRANSFER FROM HOSP/EXTRAM \n",
"4 NaN EMERGENCY EMERGENCY ROOM ADMIT \n",
"\n",
" DISCHARGE_LOCATION INSURANCE LANGUAGE RELIGION \\\n",
"0 DISC-TRAN CANCER/CHLDRN H Private NaN UNOBTAINABLE \n",
"1 HOME HEALTH CARE Medicare NaN CATHOLIC \n",
"2 HOME HEALTH CARE Medicare ENGL CATHOLIC \n",
"3 HOME Private NaN PROTESTANT QUAKER \n",
"4 HOME Private NaN UNOBTAINABLE \n",
"\n",
" MARITAL_STATUS ETHNICITY EDREGTIME EDOUTTIME \\\n",
"0 MARRIED WHITE 2196-04-09 10:06:00 2196-04-09 13:24:00 \n",
"1 MARRIED WHITE NaN NaN \n",
"2 MARRIED WHITE NaN NaN \n",
"3 SINGLE WHITE NaN NaN \n",
"4 MARRIED WHITE 2160-11-02 01:01:00 2160-11-02 04:27:00 \n",
"\n",
" DIAGNOSIS HOSPITAL_EXPIRE_FLAG \\\n",
"0 BENZODIAZEPINE OVERDOSE 0 \n",
"1 CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS... 0 \n",
"2 BRAIN MASS 0 \n",
"3 INTERIOR MYOCARDIAL INFARCTION 0 \n",
"4 ACUTE CORONARY SYNDROME 0 \n",
"\n",
" HAS_CHARTEVENTS_DATA \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"admissions_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(58976, 19)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"admissions_df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Types of admissions"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['EMERGENCY', 'ELECTIVE', 'NEWBORN', 'URGENT'], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"admissions_df['ADMISSION_TYPE'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check for missing values on the admission times"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(admissions_df['ADMITTIME'].isnull())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Conversion of times to datetime type"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Convert the three timestamp columns to datetime64[ns].\n",
"for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):\n",
"    admissions_df[time_col] = admissions_df[time_col].astype('datetime64[ns]')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sort by subject and admission time, then reset the data frame index."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Order each subject's admissions by admission time, then renumber rows from 0.\n",
"admissions_df = (\n",
"    admissions_df\n",
"    .sort_values(['SUBJECT_ID', 'ADMITTIME'])\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# For every admission, look one row ahead within the same subject.\n",
"by_subject = admissions_df.groupby('SUBJECT_ID')\n",
"admissions_df['NEXT_ADMITTIME'] = by_subject['ADMITTIME'].shift(-1)\n",
"admissions_df['NEXT_ADMISSION_TYPE'] = by_subject['ADMISSION_TYPE'].shift(-1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Discard the look-ahead values wherever the next admission is ELECTIVE.\n",
"# The mask is computed once and reused for both columns.\n",
"# np.nan is used because the np.NaN alias was removed in NumPy 2.0.\n",
"next_is_elective = admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE'\n",
"admissions_df.loc[next_is_elective, 'NEXT_ADMITTIME'] = pd.NaT\n",
"admissions_df.loc[next_is_elective, 'NEXT_ADMISSION_TYPE'] = np.nan"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Back-fill missing values with the next valid value within each subject (rows were sorted by admission time above)."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Within each subject, back-fill missing NEXT_* values from the following rows.\n",
"# GroupBy.bfill() replaces fillna(method='bfill'), which is deprecated in\n",
"# pandas 2.x.\n",
"admissions_df[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admissions_df.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].bfill()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Obtain days to readmission: from discharge to next readmission"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"admissions_df['DAYS_TO_READMISSION'] = (admissions_df['NEXT_ADMITTIME'] - admissions_df['DISCHTIME']).dt.days"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of readmissions"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11399"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(admissions_df['DAYS_TO_READMISSION'].notnull())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Distribution of days to readmission"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 700x350 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.set(rc={'figure.figsize':(7,3.5), 'axes.grid':True})\n",
"sns.set_style(\"whitegrid\", {'axes.grid' : False})\n",
"# histplot returns a matplotlib Axes: keep it under its own name and label it\n",
"# so the figure can be read on its own; `fig` is the enclosing Figure.\n",
"ax = sns.histplot(admissions_df['DAYS_TO_READMISSION'], kde=False, bins=15)\n",
"ax.set(title='Days from discharge to next admission', xlabel='Days to readmission', ylabel='Number of admissions')\n",
"fig = ax.get_figure()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"120.0\n",
"507.0\n",
"23.0\n",
"408.8103342398456\n"
]
},
{
"data": {
"text/plain": [
"4107.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print the median, upper quartile, lower quartile and mean of the readmission\n",
"# gap; the maximum is the cell's displayed value.\n",
"days_to_readmission = admissions_df['DAYS_TO_READMISSION']\n",
"for q in (0.5, 0.75, 0.25):\n",
"    print(days_to_readmission.quantile(q))\n",
"print(days_to_readmission.mean())\n",
"days_to_readmission.max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Notes"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ROW_ID</th>\n",
" <th>SUBJECT_ID</th>\n",
" <th>HADM_ID</th>\n",
" <th>CHARTDATE</th>\n",
" <th>CHARTTIME</th>\n",
" <th>STORETIME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION</th>\n",
" <th>CGID</th>\n",
" <th>ISERROR</th>\n",
" <th>TEXT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>174</td>\n",
" <td>22532</td>\n",
" <td>167853.0</td>\n",
" <td>2151-08-04</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Discharge summary</td>\n",
" <td>Report</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Admission Date: [**2151-7-16**] Dischar...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>175</td>\n",
" <td>13702</td>\n",
" <td>107527.0</td>\n",
" <td>2118-06-14</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Discharge summary</td>\n",
" <td>Report</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Admission Date: [**2118-6-2**] Discharg...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>176</td>\n",
" <td>13702</td>\n",
" <td>167118.0</td>\n",
" <td>2119-05-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Discharge summary</td>\n",
" <td>Report</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Admission Date: [**2119-5-4**] D...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>177</td>\n",
" <td>13702</td>\n",
" <td>196489.0</td>\n",
" <td>2124-08-18</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Discharge summary</td>\n",
" <td>Report</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Admission Date: [**2124-7-21**] ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>178</td>\n",
" <td>26880</td>\n",
" <td>135453.0</td>\n",
" <td>2162-03-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Discharge summary</td>\n",
" <td>Report</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Admission Date: [**2162-3-3**] D...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ROW_ID SUBJECT_ID HADM_ID CHARTDATE CHARTTIME STORETIME \\\n",
"0 174 22532 167853.0 2151-08-04 NaN NaN \n",
"1 175 13702 107527.0 2118-06-14 NaN NaN \n",
"2 176 13702 167118.0 2119-05-25 NaN NaN \n",
"3 177 13702 196489.0 2124-08-18 NaN NaN \n",
"4 178 26880 135453.0 2162-03-25 NaN NaN \n",
"\n",
" CATEGORY DESCRIPTION CGID ISERROR \\\n",
"0 Discharge summary Report NaN NaN \n",
"1 Discharge summary Report NaN NaN \n",
"2 Discharge summary Report NaN NaN \n",
"3 Discharge summary Report NaN NaN \n",
"4 Discharge summary Report NaN NaN \n",
"\n",
" TEXT \n",
"0 Admission Date: [**2151-7-16**] Dischar... \n",
"1 Admission Date: [**2118-6-2**] Discharg... \n",
"2 Admission Date: [**2119-5-4**] D... \n",
"3 Admission Date: [**2124-7-21**] ... \n",
"4 Admission Date: [**2162-3-3**] D... "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"notes_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2083180, 11)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"notes_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Discharge summary', 'Echo', 'ECG', 'Nursing', 'Physician ',\n",
" 'Rehab Services', 'Case Management ', 'Respiratory ', 'Nutrition',\n",
" 'General', 'Social Work', 'Pharmacy', 'Consult', 'Radiology',\n",
" 'Nursing/other'], dtype=object)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"notes_df['CATEGORY'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(59652, 11)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"discharge_notes = notes_df[notes_df['CATEGORY'] == \"Discharge summary\"]\n",
"discharge_notes.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"6926 discharge notes repeat an `HADM_ID` that already has a note, so some admissions have more than one discharge note (HADM_ID is the admission ID)."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6926"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"discharge_notes.duplicated(['HADM_ID']).sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Take the last row per admission"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"discharge_notes_ordered = discharge_notes.groupby(['SUBJECT_ID', 'HADM_ID']).nth(-1).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"discharge_notes_ordered.duplicated(['HADM_ID']).sum()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(52726, 11)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"discharge_notes_ordered.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge Notes and Admissions"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Attach each admission's discharge-note text; how='left' keeps admissions\n",
"# that have no matching note (their TEXT is NaN).\n",
"admission_cols = ['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_TO_READMISSION','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']\n",
"note_cols = ['SUBJECT_ID', 'HADM_ID', 'TEXT']\n",
"admissions_notes = admissions_df[admission_cols].merge(\n",
"    discharge_notes_ordered[note_cols],\n",
"    how='left',\n",
"    on=['SUBJECT_ID', 'HADM_ID'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SUBJECT_ID</th>\n",
" <th>HADM_ID</th>\n",
" <th>ADMITTIME</th>\n",
" <th>DISCHTIME</th>\n",
" <th>DAYS_TO_READMISSION</th>\n",
" <th>NEXT_ADMITTIME</th>\n",
" <th>ADMISSION_TYPE</th>\n",
" <th>DEATHTIME</th>\n",
" <th>TEXT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>163353</td>\n",
" <td>2138-07-17 19:04:00</td>\n",
" <td>2138-07-21 15:48:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NEWBORN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>145834</td>\n",
" <td>2101-10-20 19:08:00</td>\n",
" <td>2101-10-31 13:58:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2101-10-20**] Discharg...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>185777</td>\n",
" <td>2191-03-16 00:28:00</td>\n",
" <td>2191-03-23 18:41:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2191-3-16**] Discharge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5</td>\n",
" <td>178980</td>\n",
" <td>2103-02-02 04:31:00</td>\n",
" <td>2103-02-04 12:15:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NEWBORN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>107064</td>\n",
" <td>2175-05-30 07:15:00</td>\n",
" <td>2175-06-15 16:00:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2175-5-30**] Dischar...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58971</th>\n",
" <td>99985</td>\n",
" <td>176670</td>\n",
" <td>2181-01-27 02:47:00</td>\n",
" <td>2181-02-12 17:05:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2181-1-27**] ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58972</th>\n",
" <td>99991</td>\n",
" <td>151118</td>\n",
" <td>2184-12-24 08:30:00</td>\n",
" <td>2185-01-05 12:15:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2184-12-24**] ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58973</th>\n",
" <td>99992</td>\n",
" <td>197084</td>\n",
" <td>2144-07-25 18:03:00</td>\n",
" <td>2144-07-28 17:56:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2144-7-25**] ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58974</th>\n",
" <td>99995</td>\n",
" <td>137810</td>\n",
" <td>2147-02-08 08:00:00</td>\n",
" <td>2147-02-11 13:15:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2147-2-8**] D...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58975</th>\n",
" <td>99999</td>\n",
" <td>113369</td>\n",
" <td>2117-12-30 07:15:00</td>\n",
" <td>2118-01-04 16:30:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2117-12-30**] ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>58976 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" SUBJECT_ID HADM_ID ADMITTIME DISCHTIME \\\n",
"0 2 163353 2138-07-17 19:04:00 2138-07-21 15:48:00 \n",
"1 3 145834 2101-10-20 19:08:00 2101-10-31 13:58:00 \n",
"2 4 185777 2191-03-16 00:28:00 2191-03-23 18:41:00 \n",
"3 5 178980 2103-02-02 04:31:00 2103-02-04 12:15:00 \n",
"4 6 107064 2175-05-30 07:15:00 2175-06-15 16:00:00 \n",
"... ... ... ... ... \n",
"58971 99985 176670 2181-01-27 02:47:00 2181-02-12 17:05:00 \n",
"58972 99991 151118 2184-12-24 08:30:00 2185-01-05 12:15:00 \n",
"58973 99992 197084 2144-07-25 18:03:00 2144-07-28 17:56:00 \n",
"58974 99995 137810 2147-02-08 08:00:00 2147-02-11 13:15:00 \n",
"58975 99999 113369 2117-12-30 07:15:00 2118-01-04 16:30:00 \n",
"\n",
" DAYS_TO_READMISSION NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME \\\n",
"0 NaN NaT NEWBORN NaT \n",
"1 NaN NaT EMERGENCY NaT \n",
"2 NaN NaT EMERGENCY NaT \n",
"3 NaN NaT NEWBORN NaT \n",
"4 NaN NaT ELECTIVE NaT \n",
"... ... ... ... ... \n",
"58971 NaN NaT EMERGENCY NaT \n",
"58972 NaN NaT ELECTIVE NaT \n",
"58973 NaN NaT EMERGENCY NaT \n",
"58974 NaN NaT ELECTIVE NaT \n",
"58975 NaN NaT ELECTIVE NaT \n",
"\n",
" TEXT \n",
"0 NaN \n",
"1 Admission Date: [**2101-10-20**] Discharg... \n",
"2 Admission Date: [**2191-3-16**] Discharge... \n",
"3 NaN \n",
"4 Admission Date: [**2175-5-30**] Dischar... \n",
"... ... \n",
"58971 Admission Date: [**2181-1-27**] ... \n",
"58972 Admission Date: [**2184-12-24**] ... \n",
"58973 Admission Date: [**2144-7-25**] ... \n",
"58974 Admission Date: [**2147-2-8**] D... \n",
"58975 Admission Date: [**2117-12-30**] ... \n",
"\n",
"[58976 rows x 9 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"admissions_notes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"About 10.6% of admissions have no discharge note."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.1059753119913185"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(admissions_notes['TEXT'].isnull()) / len(admissions_notes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"About 54% of NEWBORN admissions have no discharge note."
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ADMISSION_TYPE\n",
"ELECTIVE 0.048663\n",
"EMERGENCY 0.037983\n",
"NEWBORN 0.536691\n",
"URGENT 0.042665\n",
"dtype: float64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of admissions with no discharge note, per admission type: the mean of\n",
"# the boolean null-mask equals (missing count) / (group size). A single grouped\n",
"# mean avoids groupby.apply with a Python lambda and the second groupby.\n",
"admissions_notes['TEXT'].isnull().groupby(admissions_notes['ADMISSION_TYPE']).mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove NEWBORN admissions and create the target variable"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Drop NEWBORN admissions, then flag rows whose next admission comes fewer than\n",
"# 30 days after discharge (a missing DAYS_TO_READMISSION compares False -> 0).\n",
"is_newborn = admissions_notes['ADMISSION_TYPE'] == 'NEWBORN'\n",
"adm_notes = admissions_notes[~is_newborn].copy()\n",
"adm_notes['READM_WITHIN_30'] = adm_notes['DAYS_TO_READMISSION'].lt(30).astype('int')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3004"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(adm_notes['READM_WITHIN_30'])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(51113, 10)"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adm_notes.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exclude patients who died during the admission"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Keep only admissions with no recorded death time, then shuffle the rows.\n",
"# random_state pins the shuffle so Restart & Run All reproduces the same order.\n",
"rows_not_death = adm_notes['DEATHTIME'].isnull()\n",
"df_adm_notes_not_death = adm_notes.loc[rows_not_death].copy()\n",
"df_adm_notes_not_death = df_adm_notes_not_death.sample(frac = 1, random_state = 42)\n",
"df_adm_notes_not_death = df_adm_notes_not_death.reset_index(drop = True)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2963"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(df_adm_notes_not_death['READM_WITHIN_30'])"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"45321"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df_adm_notes_not_death['READM_WITHIN_30'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Creation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exclude patients who died during the admission"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Keep only admissions with no recorded death time, then shuffle the rows.\n",
"# random_state pins the shuffle so the exported readmission.csv has the same\n",
"# row order on every run.\n",
"adm_notes = adm_notes.loc[adm_notes['DEATHTIME'].isnull()]\n",
"adm_notes = adm_notes.sample(frac = 1, random_state = 42)\n",
"adm_notes = adm_notes.reset_index(drop = True)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SUBJECT_ID</th>\n",
" <th>HADM_ID</th>\n",
" <th>ADMITTIME</th>\n",
" <th>DISCHTIME</th>\n",
" <th>DAYS_TO_READMISSION</th>\n",
" <th>NEXT_ADMITTIME</th>\n",
" <th>ADMISSION_TYPE</th>\n",
" <th>DEATHTIME</th>\n",
" <th>TEXT</th>\n",
" <th>READM_WITHIN_30</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6090</td>\n",
" <td>175043</td>\n",
" <td>2170-05-03 07:15:00</td>\n",
" <td>2170-05-06 13:40:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2170-5-3**] D...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>27901</td>\n",
" <td>189210</td>\n",
" <td>2101-06-21 07:15:00</td>\n",
" <td>2101-07-13 15:00:00</td>\n",
" <td>50.0</td>\n",
" <td>2101-09-01 20:44:00</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2101-6-21**] ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>69531</td>\n",
" <td>102759</td>\n",
" <td>2156-08-18 23:41:00</td>\n",
" <td>2156-08-26 16:45:00</td>\n",
" <td>8.0</td>\n",
" <td>2156-09-03 21:11:00</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2156-8-18**] ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8620</td>\n",
" <td>148993</td>\n",
" <td>2190-02-05 17:13:00</td>\n",
" <td>2190-02-09 17:53:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2190-2-5**] Discharge ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>24226</td>\n",
" <td>118785</td>\n",
" <td>2113-04-04 07:30:00</td>\n",
" <td>2113-04-14 11:20:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Name: [**Known lastname 10030**],[**Known fir...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45316</th>\n",
" <td>17882</td>\n",
" <td>157780</td>\n",
" <td>2146-05-01 16:33:00</td>\n",
" <td>2146-05-09 16:20:00</td>\n",
" <td>412.0</td>\n",
" <td>2147-06-25 17:42:00</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2146-5-1**] D...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45317</th>\n",
" <td>2184</td>\n",
" <td>171742</td>\n",
" <td>2154-04-21 19:25:00</td>\n",
" <td>2154-04-25 11:49:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2154-4-21**] Discharge...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45318</th>\n",
" <td>28240</td>\n",
" <td>151747</td>\n",
" <td>2195-06-21 07:27:00</td>\n",
" <td>2195-06-26 14:33:00</td>\n",
" <td>154.0</td>\n",
" <td>2195-11-28 00:15:00</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2195-6-21**] ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45319</th>\n",
" <td>25201</td>\n",
" <td>124241</td>\n",
" <td>2149-06-02 08:00:00</td>\n",
" <td>2149-06-11 13:00:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>ELECTIVE</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2149-6-2**] D...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45320</th>\n",
" <td>20855</td>\n",
" <td>108604</td>\n",
" <td>2161-07-31 17:00:00</td>\n",
" <td>2161-08-20 16:00:00</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>EMERGENCY</td>\n",
" <td>NaT</td>\n",
" <td>Admission Date: [**2161-7-31**] Dischar...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>45321 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" SUBJECT_ID HADM_ID ADMITTIME DISCHTIME \\\n",
"0 6090 175043 2170-05-03 07:15:00 2170-05-06 13:40:00 \n",
"1 27901 189210 2101-06-21 07:15:00 2101-07-13 15:00:00 \n",
"2 69531 102759 2156-08-18 23:41:00 2156-08-26 16:45:00 \n",
"3 8620 148993 2190-02-05 17:13:00 2190-02-09 17:53:00 \n",
"4 24226 118785 2113-04-04 07:30:00 2113-04-14 11:20:00 \n",
"... ... ... ... ... \n",
"45316 17882 157780 2146-05-01 16:33:00 2146-05-09 16:20:00 \n",
"45317 2184 171742 2154-04-21 19:25:00 2154-04-25 11:49:00 \n",
"45318 28240 151747 2195-06-21 07:27:00 2195-06-26 14:33:00 \n",
"45319 25201 124241 2149-06-02 08:00:00 2149-06-11 13:00:00 \n",
"45320 20855 108604 2161-07-31 17:00:00 2161-08-20 16:00:00 \n",
"\n",
" DAYS_TO_READMISSION NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME \\\n",
"0 NaN NaT ELECTIVE NaT \n",
"1 50.0 2101-09-01 20:44:00 ELECTIVE NaT \n",
"2 8.0 2156-09-03 21:11:00 EMERGENCY NaT \n",
"3 NaN NaT ELECTIVE NaT \n",
"4 NaN NaT ELECTIVE NaT \n",
"... ... ... ... ... \n",
"45316 412.0 2147-06-25 17:42:00 EMERGENCY NaT \n",
"45317 NaN NaT EMERGENCY NaT \n",
"45318 154.0 2195-11-28 00:15:00 EMERGENCY NaT \n",
"45319 NaN NaT ELECTIVE NaT \n",
"45320 NaN NaT EMERGENCY NaT \n",
"\n",
" TEXT READM_WITHIN_30 \n",
"0 Admission Date: [**2170-5-3**] D... 0 \n",
"1 Admission Date: [**2101-6-21**] ... 0 \n",
"2 Admission Date: [**2156-8-18**] ... 1 \n",
"3 Admission Date: [**2190-2-5**] Discharge ... 0 \n",
"4 Name: [**Known lastname 10030**],[**Known fir... 0 \n",
"... ... ... \n",
"45316 Admission Date: [**2146-5-1**] D... 0 \n",
"45317 Admission Date: [**2154-4-21**] Discharge... 0 \n",
"45318 Admission Date: [**2195-6-21**] ... 0 \n",
"45319 Admission Date: [**2149-6-2**] D... 0 \n",
"45320 Admission Date: [**2161-7-31**] Dischar... 0 \n",
"\n",
"[45321 rows x 10 columns]"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adm_notes"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"adm_notes.to_csv(DIR + 'readmission.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Natural Language"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string.punctuation"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"def clean_text(texts):\n",
"    \"\"\"Normalize a pandas Series of note texts.\n",
"\n",
"    Missing values become a single space, newlines and carriage returns are\n",
"    replaced by spaces, and each string is lower-cased with all ASCII\n",
"    punctuation and digits deleted (deleted, not replaced by a space).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    texts : pandas.Series\n",
"        Raw note texts; may contain NaN.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Cleaned texts, in the same order as `texts`.\n",
"    \"\"\"\n",
"    texts = texts.fillna(' ')\n",
"    # regex=False: the line-break characters are literals, not patterns. Being\n",
"    # explicit keeps the result independent of the pandas version's default.\n",
"    texts = texts.str.replace('\\n', ' ', regex=False)\n",
"    texts = texts.str.replace('\\r', ' ', regex=False)\n",
"\n",
"    # Translation table that deletes every punctuation character and digit.\n",
"    table = str.maketrans('', '', string.punctuation + '0123456789')\n",
"    return [text.lower().translate(table) for text in texts]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"stop_words = stopwords.words('english')\n",
"stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"porter = PorterStemmer()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_stem(text):\n",
"    \"\"\"Tokenize `text`, discard stop words, and Porter-stem what remains.\n",
"\n",
"    Uses the notebook-level `stop_words` list and `porter` stemmer.\n",
"    Returns the stemmed tokens as a list, in their original order.\n",
"    \"\"\"\n",
"    return [\n",
"        porter.stem(token)\n",
"        for token in word_tokenize(text)\n",
"        if token not in stop_words\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2963\n",
"45321\n"
]
}
],
"source": [
"print(sum(adm_notes['READM_WITHIN_30'] == 1) )\n",
"print(len(adm_notes['READM_WITHIN_30']))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}