--- a
+++ b/data-exploration-cleaning.ipynb
@@ -0,0 +1,1653 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading the data and required libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import datetime\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import string\n",
+    "import nltk\n",
+    "from nltk import word_tokenize\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DIR = \"E:/Coding/Summer 2023/data/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "notes_df = pd.read_csv(DIR + \"NOTEEVENTS.csv\", low_memory=False, memory_map=True)\n",
+    "admissions_df = pd.read_csv(DIR + \"ADMISSIONS.csv\", low_memory=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Exploration\n",
+    "## Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ROW_ID</th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>ADMISSION_LOCATION</th>\n",
+       "      <th>DISCHARGE_LOCATION</th>\n",
+       "      <th>INSURANCE</th>\n",
+       "      <th>LANGUAGE</th>\n",
+       "      <th>RELIGION</th>\n",
+       "      <th>MARITAL_STATUS</th>\n",
+       "      <th>ETHNICITY</th>\n",
+       "      <th>EDREGTIME</th>\n",
+       "      <th>EDOUTTIME</th>\n",
+       "      <th>DIAGNOSIS</th>\n",
+       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
+       "      <th>HAS_CHARTEVENTS_DATA</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>21</td>\n",
+       "      <td>22</td>\n",
+       "      <td>165315</td>\n",
+       "      <td>2196-04-09 12:26:00</td>\n",
+       "      <td>2196-04-10 15:54:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>EMERGENCY ROOM ADMIT</td>\n",
+       "      <td>DISC-TRAN CANCER/CHLDRN H</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>UNOBTAINABLE</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>2196-04-09 10:06:00</td>\n",
+       "      <td>2196-04-09 13:24:00</td>\n",
+       "      <td>BENZODIAZEPINE OVERDOSE</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>22</td>\n",
+       "      <td>23</td>\n",
+       "      <td>152223</td>\n",
+       "      <td>2153-09-03 07:15:00</td>\n",
+       "      <td>2153-09-08 19:10:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>PHYS REFERRAL/NORMAL DELI</td>\n",
+       "      <td>HOME HEALTH CARE</td>\n",
+       "      <td>Medicare</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>CATHOLIC</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>23</td>\n",
+       "      <td>23</td>\n",
+       "      <td>124321</td>\n",
+       "      <td>2157-10-18 19:34:00</td>\n",
+       "      <td>2157-10-25 14:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
+       "      <td>HOME HEALTH CARE</td>\n",
+       "      <td>Medicare</td>\n",
+       "      <td>ENGL</td>\n",
+       "      <td>CATHOLIC</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>BRAIN MASS</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>161859</td>\n",
+       "      <td>2139-06-06 16:14:00</td>\n",
+       "      <td>2139-06-09 12:48:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
+       "      <td>HOME</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>PROTESTANT QUAKER</td>\n",
+       "      <td>SINGLE</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>INTERIOR MYOCARDIAL INFARCTION</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>25</td>\n",
+       "      <td>25</td>\n",
+       "      <td>129635</td>\n",
+       "      <td>2160-11-02 02:06:00</td>\n",
+       "      <td>2160-11-05 14:55:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>EMERGENCY ROOM ADMIT</td>\n",
+       "      <td>HOME</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>UNOBTAINABLE</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>2160-11-02 01:01:00</td>\n",
+       "      <td>2160-11-02 04:27:00</td>\n",
+       "      <td>ACUTE CORONARY SYNDROME</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ROW_ID  SUBJECT_ID  HADM_ID            ADMITTIME            DISCHTIME  \\\n",
+       "0      21          22   165315  2196-04-09 12:26:00  2196-04-10 15:54:00   \n",
+       "1      22          23   152223  2153-09-03 07:15:00  2153-09-08 19:10:00   \n",
+       "2      23          23   124321  2157-10-18 19:34:00  2157-10-25 14:00:00   \n",
+       "3      24          24   161859  2139-06-06 16:14:00  2139-06-09 12:48:00   \n",
+       "4      25          25   129635  2160-11-02 02:06:00  2160-11-05 14:55:00   \n",
+       "\n",
+       "  DEATHTIME ADMISSION_TYPE         ADMISSION_LOCATION  \\\n",
+       "0       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
+       "1       NaN       ELECTIVE  PHYS REFERRAL/NORMAL DELI   \n",
+       "2       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
+       "3       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
+       "4       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
+       "\n",
+       "          DISCHARGE_LOCATION INSURANCE LANGUAGE           RELIGION  \\\n",
+       "0  DISC-TRAN CANCER/CHLDRN H   Private      NaN       UNOBTAINABLE   \n",
+       "1           HOME HEALTH CARE  Medicare      NaN           CATHOLIC   \n",
+       "2           HOME HEALTH CARE  Medicare     ENGL           CATHOLIC   \n",
+       "3                       HOME   Private      NaN  PROTESTANT QUAKER   \n",
+       "4                       HOME   Private      NaN       UNOBTAINABLE   \n",
+       "\n",
+       "  MARITAL_STATUS ETHNICITY            EDREGTIME            EDOUTTIME  \\\n",
+       "0        MARRIED     WHITE  2196-04-09 10:06:00  2196-04-09 13:24:00   \n",
+       "1        MARRIED     WHITE                  NaN                  NaN   \n",
+       "2        MARRIED     WHITE                  NaN                  NaN   \n",
+       "3         SINGLE     WHITE                  NaN                  NaN   \n",
+       "4        MARRIED     WHITE  2160-11-02 01:01:00  2160-11-02 04:27:00   \n",
+       "\n",
+       "                                           DIAGNOSIS  HOSPITAL_EXPIRE_FLAG  \\\n",
+       "0                            BENZODIAZEPINE OVERDOSE                     0   \n",
+       "1  CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...                     0   \n",
+       "2                                         BRAIN MASS                     0   \n",
+       "3                     INTERIOR MYOCARDIAL INFARCTION                     0   \n",
+       "4                            ACUTE CORONARY SYNDROME                     0   \n",
+       "\n",
+       "   HAS_CHARTEVENTS_DATA  \n",
+       "0                     1  \n",
+       "1                     1  \n",
+       "2                     1  \n",
+       "3                     1  \n",
+       "4                     1  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(58976, 19)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Types of admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['EMERGENCY', 'ELECTIVE', 'NEWBORN', 'URGENT'], dtype=object)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df['ADMISSION_TYPE'].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check for missing values in the admission times"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df['ADMITTIME'].isnull().sum()  # vectorized count of missing admission times\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Conversion of times to datetime type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse the three timestamp columns into datetime64[ns]; missing values become NaT\n",
+    "for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):\n",
+    "    admissions_df[time_col] = admissions_df[time_col].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Sort by subject and admission time, then reset the data frame index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order rows by subject, then chronologically by admission time, and renumber the index from 0\n",
+    "admissions_df = admissions_df.sort_values(by=['SUBJECT_ID', 'ADMITTIME']).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "by_subject = admissions_df.groupby('SUBJECT_ID')  # shift(-1) within a group pulls the value from the following row of the same subject\n",
+    "admissions_df['NEXT_ADMITTIME'], admissions_df['NEXT_ADMISSION_TYPE'] = by_subject['ADMITTIME'].shift(-1), by_subject['ADMISSION_TYPE'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "admissions_df.loc[admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE', 'NEXT_ADMITTIME'] = pd.NaT  # clear the time first: the mask still needs the ELECTIVE labels\n",
+    "admissions_df.loc[admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE', 'NEXT_ADMISSION_TYPE'] = np.nan  # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Back-fill missing values with the next valid value within each subject (rows were sorted by subject and admission time above)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "admissions_df[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admissions_df.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].bfill()  # back-fill within each subject; fillna(method=...) is deprecated in recent pandas"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Obtain days to readmission: from discharge to next readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "admissions_df['DAYS_TO_READMISSION'] = admissions_df['NEXT_ADMITTIME'].sub(admissions_df['DISCHTIME']).dt.days  # whole-day component of the gap; NaN where NEXT_ADMITTIME is NaT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Number of admissions that are followed by a readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11399"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df['DAYS_TO_READMISSION'].notnull().sum()  # vectorized count of rows with a known readmission gap"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Distribution of days to readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnYAAAFRCAYAAAD5FeDqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5XElEQVR4nO3de3zPdf/H8edm5znkuIlE9LWWaWNfbb9aSSxX4Wo5hRESRVQukUMiDZdjFyXkXMlpk9BVUl2RmE3I1SwppJgZ09AOZp/fH932vXxtfL+bw3c+Hvfbbbebvd/vz+vz/u5tefY5uhmGYQgAAAA3PHdXTwAAAABXB8EOAADAJAh2AAAAJkGwAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACT8HD1BG5EO3fulGEY8vT0dPVUAACAyZ07d05ubm4KCwtzOJZgVwqGYYgXdgAAgOuhJJmDYFcKhUfqQkJCXDwTAABgdnv27HF6LNfYAQAAmATBDgAAwCQIdgAAACZBsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmATBDgAAwCQIdmVYQYFr30fr6v0DAICScem7YhMTE9WjR49i+2rXrq0vvvhCv/32m8aNG6ekpCT5+fmpQ4cOGjhwoMqVK2cb+8EHH2jBggU6fvy4GjVqpFGjRik4ONjW70yNssjd3U3LP9+n45l/Xvd9V6/sp86tLNd9vwAAoPRcGuzCwsL0zTff2LXt2rVLAwcOVP/+/XXu3Dk9/fTTqlu3rpYtW6Zff/1VI0eOlLu7uwYNGiRJWr16tSZNmqRx48YpODhYc+fOVa9evfTvf/9bVapUcapGWXY8808dyTjr6mkAAIAbgEuDnZeXl6pXr277/s8//9SECRMUExOj9u3ba926dTpy5IhWrFihSpUqyWKx6MSJE5o0aZKeffZZeXl5afbs2YqNjVW7du0kSePHj1fLli21cuVK9evXT5999pnDGgAAAGZQpq6xmz17trKzszVs2DBJUnJysu6++25VqlTJNiYiIkJnzpzR3r17deLECR08eFCRkZG2fg8PD4WHhyspKcmpGgAAAGZRZoLdyZMntWjRIj377LO65ZZbJElpaWkKDAy0G1ejRg1J0tGjR5WWliZJqlmzZpExhX2OagAAAJhFmQl2S5cuVYUKFdS5c2dbW05OTpFTpd7e3pKk3NxcZWdnS1KxY3Jzc52qAQAAYBZlJth99NFHevzxx+Xj42Nr8/HxUV5ent24wjDm5+dnG1vcGF9fX6dqAAAAmEWZCHapqak6fPiw2rZta9ceGBio9PR0u7bC7wMCAmynYIsbExAQ4FQNAAAAsygTwS45OVlVq1ZVUFCQXbvValVKSorOnDlja9u2bZv8/f0VFBSkqlWrql69ekpMTLT15+fnKzk5WVar1akaAAAAZlEmgl1KSooaNmxYpL1ly5aqXr26XnzxRaWmpmrjxo2aNm2aevfubbturnfv3lq4cKFWr16t/fv3a8SIEcrJyVGHDh2crgEAAGAGLn2OXaHjx4/b7oS9kLe3t+bNm6exY8eqU6dOqlSpkrp27ar+/fvbxnTq1EmnT5/Wm2++qVOnTqlRo0ZauHChqlSp4nQNAAAAM3AzDIMXgpbQnj17JEkhISHXfF9vrdjlkjdP3FrNX893Cr3u+wUAAPZKkjvKxKlYAAAAXDmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJAh2AAAAJkGwAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJAh2AAAAJkGwAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJAh2AAAAJlEmgt1HH32kRx99VCEhIXrsscf073//29b322+/qV+/fmrSpInuv/9+vfnmmzp//rzd9h988IE
efvhhNW7cWF27dlVKSopdvzM1AAAAbnQuD3Zr1qzRyJEj1a1bN61fv15t2rTR4MGDtXPnTp07d05PP/20JGnZsmUaM2aMPvzwQ7399tu27VevXq1JkybphRdeUEJCgmrXrq1evXrp5MmTkuRUDQAAADPwcOXODcPQv/71L/Xo0UPdunWTJD333HNKTk7W9u3b9fvvv+vIkSNasWKFKlWqJIvFohMnTmjSpEl69tln5eXlpdmzZys2Nlbt2rWTJI0fP14tW7bUypUr1a9fP3322WcOawAAAJiBS4/YHThwQL///rvatm1r1z5//nz169dPycnJuvvuu1WpUiVbX0REhM6cOaO9e/fqxIkTOnjwoCIjI239Hh4eCg8PV1JSkiQ5rAEAAGAWLg92kvTnn3/q6aefVmRkpDp27Kgvv/xSkpSWlqbAwEC7bWrUqCFJOnr0qNLS0iRJNWvWLDKmsM9RDQAAALNwabA7c+aMJGnYsGFq06aNFixYoPvuu0/9+/fX1q1blZOTU+RUqbe3tyQpNzdX2dnZklTsmNzcXElyWAMAAMAsXHqNnaenpyTp6aefVkxMjCTprrvuUkpKihYuXCgfHx/l5eXZbVMYxvz8/OTj4yNJxY7x9fWVJIc1AAAAzMKlR+wCAgIkSRaLxa69QYMG+u233xQYGKj09HS7vsLvAwICbKdgixtTWNtRDQAAALNwabC7++675e/vr927d9u179u3T3Xq1JHValVKSortlK0kbdu2Tf7+/goKClLVqlVVr149JSYm2vrz8/OVnJwsq9UqSQ5rAAAAmIVLg52Pj4/69Omjt99+W+vWrdOvv/6qd955R1u2bFGvXr3UsmVLVa9eXS+++KJSU1O1ceNGTZs2Tb1797ZdN9e7d28tXLhQq1ev1v79+zVixAjl5OSoQ4cOkuRUDQAAADNw6TV2ktS/f3/5+vpq+vTpOnbsmOrXr6+ZM2fq3nvvlSTNmzdPY8eOVadOnVSpUiV17dpV/fv3t23fqVMnnT59Wm+++aZOnTqlRo0aaeHChapSpYqkv26UcFQDAADADNwMwzBcPYkbzZ49eyRJISEh13xfb63YpSMZZ6/5fi52azV/Pd8p9LrvFwAA2CtJ7nD5K8UAAABwdRDsAAAATIJgBwAAYBIEOwAAAJMg2AEAAJgEwQ4AAMAkCHYAAAAmQbADAAAwCYIdAACASRDsAAAATIJgBwAAYBIEOwAAAJMg2AEAAJgEwQ4AAMAkCHYAAAAmQbADAAAwCYIdAACASRDsAAAATIJgBwAAYBIEOwAAAJMg2AEAAJgEwQ4AAMAkCHYAAAAmQbADAAAwCZcHu2PHjqlhw4ZFvhISEiRJe/fuVWxsrEJDQ9WiRQstWbLEbvuCggLNmDFDUVFRCg0N1TPPPKPDhw/bjXFUAwAAwAw8XD2B1NRUeXt7a+PGjXJzc7O1V6hQQZmZmerVq5datGihsWPHateuXRo7dqz8/f3Vvn17SdKsWbO0dOlSTZw4UYGBgZo8ebL69OmjtWvXysvLy6kaAAAAZuDyYLdv3z7VrVtXNWrUKNK3ePFieXp66vXXX5eHh4fq16+vQ4cOae7cuWrfvr3y8vK0YMECDRkyRM2bN5ckTZ8+XVFRUdqwYYPatGmjFStWXLYGAACAWbj8VOyPP/6o+vXrF9uXnJysZs2aycPjf/kzIiJCBw8eVEZGhlJTU3X27FlFRkba+itWrKjg4GAlJSU5VQMAAMAsXB7s9u3bp5MnT6pbt276v//7P3Xp0kWbNm2SJKWlpSkwMNBufOGRvaNHjyotLU2SVLNmzSJjCvsc1QAAADALlwa7/Px8/fLLL/rjjz80cOBAzZ07V6Ghoerbt6+2bt2qnJwceXl52W3j7e0tScrNzVV2drYkFTsmNzdXkhzWAAAAMAuXXmPn4eGhxMRElStXTj4+PpKkRo0a6aefftL8+fPl4+OjvLw8u20Kw5ifn59tm7y8PNufC8f4+vpKksM
aAAAAZuHyU7H+/v52oUyS7rzzTh07dkyBgYFKT0+36yv8PiAgwHYKtrgxAQEBkuSwBgAAgFm4NNj99NNPatKkiRITE+3a//vf/6pBgwayWq3asWOHzp8/b+vbtm2b6tWrp6pVqyooKEjly5e32z4rK0spKSmyWq2S5LAGAACAWbg02NWvX1933HGHXn/9dSUnJ+vnn3/WhAkTtGvXLj333HNq3769zpw5o5EjR2r//v1KSEjQokWL1K9fP0l/XVsXGxurKVOm6IsvvlBqaqpeeuklBQYGKjo6WpIc1gAAADALl15j5+7urtmzZ2vq1Kl68cUXlZWVpeDgYC1cuFAWi0WSNG/ePMXFxSkmJkbVq1fX0KFDFRMTY6sxaNAg5efna9SoUcrJyZHVatX8+fPl6ekpSapatarDGgAAAGbgZhiG4epJ3Gj27NkjSQoJCbnm+3prxS4dyTh7zfdzsVur+ev5TqHXfb8AAMBeSXKHy2+eAAAAwNVBsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmATBDgAAwCQIdgAAACZBsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmATBDgAAwCQIdgAAACZxTYJdWlratSgLAACAyyhVsLvrrrv0/fffF9uXnJysv/3tb1c0KQAAAJSch7MDFyxYoD///FOSZBiGVq5cqU2bNhUZt3PnTnl5eV29GQIAAMApTge73NxcvfXWW5IkNzc3rVy5ssgYd3d3VahQQc8999zVmyEAAACc4nSwe+6552yBLSgoSCtWrFDjxo2v2cQAAABQMk4HuwulpqZe7XkAAADgCpUq2EnSli1b9NVXXyk7O1sFBQV2fW5ubho/fvwVTw4AAADOK9VdsQsWLNDTTz+tlStX6ttvv1ViYmKRr9I4cOCAwsLClJCQYGvbu3evYmNjFRoaqhYtWmjJkiV22xQUFGjGjBmKiopSaGionnnmGR0+fNhujKMaAAAAZlCqI3bvv/++2rZtq7i4uKt2B+y5c+c0ZMgQ2523kpSZmalevXqpRYsWGjt2rHbt2qWxY8fK399f7du3lyTNmjVLS5cu1cSJExUYGKjJkyerT58+Wrt2rby8vJyqAQAAYAalCnYZGRnq0KHDVX2sycyZM1W+fHm7thUrVsjT01Ovv/66PDw8VL9+fR06dEhz585V+/btlZeXpwULFmjIkCFq3ry5JGn69OmKiorShg0b1KZNG4c1AAAAzKJUp2KDg4P1008/XbVJJCUlafny5Zo4caJde3Jyspo1ayYPj//lz4iICB08eFAZGRlKTU3V2bNnFRkZaeuvWLGigoODlZSU5FQNAAAAsyjVEbsRI0boxRdflJ+fn+655x75+voWGXPrrbc6VSsrK0tDhw7VqFGjVLNmTbu+tLQ0WSwWu7YaNWpIko4ePWp7ddnF29WoUcPW56hGtWrVnJonAABAWVeqYNelSxcVFBRoxIgRcnNzK3bM3r17nao1ZswYhYWFqW3btkX6cnJyipzu9fb2lvTXA5Ozs7Mlqdgxf/zxh1M1AAAAzKJUwW7cuHGXDHQl8dFHHyk5OVlr164ttt/Hx0d5eXl2bYVhzM/PTz4+PpKkvLw8258LxxQeRXRUAwAAwCxKFeyeeOKJq7Lz+Ph4nThxwnbjQ6HXXntNn3zyiQIDA5Wenm7XV/h9QECA8vPzbW116tSxG9OwYUNJclgDAADALEoV7ApvTLgcq9XqcMyUKVOUk5Nj1xYdHa1BgwapXbt2WrNmjZYtW6bz58+rXLlykqRt27apXr16qlq1qipUqKDy5csrMTHRFuyysrKUkpKi2NhY2zwuVwMAAMAsShXsunfvLjc3NxmGYWu7+NSsM9fYXeqIWdWqVRUQEKD27dtr3rx5GjlypPr06aPvv/9eixYt0tixYyX9dW1dbGyspkyZoipVqqhWrVqaPHmyAgMDFR0dLUkOawAAAJhFqYJ
dcW9u+PPPP5WcnKw1a9Zo5syZVzwx6a+AN2/ePMXFxSkmJkbVq1fX0KFDFRMTYxszaNAg5efna9SoUcrJyZHVatX8+fPl6enpdA0AAAAzcDMuPOx2FcyaNUu7d+/WnDlzrmbZMmXPnj2SpJCQkGu+r7dW7NKRjLPXfD8Xu7Wav57vFHrd9wsAAOyVJHeU6gHFlxMeHq7t27df7bIAAABw4KoHuy+//FL+/v5XuywAAAAcKNU1dj169CjSVlBQoLS0NP3+++965plnrnhiAAAAKJlSBbviLstzd3eXxWJRv3791L59+yueGAAAAEqmVMHuvffeu9rzAAAAwBUqVbArtGnTJm3fvl1ZWVmqUqWKmjZtqqioqKs1NwAAAJRAqYJdXl6e+vfvr2+++UblypVT5cqVlZmZqTlz5igiIkJz5syRl5fX1Z4rAAAALqNUd8XOnDlTO3bs0KRJk/T999/rm2++0e7duzVhwgTt2rVL77zzztWeJwAAABwoVbBbt26dnn/+ebVr1872/lUPDw89/vjjev7557V27dqrOkkAAAA4Vqpgd/LkSQUHBxfbFxwcrGPHjl3RpAAAAFBypQp2derU0Y4dO4rtS0pKUs2aNa9oUgAAACi5Ut088eSTT2rixIny8fHRY489pmrVqikjI0Pr1q3Tu+++q+eff/5qzxMAAAAOlCrYdenSRSkpKZoyZYqmTp1qazcMQzExMerbt+9VmyAAAACcU+rHncTFxal3797avn27/vjjD7m5ually5aqX7/+1Z4jAAAAnFCia+x+/PFHtW/fXgsXLpQk1a9fX126dFHXrl31r3/9S4MHD9aBAweuyUQBAABweU4Hu99++009evRQRkaG6tWrZ9fn6empoUOH6tSpU+ratSt3xQIAALiA08Fu7ty5uuWWW7R69Wq1bt3ars/X11c9e/bUqlWr5O3trTlz5lz1iQIAAODynA52W7duVZ8+fVSlSpVLjqlevbp69+6tLVu2XJXJAQAAwHlOB7v09HTVrVvX4TiLxaK0tLQrmRMAAABKwelgV6VKFaWnpzscl5mZqUqVKl3RpAAAAFByTgc7q9WqhIQEh+M++uijS75uDAAAANeO08Gue/fuSkxM1MSJE5Wbm1ukPy8vT5MmTdKmTZvUrVu3qzpJAAAAOOb0A4pDQkI0fPhwjR8/XmvWrFFkZKRq166t8+fP68iRI0pMTFRmZqZeeOEFRUVFXcs5AwAAoBglevNEt27dFBQUpPnz5+uLL76wHbnz9/fX/fffr969e+uee+65JhMFAADA5ZX4lWJNmzZV06ZNJUknT56Uh4eHKlasWOoJnDhxQhMnTtTmzZuVm5srq9WqYcOG2V5NtnfvXsXFxem///2vqlSpop49e6pHjx627QsKCvTWW29p5cqVOn36tKxWq0aPHq3bbrvNNsZRDQAAADMo0SvFLlalSpUrCnWSNGDAAB06dEhz587VqlWr5OPjo549eyo7O1uZmZnq1auX6tSpo/j4eA0YMEBTpkxRfHy8bftZs2Zp6dKlGjdunJYtW6aCggL16dNHeXl5kuRUDQAAADMo8RG7q+mPP/5QrVq11K9fP1ksFklS//799fe//10//fSTtm7dKk9PT73++uvy8PBQ/fr1bSGwffv2ysvL04IFCzRkyBA1b95ckjR9+nRFRUVpw4YNatOmjVasWHHZGgAAAGZxRUfsrlSlSpU0depUW6g7efKkFi1apMDAQDVo0EDJyclq1qyZPDz+lz8jIiJ08OBBZWRkKDU1VWfPnlVkZKStv2LFigoODlZSUpIkOawBAABgFi49YnehV199VStWrJCXl5feeecd+fn5KS0tzRb6CtWoUUOSdPToUdsbLmrWrFlkTGGfoxrVqlW7Jp8HAADgenPpEbsLPfXUU4qPj1ebNm00YMAA/fDDD8rJyZGXl5fdOG9vb0lSbm6usrOzJanYMYV37DqqAQAAYBZl5ohdgwYNJElxcXHavXu
33n//ffn4+NhugihUGMb8/Pzk4+Mj6a+HIxf+uXCMr6+vJDmsAQAAYBYuPWJ38uRJrV+/Xvn5+bY2d3d3NWjQQOnp6QoMDCzyftrC7wMCAmynYIsbExAQIEkOawAAAJiFS4NdRkaGBg8erK1bt9razp07p5SUFNWvX19Wq1U7duzQ+fPnbf3btm1TvXr1VLVqVQUFBal8+fJKTEy09WdlZSklJUVWq1WSHNYAAAAwC5cGO4vFogceeEBvvPGGkpKStG/fPr3yyivKyspSz5491b59e505c0YjR47U/v37lZCQoEWLFqlfv36S/rq2LjY2VlOmTNEXX3yh1NRUvfTSSwoMDFR0dLQkOawBAABgFi6/xm7atGmaOnWqXnrpJZ0+fVrh4eH64IMPdOutt0qS5s2bp7i4OMXExKh69eoaOnSoYmJibNsPGjRI+fn5GjVqlHJycmS1WjV//nx5enpKkqpWreqwBgAAgBm4GYZhuHoSN5o9e/ZIkkJCQq75vt5asUtHMs5e8/1c7NZq/nq+U+h13y8AALBXktxRZh53AgAAgCtDsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmATBDgAAwCQIdgAAACZBsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmATBDgAAwCQIdgAAACZBsAMAADAJgh0AAIBJEOwAAABMgmAHAABgEgQ7AAAAkyDYAQAAmITLg92pU6c0evRoPfDAA2rSpIm6dOmi5ORkW//WrVv1xBNP6J577lHr1q21fv16u+1zc3M1duxYRUZGKiwsTP/4xz908uRJuzGOagAAAJiBy4Pd4MGDtXPnTk2bNk3x8fG666679PTTT+uXX37Rzz//rH79+ikqKkoJCQnq2LGjhg4dqq1bt9q2HzNmjL755hvNnDlTixcv1i+//KJBgwbZ+p2pAQAAYAYertz5oUOHtGXLFi1dulRNmzaVJL366qvavHmz1q5dqxMnTqhhw4Z66aWXJEn169dXSkqK5s2bp8jISB07dkwfffSRZs+erfDwcEnStGnT1Lp1a+3cuVNhYWFavHjxZWsAAACYhUuP2FWuXFlz585VSEiIrc3NzU1ubm7KyspScnJykfAVERGhHTt2yDAM7dixw9ZWqF69egoICFBSUpIkOawBAABgFi4NdhUrVtSDDz4oLy8vW9tnn32mQ4cOKSoqSmlpaQoMDLTbpkaNGsrOzlZmZqaOHTumypUry9vbu8iYtLQ0SXJYAwAAwCxcfo3dhb777jsNHz5c0dHRat68uXJycuxCnyTb93l5ecrOzi7SL0ne3t7Kzc2VJIc1AAAAzKLMBLuNGzeqd+/eCg0N1ZQpUyT9FdAuDl+F3/v6+srHx6fYcJabmytfX1+nagAAAJhFmQh277//vgYOHKiHHnpIs2fPtp1arVmzptLT0+3Gpqeny8/PTxUqVFBgYKBOnTpVJLilp6crICDAqRoAAABm4fJgt3TpUo0bN07dunXTtGnT7E6bhoeHa/v27Xbjt23bpiZNmsjd3V1NmzZVQUGB7SYKSTpw4ICOHTsmq9XqVA0AAACzcGmyOXDggMaPH69WrVqpX79+ysjI0PHjx3X8+HGdPn1a3bt31/fff68pU6bo559/1oIFC/Tpp5+qT58+kqSAgAA99thjGjVqlBITE/X9999r8ODBatasmUJDQyXJYQ0AAACzcOlz7D777DOdO3dOn3/+uT7//HO7vpiYGE2cOFGzZs3S5MmTtXjxYtWuXVuTJ0+2e3zJuHHjNH78eD3//POSpAceeECjRo2y9d95550OawAAAJiBm8HD3Epsz549kmT3/L1r5a0Vu3Qk4+w138/Fbq3mr+c7hV73/QIAAHslyR1cZAYAAGASBDsAAACTINgBAACYBMEOAADAJAh2AAAAJkGwAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBME
OAADAJAh2KFZ5P08VFBiunkaZmAMAADcKD1dPAGWTr5eH3N3dtPzzfTqe+adL5lC9sp86t7K4ZN8AANyICHa4rOOZf+pIxllXTwMAADiBU7EAAAAmQbADAAAwCYIdAACASRDsAAAATIJgBwAAYBIEOwAAAJMg2AEAAJgEwQ4AAMAkCHYAAAAmUaaC3Zw5c9S9e3e7tr179yo2NlahoaFq0aKFlixZYtdfUFCgGTNmKCoqSqGhoXrmmWd0+PDhEtUAAAAwgzIT7D744AO9+eabdm2ZmZnq1auX6tSpo/j4eA0YMEBTpkxRfHy8bcysWbO0dOlSjRs3TsuWLVNBQYH69OmjvLw8p2sAAACYgcvfFXvs2DG99tprSkxMVN26de36VqxYIU9PT73++uvy8PBQ/fr1dejQIc2dO1ft27dXXl6eFixYoCFDhqh58+aSpOnTpysqKkobNmxQmzZtHNYAAAAwC5cfsfvhhx/k6empjz/+WPfcc49dX3Jyspo1ayYPj//lz4iICB08eFAZGRlKTU3V2bNnFRkZaeuvWLGigoODlZSU5FQNAAAAs3D5EbsWLVqoRYsWxfalpaXJYrHYtdWoUUOSdPToUaWlpUmSatasWWRMYZ+jGtWqVbvyDwEAAFAGuPyI3eXk5OTIy8vLrs3b21uSlJubq+zsbEkqdkxubq5TNQAAAMyiTAc7Hx8f200QhQrDmJ+fn3x8fCSp2DG+vr5O1QAAADCLMh3sAgMDlZ6ebtdW+H1AQIDtFGxxYwICApyqAQAAYBZlOthZrVbt2LFD58+ft7Vt27ZN9erVU9WqVRUUFKTy5csrMTHR1p+VlaWUlBRZrVanagAAAJhFmQ527du315kzZzRy5Ejt379fCQkJWrRokfr16yfpr2vrYmNjNWXKFH3xxRdKTU3VSy+9pMDAQEVHRztVAwAAwCxcflfs5VStWlXz5s1TXFycYmJiVL16dQ0dOlQxMTG2MYMGDVJ+fr5GjRqlnJwcWa1WzZ8/X56enk7XAAAAMAM3wzAMV0/iRrNnzx5JUkhIyDXf11srdulIxtlrvp+L3dOgmjpHN3TZ/iXJUucW9Xg0WO7ubi7Zf6GCAsPlcwAA3LxKkjvK9BE73Nx8vTzk7u6m5Z/v0/HMP10yh+qV/dS5lcXxQAAAygCCHcq845l/uuyoIQAAN5IyffMEAAAAnEewAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJAh2wGWU9/NUQYHr37pXFuYAACj7ePMEcBm81gwAcCMh2AFO4LVmAIAbAadiAQAATIJgBwAAYBIEO6CM4wYOAICzuMYOKOO4gQMA4CyCHXCD4AYOAIAjnIoFAAAwCYIdAIe4zg8AbgycigXgENf5AcCNgWAHwGlc5wcAZRunYgHcEDgdDACOccQOwA2hLJwOvr1mRT12Xz2X7PtCBQWG3N3dXD0NAGXQTRPsCgoK9NZbb2nlypU6ffq0rFarRo8erdtuu83VUwNQAq48HVz9Fl/CpQiWQFl20wS7WbNmaenSpZo4caICAwM1efJk9enTR2vXrpWXl5erpwfgBnIzh0tuYgHKtpsi2OXl5WnBggUaMmSImjdvLkmaPn26oqKitGHDBrVp08a1EwSAEnJVuCy81tHVR+zKwhyAsuimCHapqak6e/asIiMjbW0VK1ZUcHCwkpKSCHYA4CSudfyfshAuy8IcULa4GYZh+lu8NmzYoIEDB2r37t3y8fGxtb/wwgvKycnRnDlzSlTvu+++k2EY1+UU7tnsczrvgrvwPD3c5evt4bL9MwfmwBzK5hxcvf8L55CTm6/zLvonzKOcu7w9y7l0DuXc3eXjVc4l+8b1lZeXJzc3NzVp0sTh2JviiF12drYkFQli3t7e+uOPP0pcz83t+v3fkb+v53XbV1ncP3NgDsyhbM7B1fuXJB9v1/8TVhbmAPNzc3NzOnvcFH8jC4/S5eXl2R2xy83
Nla+vb4nrhYWFXbW5AQAAXC03xQOKa9asKUlKT0+3a09PT1dAQIArpgQAAHDV3RTBLigoSOXLl1diYqKtLSsrSykpKbJarS6cGQAAwNVzU5yK9fLyUmxsrKZMmaIqVaqoVq1amjx5sgIDAxUdHe3q6QEAAFwVN0Wwk6RBgwYpPz9fo0aNUk5OjqxWq+bPny9PT9dfAAwAAHA13BSPOwEAALgZ3BTX2AEAANwMCHYAAAAmQbADAAAwCYIdAACASRDsAAAATIJgBwAAYBIEOwAAAJMg2JVRBQUFmjFjhqKiohQaGqpnnnlGhw8fdvW04MCcOXPUvXt3u7a9e/cqNjZWoaGhatGihZYsWWLX78xaO6qBa+PUqVMaPXq0HnjgATVp0kRdunRRcnKyrX/r1q164okndM8996h169Zav3693fa5ubkaO3asIiMjFRYWpn/84x86efKk3RhHNXDtnDhxQi+//LIiIiIUFhamvn376ueff7b187trDgcOHFBYWJgSEhJsbaZeWwNl0syZM417773X+Oqrr4y9e/cavXv3NqKjo43c3FxXTw2X8P777xtBQUFGbGysre3kyZPGvffeawwfPtzYv3+/sWrVKiMkJMRYtWqVbYyjtXamBq6NXr16GW3atDGSkpKMX375xRg7dqzRuHFj4+effzb2799vhISEGNOmTTP2799vzJs3zwgODja+/fZb2/avvPKK0bJlSyMpKcnYvXu38fjjjxvdunWz9TtTA9dO586djY4dOxq7d+829u/fbwwcONC4//77jT///JPfXZPIy8sznnjiCcNisRjx8fGGYZj/v8sEuzIoNzfXCAsLMz744ANb2x9//GE0btzYWLt2rQtnhuKkpaUZ/fr1M0JDQ43WrVvbBbvZs2cb999/v3Hu3Dlb29SpU43o6GjDMJxba0c1cG0cPHjQsFgsRnJysq2toKDAaNmypfHmm28ar776qtGhQwe7bQYPHmz07t3bMIy//l4EBQUZ//nPf2z9v/zyi2GxWIzvvvvOMAzDYQ1cO6dOnTIGDx5s/Pjjj7a2vXv3GhaLxdi9eze/uyYxdepUo0ePHnbBzuxry6nYMig1NVVnz55VZGSkra1ixYoKDg5WUlKSC2eG4vzwww/y9PTUxx9/rHvuuceuLzk5Wc2aNZOHx/9eyxwREaGDBw8qIyPDqbV2VAPXRuXKlTV37lyFhITY2tzc3OTm5qasrCwlJyfbrZv017rs2LFDhmFox44dtrZC9erVU0BAgN3aXq4Grp1KlSpp6tSpslgskqSTJ09q0aJFCgwMVIMGDfjdNYGkpCQtX75cEydOtGs3+9oS7MqgtLQ0SVLNmjXt2mvUqGHrQ9nRokULzZw5U7fddluRvrS0NAUGBtq11ahRQ5J09OhRp9baUQ1cGxUrVtSDDz4oLy8vW9tnn32mQ4cOKSoq6pLrkp2drczMTB07dkyVK1eWt7d3kTGO1rawBq6PV199VZGRkVq/fr3i4uLk5+fH7+4NLisrS0OHDtWoUaOKrJHZ15ZgVwZlZ2dLkt0/KJLk7e2t3NxcV0wJpZSTk1PsOkp/XVjvzFo7qoHr47vvvtPw4cMVHR2t5s2bF7suhd/n5eUpOzu7SL/keG0vrIHr46mnnlJ8fLzatGmjAQMG6IcffuB39wY3ZswYhYWFqW3btkX6zL62Ho6H4Hrz8fGR9Nd/2Av/LP31l8XX19dV00Ip+Pj4FPkHuvCX3s/Pz6m1dlQD197GjRs1ZMgQNWnSRFOmTJH013/EL16Xwu99fX2LXTfJfm0d1cD10aBBA0lSXFycdu/erffff5/f3RvYRx99pOTkZK1du7bYfrOvLUfsyqDCw7/p6el27enp6QoICHDFlFBKgYGBxa6jJAUEBDi11o5q4Np6//33NXDgQD300EOaPXu27f/Ka9asWey6+Pn5qUKFCgoMDNSpU6eK/Mf/wrV1VAPXzsmTJ7V+/Xrl5+fb2tzd3dWgQQOlp6f
zu3sDi4+P14kTJ9S8eXOFhYUpLCxMkvTaa6+pT58+pl9bgl0ZFBQUpPLlyysxMdHWlpWVpZSUFFmtVhfODCVltVq1Y8cOnT9/3ta2bds21atXT1WrVnVqrR3VwLWzdOlSjRs3Tt26ddO0adPsTr2Eh4dr+/btduO3bdumJk2ayN3dXU2bNlVBQYHtJgrpr+dpHTt2zLa2jmrg2snIyNDgwYO1detWW9u5c+eUkpKi+vXr87t7A5syZYo++eQTffTRR7YvSRo0aJDi4uLMv7auvi0XxZs2bZrRrFkzY+PGjXbP0MnLy3P11HAZw4YNs3vcSUZGhmG1Wo1hw4YZP/30kxEfH2+EhIQYCQkJtjGO1tqZGrj6fvnlF+Puu+82BgwYYKSnp9t9ZWVlGfv27TPuvvtuY/Lkycb+/fuN+fPnF3kG3eDBg40WLVoY27Ztsz3H7sK/H87UwLXTp08fIzo62ti+fbvx448/GoMHDzasVqvx+++/87trMhc+7sTsa0uwK6Py8/ONSZMmGREREUZoaKjxzDPPGIcPH3b1tODAxcHOMAxj9+7dRqdOnYxGjRoZDz30kPHee+/Z9Tuz1o5q4Op75513DIvFUuzXsGHDDMMwjK+//tpo06aN0ahRI6N169bG+vXr7WqcPXvWGDlypBEeHm6Eh4cbgwcPNk6ePGk3xlENXDtZWVnGa6+9Ztx3331G48aNjd69exv79u2z9fO7ax4XBjvDMPfauhkGD0sCAAAwAy7iAAAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJAh2wE2ge/fuatiwoe0rKChIYWFheuKJJ7RkyRK792Ve6B//+IcaNmyoBQsW2NoMw1CPHj3UqFEj7du3r9jtli1bpoYNG+rDDz+0te3YsUPPPvus7r33XjVq1EjNmzfXiBEjdPjw4RJ9loSEBLvPcqmvC/34448aOnSoHnjgAdu+Bw8erN27d5do34VmzpxZ7D5DQ0P1yCOPaPr06XY/00uNv/Cr8AXiF1qxYoUaNmyoZ5991umfRUhIiFq0aKFXX31VaWlplxx/4MCBYmtu2rSpyM8wMTFRDRs2tHvFkiR9+eWXeuqppxQeHq6QkBC1atVKcXFxOnHiRJG6zq5/4c/qYmfPntWsWbPUrl07hYaGqlmzZnryySe1fPnyIn9/Cz/nG2+8UexnvNQ+ADPwcPUEAFwfwcHBeu211yRJ58+f1x9//KFNmzZpwoQJSk5O1ptvvmn3ftLTp09r48aNslgsWr58uXr16iU3Nze5ubkpLi5O7dq106hRo7Rs2TK77dLS0jR58mRFRUWpS5cukqStW7eqT58+tn/4K1SooF9//VULFixQhw4dtHLlStWpU8epz9G8eXMtX77c9v1//vMfvfPOO3rrrbdUvXr1IuPXrFmjkSNHKjg4WC+99JJq1aqltLQ0rVq1Sl26dNHLL7+sXr16lepneuE8JCkzM1Pr1q3T7NmzlZ+fr5dffvmy4y904XtoC8XHx8tisWjTpk06evSo7eXkF7vws2dnZ+unn37S3LlztXHjRi1fvrzIz9bd3V2ffvqpnnvuuSK1Pvnkk0vO8UKrV6/W8OHD9eSTT6pnz57y9fXV/v37NXfuXH311VeKj49XpUqVJF35+h89elS9evVSZmamunfvrqZNmyo3N1fffvut4uLitG7dOs2aNUsVKlSw2+6DDz5Q69atFR4e7tRnAkzBxW++AHAdxMbGFnnVWaHFixcbFovFWLNmjV370qVLjcaNGxtbt241LBZLkfeXLlmyxLBYLMbChQvt2vv27Ws0a9bMSEtLs7X16NHDePLJJ4vs+9ixY0bjxo2NMWPGlPKTGUZ8fLxhsViKfeXeDz/8YNx9993GiBEjjPPnzxfpf+ONN4yGDRsaW7ZsKdE+Z8yYYVgslkv2d+rUyYiIiHB6fHH2799vWCwWY9OmTUbTpk2N6dOnFxlzuc9++PBhw2q
1Gj179iwyvmvXrkbbtm2LbJObm2s0bdrU+Pvf/243323bthkWi8XYtm2bre3hhx82Bg8eXKTG3r17DYvFYrz77ru2tpKs/8U/q4KCAqNDhw7Ggw8+aBw5cqRIje+++85o1KiRMWTIkCKfMzw83GjVqpWRnZ1tt01p1gO4UXAqFrjJxcbGKiAgQMuWLbNrj4+PV2RkpCIiInT77bcX6Y+NjVV4eLj+9a9/6ffff5ckrVu3Tv/5z380evRoBQQE2MZmZGTIKObthTVq1NCoUaN03333XYNPJs2ePVt+fn4aNWqU3VHFQi+//LJq1qypt99++6rut3z58nJzc7uiGoVHvCIiIvTII49o1apVlzxlXpzatWurc+fO+vbbb/Xrr7/a9T366KP68ccfi5yO3bRpk9zc3PTAAw84rH+pNQ0KCtLw4cPVqFEjh2OdWf+vv/5a33//vW2tLhYWFqannnpKH3/8cZHTusOGDdOvv/6qadOmOfw8gFkQ7ICbnLu7uyIjI/X999/bgsNPP/2kPXv26PHHH5ckPf744/riiy+UkZFh287NzU3jx49XQUGBJk6cqNOnT2vixIn629/+pscee8xuH82bN9fOnTvVvXt3rVq1yu4f4I4dO6ply5ZX/XMVFBRoy5YtioyMlK+vb7FjvLy81LJlS+3YsUOZmZkl3kd+fr7tKy8vT8eOHdO7776rLVu26O9///tlx1/4VVBQUGTcxx9/rDZt2sjT01MxMTE6fvy4vvzyyxLNrzAw7dixo0h7pUqV9Omnn9q1f/LJJ2rVqpU8PT0d1m7evLnWr1+vAQMGaN26dTp27Jitr2fPnoqIiLAbW9r137x5s9zd3fXggw9eckzh37cvvvjCrj0iIkKdO3fWe++9V+RnAJgV19gBULVq1XTu3DmdOnVK1apVU3x8vG655Ra1aNFCkhQTE6OZM2dq1apVdhfy33777XrppZc0YcIEWzAaM2ZMkfovvPCCTp8+rVWrVmn79u2SpMDAQD344IPq2bOn7rjjjqv+mU6dOqUzZ86oVq1alx13++23yzAMHT16VJUrVy7RPu6+++4ibbfeeqsGDhyovn37OjVekrp166bRo0fbvt+0aZOOHz+uJ554QpIUHh6uunXratmyZYqOjnZ6foXX3R0/ftyu3cPDQy1bttS///1v23V22dnZ+uqrr/T22287FYLGjRungoICbdiwQRs3bpQk1alTRw8//LB69epld8T2Stb/t99+0y233KLy5ctfckzh9Xm//fZbkb6hQ4dq8+bNGjFihNasWSMfHx+Hnw24kXHEDoDtNJmbm5vOnTunjz/+WC1btlROTo6ysrLk7++vpk2basWKFUWOLvXo0UNhYWFKSkpSXFycbrnlliL1vby89Prrr+vrr79WXFyc2rZtq4KCAi1fvlzt2rXThg0brtlnc3T0qVy5cpJU7KlCR1atWqVVq1bpvffe08MPP6zy5ctr1KhRGjBgQLH7LRx/8VefPn3sxsXHx6tevXqqU6eOsrKylJWVpdatWxd7WvVyLlzXi118Ovarr76Sn5+f7r33XqdqV6hQQTNmzNDGjRs1evRoPfLII8rKytLChQvVunVr7dy50zb2StbfMAx5eFz+GMTl+v39/RUXF6eDBw9q+vTpTn024EbGETsAOnbsmHx8fHTLLbfoyy+/1IkTJ2yh42KbN2+2Oy3m7u6u++67Tzt37rzs6TLpryNIHTp0UIcOHSRJ27Zt08svv6wxY8aoZcuWxV4HV1qVK1eWn59fsUdxLlR4WvBSd5xeTkhIiO3P4eHh6tmzp1544QUtWrSo2DsxLxx/KSdOnNDXX3+tc+fOyWq1Fulfvnx5kbttL6XwcSeBgYFF+iIiIlS5cmXb3bGffPKJWrdubQu6zqpdu7a6deumbt26qaCgQBs3btQrr7yicePGKSEhwW5sada/Vq1a2rJli7Kzsy95Sr1wDW+99dZi+yMjI9W5c2ctWbJEjzzySIk
+H3Cj4YgdcJPLz89XYmKimjRponLlyik+Pl633XablixZYve1ePFiVahQochNFI7s3r1b//d//6ctW7YU6YuIiNDTTz+tEydOlOoat8txc3PTQw89pM2bN+vs2bPFjjl//rw2btyoJk2aqEqVKle0P3d3d02YMEEeHh565ZVXin0unTM+/vhj5efn6+233y6yBlarVQkJCcrLy3Oq1rfffis3N7diQ6aHh4eio6P16aef6syZM9q0aVORayMv5bPPPlNERESRmy/c3d0VHR2t9u3b6+eff5Z05evfokUL5efn2073FqfwWsHCSweKM3ToUAUGBmr48OHKyclx+BmBGxXBDrjJLV++XMePH1eXLl10/Phxbd68WY899pjuvfdeu6+IiAi1bt1aX3/9td2F8o7UrVtX2dnZWrJkSZHTuJJ04MABVa9e/YqDVXH69eun7OxsjR49WufPny/SP23aNB06dOiSDwAuqVq1aql///46fPiw3n333VLVSEhIUGhoqFq2bFlkDTp16qSTJ0/q888/d1gnLS1NK1euVPPmzS95NPLRRx9VamqqFi5cqGrVqiksLMypOd555506deqUFi9eXGz/wYMHZbFYJF35+t93331q2rSp/vnPfxb7MOs9e/Zo3rx5evTRR1W3bt1Lzrl8+fJ64403dPDgwcs+TxC40XEqFrhJnDlzRrt27ZL01x2jmZmZ+uabb2zXOUVHR+vdd99Vfn7+JY/cPP7441q5cqVWrFihgQMHOrXfSpUqadiwYXrttdfUtWtXderUSbfddptOnz6tzz//XKtXr9aUKVOu+PEgxWnYsKEmTpyo4cOHq0uXLuratatq166t9PR0JSQkaMuWLRoyZIjDU8gl0bNnT61atUrvvvuuYmJi7G7eKPz5F6devXo6dOiQ9u3bp1dffbXYMa1atZK/v7+WLVtmt0Z79+613bGcnZ2tH3/8UYsWLZKPj4/dTRkXa9asmapXr645c+aoZ8+eTq/BHXfcob59+2rOnDk6cuSI2rVrp8DAQJ04cUJr1qzR1q1btXDhQklXvv7u7u6aOnWq+vbtqw4dOqhHjx5q0qSJCgoK9O233+qDDz5QcHCwxo4d63De9913nzp27KiVK1c69TmBGxHBDrhJpKSkqHPnzpL+Ok3p7+8vi8WiMWPGqGPHjpL+Olp055132o62XKxp06aqXbu2Vq5cqf79+zt9PdaTTz6p22+/XUuWLNG0adN06tQp+fv7q3Hjxlq8eLHTF+yXxmOPPaaGDRtq0aJFmjFjho4fP64qVaooPDxcH374oUJDQ6/q/ry8vDRixAj169dP//znPzVjxgxbX+HPvzhvv/22Nm/erHLlyql169bFjvH19dUjjzyihIQE26lOSXr++edtf/b09FStWrXUqlUr9e3bt9i3cRRyd3fXI488ovfff9/p07CFBg8erLvuuksrV67UG2+8oTNnzqhixYoKDw/XqlWrFBQUZBt7petfs2ZNLV++XB9++KHWrVun+fPnq1y5cqpfv75eeeUVdezY0em/i6+88oq++eYbHT16tESfF7hRuBmluRUMAAAAZQ5H7ACUCYZhFHsd3MXKlSt3TU7bunr/AHA1EOwAlAnbt29Xjx49HI6bMGGC7cG9Zto/AFwNnIoFUCacOXOmyOMzilO7du0SvyHiRtg/AFwNBDsAAACT4Dl2AAAAJkGwAwAAMAmCHQAAgEkQ7AAAAEyCYAcAAGASBDsAAACTINgBAACYBMEOAADAJP4f2r4mrJ80jT8AAAAASUVORK5CYII=",
+      "text/plain": [
+       "<Figure size 700x350 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# One theme call: whitegrid style with grid lines switched off and a fixed figure size.\n",
+    "# (Previously 'axes.grid' was set to True and then immediately overridden to False.)\n",
+    "sns.set(style='whitegrid', rc={'figure.figsize': (7, 3.5), 'axes.grid': False})\n",
+    "# histplot returns a matplotlib Axes; `fig` is kept as the name of its parent Figure.\n",
+    "ax = sns.histplot(admissions_df['DAYS_TO_READMISSION'], kde=False, bins=15)\n",
+    "fig = ax.get_figure()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "120.0\n",
+      "507.0\n",
+      "23.0\n",
+      "408.8103342398456\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "4107.0"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Print the median, upper quartile, lower quartile and mean of DAYS_TO_READMISSION;\n",
+    "# the maximum is the cell's displayed value.\n",
+    "days_to_readmission = admissions_df['DAYS_TO_READMISSION']\n",
+    "for quantile_level in (0.5, 0.75, 0.25):\n",
+    "    print(days_to_readmission.quantile(quantile_level))\n",
+    "print(days_to_readmission.mean())\n",
+    "days_to_readmission.max()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ROW_ID</th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>CHARTDATE</th>\n",
+       "      <th>CHARTTIME</th>\n",
+       "      <th>STORETIME</th>\n",
+       "      <th>CATEGORY</th>\n",
+       "      <th>DESCRIPTION</th>\n",
+       "      <th>CGID</th>\n",
+       "      <th>ISERROR</th>\n",
+       "      <th>TEXT</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>174</td>\n",
+       "      <td>22532</td>\n",
+       "      <td>167853.0</td>\n",
+       "      <td>2151-08-04</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2151-7-16**]       Dischar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>175</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>107527.0</td>\n",
+       "      <td>2118-06-14</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2118-6-2**]       Discharg...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>176</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>167118.0</td>\n",
+       "      <td>2119-05-25</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2119-5-4**]              D...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>177</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>196489.0</td>\n",
+       "      <td>2124-08-18</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2124-7-21**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>178</td>\n",
+       "      <td>26880</td>\n",
+       "      <td>135453.0</td>\n",
+       "      <td>2162-03-25</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2162-3-3**]              D...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME  \\\n",
+       "0     174       22532  167853.0  2151-08-04       NaN       NaN   \n",
+       "1     175       13702  107527.0  2118-06-14       NaN       NaN   \n",
+       "2     176       13702  167118.0  2119-05-25       NaN       NaN   \n",
+       "3     177       13702  196489.0  2124-08-18       NaN       NaN   \n",
+       "4     178       26880  135453.0  2162-03-25       NaN       NaN   \n",
+       "\n",
+       "            CATEGORY DESCRIPTION  CGID  ISERROR  \\\n",
+       "0  Discharge summary      Report   NaN      NaN   \n",
+       "1  Discharge summary      Report   NaN      NaN   \n",
+       "2  Discharge summary      Report   NaN      NaN   \n",
+       "3  Discharge summary      Report   NaN      NaN   \n",
+       "4  Discharge summary      Report   NaN      NaN   \n",
+       "\n",
+       "                                                TEXT  \n",
+       "0  Admission Date:  [**2151-7-16**]       Dischar...  \n",
+       "1  Admission Date:  [**2118-6-2**]       Discharg...  \n",
+       "2  Admission Date:  [**2119-5-4**]              D...  \n",
+       "3  Admission Date:  [**2124-7-21**]              ...  \n",
+       "4  Admission Date:  [**2162-3-3**]              D...  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2083180, 11)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Discharge summary', 'Echo', 'ECG', 'Nursing', 'Physician ',\n",
+       "       'Rehab Services', 'Case Management ', 'Respiratory ', 'Nutrition',\n",
+       "       'General', 'Social Work', 'Pharmacy', 'Consult', 'Radiology',\n",
+       "       'Nursing/other'], dtype=object)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df['CATEGORY'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(59652, 11)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep only the rows whose CATEGORY is exactly \"Discharge summary\".\n",
+    "is_discharge_summary = notes_df['CATEGORY'].eq(\"Discharge summary\")\n",
+    "discharge_notes = notes_df.loc[is_discharge_summary]\n",
+    "discharge_notes.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There are 6926 discharge-note rows that repeat an admission already seen (HADM_ID is the admission ID), i.e. some admissions have more than one discharge note."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6926"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes.duplicated(['HADM_ID']).sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Take the last row per admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep the last discharge note (in file order) of every (SUBJECT_ID, HADM_ID) pair.\n",
+    "# drop_duplicates(keep='last') is used instead of groupby(...).nth(-1) because nth()\n",
+    "# became a filter in pandas 2.0, after which reset_index() adds a stray `index`\n",
+    "# column instead of restoring the group keys.\n",
+    "discharge_notes_ordered = (\n",
+    "    discharge_notes\n",
+    "    .dropna(subset=['SUBJECT_ID', 'HADM_ID'])  # groupby also skipped rows with NaN keys\n",
+    "    .drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'], keep='last')\n",
+    "    .sort_values(['SUBJECT_ID', 'HADM_ID'])  # same row order as the grouped result\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes_ordered.duplicated(['HADM_ID']).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(52726, 11)"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes_ordered.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Merge Notes and Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left join: every admission is kept and TEXT stays NaN when it has no discharge note.\n",
+    "# validate='many_to_one' makes pandas raise a MergeError if the notes side ever holds a\n",
+    "# repeated (SUBJECT_ID, HADM_ID) key, which would otherwise silently duplicate admissions.\n",
+    "admission_columns = ['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_TO_READMISSION','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']\n",
+    "admissions_notes = pd.merge(\n",
+    "    admissions_df[admission_columns],\n",
+    "    discharge_notes_ordered[['SUBJECT_ID', 'HADM_ID', 'TEXT']],\n",
+    "    on=['SUBJECT_ID', 'HADM_ID'], how='left', validate='many_to_one'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DAYS_TO_READMISSION</th>\n",
+       "      <th>NEXT_ADMITTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>TEXT</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2</td>\n",
+       "      <td>163353</td>\n",
+       "      <td>2138-07-17 19:04:00</td>\n",
+       "      <td>2138-07-21 15:48:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NEWBORN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>3</td>\n",
+       "      <td>145834</td>\n",
+       "      <td>2101-10-20 19:08:00</td>\n",
+       "      <td>2101-10-31 13:58:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2101-10-20**]     Discharg...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4</td>\n",
+       "      <td>185777</td>\n",
+       "      <td>2191-03-16 00:28:00</td>\n",
+       "      <td>2191-03-23 18:41:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2191-3-16**]     Discharge...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>5</td>\n",
+       "      <td>178980</td>\n",
+       "      <td>2103-02-02 04:31:00</td>\n",
+       "      <td>2103-02-04 12:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NEWBORN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>6</td>\n",
+       "      <td>107064</td>\n",
+       "      <td>2175-05-30 07:15:00</td>\n",
+       "      <td>2175-06-15 16:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date: [**2175-5-30**]        Dischar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58971</th>\n",
+       "      <td>99985</td>\n",
+       "      <td>176670</td>\n",
+       "      <td>2181-01-27 02:47:00</td>\n",
+       "      <td>2181-02-12 17:05:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2181-1-27**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58972</th>\n",
+       "      <td>99991</td>\n",
+       "      <td>151118</td>\n",
+       "      <td>2184-12-24 08:30:00</td>\n",
+       "      <td>2185-01-05 12:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2184-12-24**]             ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58973</th>\n",
+       "      <td>99992</td>\n",
+       "      <td>197084</td>\n",
+       "      <td>2144-07-25 18:03:00</td>\n",
+       "      <td>2144-07-28 17:56:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2144-7-25**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58974</th>\n",
+       "      <td>99995</td>\n",
+       "      <td>137810</td>\n",
+       "      <td>2147-02-08 08:00:00</td>\n",
+       "      <td>2147-02-11 13:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2147-2-8**]              D...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58975</th>\n",
+       "      <td>99999</td>\n",
+       "      <td>113369</td>\n",
+       "      <td>2117-12-30 07:15:00</td>\n",
+       "      <td>2118-01-04 16:30:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2117-12-30**]             ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>58976 rows × 9 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME  \\\n",
+       "0               2   163353 2138-07-17 19:04:00 2138-07-21 15:48:00   \n",
+       "1               3   145834 2101-10-20 19:08:00 2101-10-31 13:58:00   \n",
+       "2               4   185777 2191-03-16 00:28:00 2191-03-23 18:41:00   \n",
+       "3               5   178980 2103-02-02 04:31:00 2103-02-04 12:15:00   \n",
+       "4               6   107064 2175-05-30 07:15:00 2175-06-15 16:00:00   \n",
+       "...           ...      ...                 ...                 ...   \n",
+       "58971       99985   176670 2181-01-27 02:47:00 2181-02-12 17:05:00   \n",
+       "58972       99991   151118 2184-12-24 08:30:00 2185-01-05 12:15:00   \n",
+       "58973       99992   197084 2144-07-25 18:03:00 2144-07-28 17:56:00   \n",
+       "58974       99995   137810 2147-02-08 08:00:00 2147-02-11 13:15:00   \n",
+       "58975       99999   113369 2117-12-30 07:15:00 2118-01-04 16:30:00   \n",
+       "\n",
+       "       DAYS_TO_READMISSION NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME  \\\n",
+       "0                      NaN            NaT        NEWBORN       NaT   \n",
+       "1                      NaN            NaT      EMERGENCY       NaT   \n",
+       "2                      NaN            NaT      EMERGENCY       NaT   \n",
+       "3                      NaN            NaT        NEWBORN       NaT   \n",
+       "4                      NaN            NaT       ELECTIVE       NaT   \n",
+       "...                    ...            ...            ...       ...   \n",
+       "58971                  NaN            NaT      EMERGENCY       NaT   \n",
+       "58972                  NaN            NaT       ELECTIVE       NaT   \n",
+       "58973                  NaN            NaT      EMERGENCY       NaT   \n",
+       "58974                  NaN            NaT       ELECTIVE       NaT   \n",
+       "58975                  NaN            NaT       ELECTIVE       NaT   \n",
+       "\n",
+       "                                                    TEXT  \n",
+       "0                                                    NaN  \n",
+       "1      Admission Date:  [**2101-10-20**]     Discharg...  \n",
+       "2      Admission Date:  [**2191-3-16**]     Discharge...  \n",
+       "3                                                    NaN  \n",
+       "4      Admission Date: [**2175-5-30**]        Dischar...  \n",
+       "...                                                  ...  \n",
+       "58971  Admission Date:  [**2181-1-27**]              ...  \n",
+       "58972  Admission Date:  [**2184-12-24**]             ...  \n",
+       "58973  Admission Date:  [**2144-7-25**]              ...  \n",
+       "58974  Admission Date:  [**2147-2-8**]              D...  \n",
+       "58975  Admission Date:  [**2117-12-30**]             ...  \n",
+       "\n",
+       "[58976 rows x 9 columns]"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_notes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "About 10.6% of admissions have no discharge note."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.1059753119913185"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Fraction of admissions with no discharge note: the mean of the boolean null mask.\n",
+    "# This is vectorised, unlike the builtin sum() which loops over the Series in Python.\n",
+    "float(admissions_notes['TEXT'].isnull().mean())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "About 54% of NEWBORN admissions have no discharge note, far more than any other admission type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ADMISSION_TYPE\n",
+       "ELECTIVE     0.048663\n",
+       "EMERGENCY    0.037983\n",
+       "NEWBORN      0.536691\n",
+       "URGENT       0.042665\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Share of admissions without a discharge note per ADMISSION_TYPE: the mean of the\n",
+    "# boolean null mask within each group (one groupby pass, no Python-level apply).\n",
+    "# rename(None) clears the Series name so the result is unnamed.\n",
+    "admissions_notes['TEXT'].isnull().groupby(admissions_notes['ADMISSION_TYPE']).mean().rename(None)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Remove NEWBORN admissions and create the target variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop NEWBORN admissions, then flag readmissions that happened in under 30 days.\n",
+    "# A missing DAYS_TO_READMISSION compares as False and therefore becomes 0.\n",
+    "is_not_newborn = admissions_notes['ADMISSION_TYPE'].ne('NEWBORN')\n",
+    "adm_notes = admissions_notes.loc[is_not_newborn].copy()\n",
+    "adm_notes['READM_WITHIN_30'] = adm_notes['DAYS_TO_READMISSION'].lt(30).astype('int')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3004"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of positive (readmitted within 30 days) rows, using the vectorised Series sum.\n",
+    "int(adm_notes['READM_WITHIN_30'].sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(51113, 10)"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adm_notes.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exclude patients who died during the admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep admissions with no recorded DEATHTIME, shuffle the rows and renumber the index.\n",
+    "# A fixed random_state makes the shuffle identical on every Restart & Run All.\n",
+    "rows_not_death = adm_notes['DEATHTIME'].isnull()\n",
+    "df_adm_notes_not_death = (\n",
+    "    adm_notes.loc[rows_not_death]\n",
+    "    .sample(frac=1, random_state=42)\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2963"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of positive rows left after excluding in-hospital deaths (vectorised sum).\n",
+    "int(df_adm_notes_not_death['READM_WITHIN_30'].sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "45321"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df_adm_notes_not_death['READM_WITHIN_30'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Creation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exclude patients who died during the admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop admissions with a recorded DEATHTIME, shuffle the remaining rows with a fixed\n",
+    "# seed so reruns give the same order, and renumber the index from 0.\n",
+    "adm_notes = (\n",
+    "    adm_notes.loc[adm_notes['DEATHTIME'].isnull()]\n",
+    "    .sample(frac=1, random_state=42)\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DAYS_TO_READMISSION</th>\n",
+       "      <th>NEXT_ADMITTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>TEXT</th>\n",
+       "      <th>READM_WITHIN_30</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6090</td>\n",
+       "      <td>175043</td>\n",
+       "      <td>2170-05-03 07:15:00</td>\n",
+       "      <td>2170-05-06 13:40:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2170-5-3**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>27901</td>\n",
+       "      <td>189210</td>\n",
+       "      <td>2101-06-21 07:15:00</td>\n",
+       "      <td>2101-07-13 15:00:00</td>\n",
+       "      <td>50.0</td>\n",
+       "      <td>2101-09-01 20:44:00</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2101-6-21**]              ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>69531</td>\n",
+       "      <td>102759</td>\n",
+       "      <td>2156-08-18 23:41:00</td>\n",
+       "      <td>2156-08-26 16:45:00</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>2156-09-03 21:11:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2156-8-18**]              ...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>8620</td>\n",
+       "      <td>148993</td>\n",
+       "      <td>2190-02-05 17:13:00</td>\n",
+       "      <td>2190-02-09 17:53:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2190-2-5**]     Discharge ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>24226</td>\n",
+       "      <td>118785</td>\n",
+       "      <td>2113-04-04 07:30:00</td>\n",
+       "      <td>2113-04-14 11:20:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Name:  [**Known lastname 10030**],[**Known fir...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45316</th>\n",
+       "      <td>17882</td>\n",
+       "      <td>157780</td>\n",
+       "      <td>2146-05-01 16:33:00</td>\n",
+       "      <td>2146-05-09 16:20:00</td>\n",
+       "      <td>412.0</td>\n",
+       "      <td>2147-06-25 17:42:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2146-5-1**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45317</th>\n",
+       "      <td>2184</td>\n",
+       "      <td>171742</td>\n",
+       "      <td>2154-04-21 19:25:00</td>\n",
+       "      <td>2154-04-25 11:49:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2154-4-21**]     Discharge...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45318</th>\n",
+       "      <td>28240</td>\n",
+       "      <td>151747</td>\n",
+       "      <td>2195-06-21 07:27:00</td>\n",
+       "      <td>2195-06-26 14:33:00</td>\n",
+       "      <td>154.0</td>\n",
+       "      <td>2195-11-28 00:15:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2195-6-21**]              ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45319</th>\n",
+       "      <td>25201</td>\n",
+       "      <td>124241</td>\n",
+       "      <td>2149-06-02 08:00:00</td>\n",
+       "      <td>2149-06-11 13:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2149-6-2**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45320</th>\n",
+       "      <td>20855</td>\n",
+       "      <td>108604</td>\n",
+       "      <td>2161-07-31 17:00:00</td>\n",
+       "      <td>2161-08-20 16:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2161-7-31**]       Dischar...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>45321 rows × 10 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME  \\\n",
+       "0            6090   175043 2170-05-03 07:15:00 2170-05-06 13:40:00   \n",
+       "1           27901   189210 2101-06-21 07:15:00 2101-07-13 15:00:00   \n",
+       "2           69531   102759 2156-08-18 23:41:00 2156-08-26 16:45:00   \n",
+       "3            8620   148993 2190-02-05 17:13:00 2190-02-09 17:53:00   \n",
+       "4           24226   118785 2113-04-04 07:30:00 2113-04-14 11:20:00   \n",
+       "...           ...      ...                 ...                 ...   \n",
+       "45316       17882   157780 2146-05-01 16:33:00 2146-05-09 16:20:00   \n",
+       "45317        2184   171742 2154-04-21 19:25:00 2154-04-25 11:49:00   \n",
+       "45318       28240   151747 2195-06-21 07:27:00 2195-06-26 14:33:00   \n",
+       "45319       25201   124241 2149-06-02 08:00:00 2149-06-11 13:00:00   \n",
+       "45320       20855   108604 2161-07-31 17:00:00 2161-08-20 16:00:00   \n",
+       "\n",
+       "       DAYS_TO_READMISSION      NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME  \\\n",
+       "0                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "1                     50.0 2101-09-01 20:44:00       ELECTIVE       NaT   \n",
+       "2                      8.0 2156-09-03 21:11:00      EMERGENCY       NaT   \n",
+       "3                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "4                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "...                    ...                 ...            ...       ...   \n",
+       "45316                412.0 2147-06-25 17:42:00      EMERGENCY       NaT   \n",
+       "45317                  NaN                 NaT      EMERGENCY       NaT   \n",
+       "45318                154.0 2195-11-28 00:15:00      EMERGENCY       NaT   \n",
+       "45319                  NaN                 NaT       ELECTIVE       NaT   \n",
+       "45320                  NaN                 NaT      EMERGENCY       NaT   \n",
+       "\n",
+       "                                                    TEXT  READM_WITHIN_30  \n",
+       "0      Admission Date:  [**2170-5-3**]              D...                0  \n",
+       "1      Admission Date:  [**2101-6-21**]              ...                0  \n",
+       "2      Admission Date:  [**2156-8-18**]              ...                1  \n",
+       "3      Admission Date:  [**2190-2-5**]     Discharge ...                0  \n",
+       "4      Name:  [**Known lastname 10030**],[**Known fir...                0  \n",
+       "...                                                  ...              ...  \n",
+       "45316  Admission Date:  [**2146-5-1**]              D...                0  \n",
+       "45317  Admission Date:  [**2154-4-21**]     Discharge...                0  \n",
+       "45318  Admission Date:  [**2195-6-21**]              ...                0  \n",
+       "45319  Admission Date:  [**2149-6-2**]              D...                0  \n",
+       "45320  Admission Date:  [**2161-7-31**]       Dischar...                0  \n",
+       "\n",
+       "[45321 rows x 10 columns]"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adm_notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist adm_notes without its index; this runs before the text-cleaning cells, so TEXT is still raw.\n",
+    "adm_notes.to_csv(path_or_buf=DIR + 'readmission.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Cleaning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Natural Language"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "string.punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(texts):\n",
+    "    texts = texts.fillna(' ')\n",
+    "    texts = texts.str.replace('\\n',' ')\n",
+    "    texts = texts.str.replace('\\r',' ')\n",
+    "\n",
+    "    table = str.maketrans('', '', string.punctuation + '0123456789')\n",
+    "    texts = [text.lower().translate(table) for text in texts]\n",
+    "\n",
+    "    return texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NLTK's English stop words followed by extra terms excluded from the notes\n",
+    "# (presumably header/de-identification boilerplate such as 'Admission Date').\n",
+    "stop_words = [\n",
+    "    *stopwords.words('english'),\n",
+    "    'patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "porter = PorterStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_stem(text):\n",
+    "    \"\"\"Tokenise a note, drop stop words and Porter-stem the remaining tokens.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    text : str\n",
+    "        Note text to tokenise with nltk's word_tokenize.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of str\n",
+    "        Stemmed tokens, in their original order.\n",
+    "    \"\"\"\n",
+    "    # Build a set once per call: `word in list` rescans the whole stop-word\n",
+    "    # list for every token, which is needlessly slow on long notes.\n",
+    "    stop_set = set(stop_words)\n",
+    "    return [\n",
+    "        porter.stem(word)\n",
+    "        for word in word_tokenize(text)\n",
+    "        # Compare on the lower-cased token so capitalised stop words ('The')\n",
+    "        # are dropped as well; already lower-cased input behaves as before.\n",
+    "        if word.lower() not in stop_set\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2963\n",
+      "45321\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of rows flagged READM_WITHIN_30 == 1, then the total number of rows.\n",
+    "print((adm_notes['READM_WITHIN_30'] == 1).sum())\n",
+    "print(adm_notes['READM_WITHIN_30'].size)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}