--- a
+++ b/data-exploration-cleaning.ipynb
@@ -0,0 +1,1653 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading the data and required libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import datetime\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import string\n",
+    "import nltk\n",
+    "from nltk import word_tokenize\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DIR = \"E:/Coding/Summer 2023/data/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "notes_df = pd.read_csv(DIR + \"NOTEEVENTS.csv\", low_memory=False, memory_map=True)\n",
+    "admissions_df = pd.read_csv(DIR + \"ADMISSIONS.csv\", low_memory=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Exploration\n",
+    "## Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ROW_ID</th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>ADMISSION_LOCATION</th>\n",
+       "      <th>DISCHARGE_LOCATION</th>\n",
+       "      <th>INSURANCE</th>\n",
+       "      <th>LANGUAGE</th>\n",
+       "      <th>RELIGION</th>\n",
+       "      <th>MARITAL_STATUS</th>\n",
+       "      <th>ETHNICITY</th>\n",
+       "      <th>EDREGTIME</th>\n",
+       "      <th>EDOUTTIME</th>\n",
+       "      <th>DIAGNOSIS</th>\n",
+       "      <th>HOSPITAL_EXPIRE_FLAG</th>\n",
+       "      <th>HAS_CHARTEVENTS_DATA</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>21</td>\n",
+       "      <td>22</td>\n",
+       "      <td>165315</td>\n",
+       "      <td>2196-04-09 12:26:00</td>\n",
+       "      <td>2196-04-10 15:54:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>EMERGENCY ROOM ADMIT</td>\n",
+       "      <td>DISC-TRAN CANCER/CHLDRN H</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>UNOBTAINABLE</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>2196-04-09 10:06:00</td>\n",
+       "      <td>2196-04-09 13:24:00</td>\n",
+       "      <td>BENZODIAZEPINE OVERDOSE</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>22</td>\n",
+       "      <td>23</td>\n",
+       "      <td>152223</td>\n",
+       "      <td>2153-09-03 07:15:00</td>\n",
+       "      <td>2153-09-08 19:10:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>PHYS REFERRAL/NORMAL DELI</td>\n",
+       "      <td>HOME HEALTH CARE</td>\n",
+       "      <td>Medicare</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>CATHOLIC</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>23</td>\n",
+       "      <td>23</td>\n",
+       "      <td>124321</td>\n",
+       "      <td>2157-10-18 19:34:00</td>\n",
+       "      <td>2157-10-25 14:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
+       "      <td>HOME HEALTH CARE</td>\n",
+       "      <td>Medicare</td>\n",
+       "      <td>ENGL</td>\n",
+       "      <td>CATHOLIC</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>BRAIN MASS</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>24</td>\n",
+       "      <td>24</td>\n",
+       "      <td>161859</td>\n",
+       "      <td>2139-06-06 16:14:00</td>\n",
+       "      <td>2139-06-09 12:48:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>TRANSFER FROM HOSP/EXTRAM</td>\n",
+       "      <td>HOME</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>PROTESTANT QUAKER</td>\n",
+       "      <td>SINGLE</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>INTERIOR MYOCARDIAL INFARCTION</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>25</td>\n",
+       "      <td>25</td>\n",
+       "      <td>129635</td>\n",
+       "      <td>2160-11-02 02:06:00</td>\n",
+       "      <td>2160-11-05 14:55:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>EMERGENCY ROOM ADMIT</td>\n",
+       "      <td>HOME</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>UNOBTAINABLE</td>\n",
+       "      <td>MARRIED</td>\n",
+       "      <td>WHITE</td>\n",
+       "      <td>2160-11-02 01:01:00</td>\n",
+       "      <td>2160-11-02 04:27:00</td>\n",
+       "      <td>ACUTE CORONARY SYNDROME</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ROW_ID  SUBJECT_ID  HADM_ID            ADMITTIME            DISCHTIME  \\\n",
+       "0      21          22   165315  2196-04-09 12:26:00  2196-04-10 15:54:00   \n",
+       "1      22          23   152223  2153-09-03 07:15:00  2153-09-08 19:10:00   \n",
+       "2      23          23   124321  2157-10-18 19:34:00  2157-10-25 14:00:00   \n",
+       "3      24          24   161859  2139-06-06 16:14:00  2139-06-09 12:48:00   \n",
+       "4      25          25   129635  2160-11-02 02:06:00  2160-11-05 14:55:00   \n",
+       "\n",
+       "  DEATHTIME ADMISSION_TYPE         ADMISSION_LOCATION  \\\n",
+       "0       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
+       "1       NaN       ELECTIVE  PHYS REFERRAL/NORMAL DELI   \n",
+       "2       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
+       "3       NaN      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   \n",
+       "4       NaN      EMERGENCY       EMERGENCY ROOM ADMIT   \n",
+       "\n",
+       "          DISCHARGE_LOCATION INSURANCE LANGUAGE           RELIGION  \\\n",
+       "0  DISC-TRAN CANCER/CHLDRN H   Private      NaN       UNOBTAINABLE   \n",
+       "1           HOME HEALTH CARE  Medicare      NaN           CATHOLIC   \n",
+       "2           HOME HEALTH CARE  Medicare     ENGL           CATHOLIC   \n",
+       "3                       HOME   Private      NaN  PROTESTANT QUAKER   \n",
+       "4                       HOME   Private      NaN       UNOBTAINABLE   \n",
+       "\n",
+       "  MARITAL_STATUS ETHNICITY            EDREGTIME            EDOUTTIME  \\\n",
+       "0        MARRIED     WHITE  2196-04-09 10:06:00  2196-04-09 13:24:00   \n",
+       "1        MARRIED     WHITE                  NaN                  NaN   \n",
+       "2        MARRIED     WHITE                  NaN                  NaN   \n",
+       "3         SINGLE     WHITE                  NaN                  NaN   \n",
+       "4        MARRIED     WHITE  2160-11-02 01:01:00  2160-11-02 04:27:00   \n",
+       "\n",
+       "                                           DIAGNOSIS  HOSPITAL_EXPIRE_FLAG  \\\n",
+       "0                            BENZODIAZEPINE OVERDOSE                     0   \n",
+       "1  CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...                     0   \n",
+       "2                                         BRAIN MASS                     0   \n",
+       "3                     INTERIOR MYOCARDIAL INFARCTION                     0   \n",
+       "4                            ACUTE CORONARY SYNDROME                     0   \n",
+       "\n",
+       "   HAS_CHARTEVENTS_DATA  \n",
+       "0                     1  \n",
+       "1                     1  \n",
+       "2                     1  \n",
+       "3                     1  \n",
+       "4                     1  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(58976, 19)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Types of admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['EMERGENCY', 'ELECTIVE', 'NEWBORN', 'URGENT'], dtype=object)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_df['ADMISSION_TYPE'].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check for missing values on the admission times"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of admissions whose ADMITTIME is missing (vectorised count of the null mask).\n",
+    "admissions_df['ADMITTIME'].isnull().sum()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Conversion of times to datetime type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse the three timestamp columns into datetime64[ns] so they can be sorted and subtracted.\n",
+    "for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):\n",
+    "    admissions_df[time_col] = admissions_df[time_col].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Sort by subject and admission time, then reset the data frame index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order each subject's admissions chronologically and renumber the rows from 0.\n",
+    "admissions_df = (\n",
+    "    admissions_df\n",
+    "    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'])\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For every admission, pull up the time and type of the same subject's following row\n",
+    "# (shift(-1) within the SUBJECT_ID group; the last row of each subject gets a missing value).\n",
+    "per_subject = admissions_df.groupby('SUBJECT_ID')\n",
+    "admissions_df['NEXT_ADMITTIME'] = per_subject['ADMITTIME'].shift(-1)\n",
+    "admissions_df['NEXT_ADMISSION_TYPE'] = per_subject['ADMISSION_TYPE'].shift(-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Blank out the next-admission fields wherever the following admission is ELECTIVE\n",
+    "# (presumably because planned admissions should not count as readmissions -- confirm).\n",
+    "# The mask is computed once, before either column is modified.\n",
+    "next_is_elective = admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE'\n",
+    "admissions_df.loc[next_is_elective, 'NEXT_ADMITTIME'] = pd.NaT\n",
+    "# np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.\n",
+    "admissions_df.loc[next_is_elective, 'NEXT_ADMISSION_TYPE'] = np.nan"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Back-fill the missing values with the next valid value within each subject. This relies on the rows already being sorted by subject and admission time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Within each subject, replace missing next-admission values with the next non-missing\n",
+    "# value further down (rows are already sorted by SUBJECT_ID and ADMITTIME).\n",
+    "# GroupBy.bfill() is used because groupby(...).fillna(method='bfill') is deprecated in pandas >= 2.1.\n",
+    "next_admission_cols = ['NEXT_ADMITTIME', 'NEXT_ADMISSION_TYPE']\n",
+    "admissions_df[next_admission_cols] = admissions_df.groupby('SUBJECT_ID')[next_admission_cols].bfill()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Obtain days to readmission: from discharge to next readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "admissions_df['DAYS_TO_READMISSION'] = (admissions_df['NEXT_ADMITTIME'] - admissions_df['DISCHTIME']).dt.days"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Number of admissions that are followed by a later non-elective readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11399"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Count the rows that have a readmission gap (vectorised count of the non-null mask).\n",
+    "admissions_df['DAYS_TO_READMISSION'].notnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Distribution of days to readmission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 700x350 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Theme: 7 x 3.5 inch figures on a white background with grid lines switched off.\n",
+    "sns.set(rc={'figure.figsize': (7, 3.5)})\n",
+    "sns.set_style(\"whitegrid\", {'axes.grid': False})\n",
+    "\n",
+    "# histplot returns a matplotlib Axes, so keep it as `ax` and derive the Figure from it.\n",
+    "ax = sns.histplot(admissions_df['DAYS_TO_READMISSION'], kde=False, bins=15)\n",
+    "# Label the plot so it can be read on its own.\n",
+    "ax.set(\n",
+    "    title='Days from discharge to next non-elective admission',\n",
+    "    xlabel='Days to readmission',\n",
+    "    ylabel='Number of admissions',\n",
+    ")\n",
+    "fig = ax.get_figure()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "120.0\n",
+      "507.0\n",
+      "23.0\n",
+      "408.8103342398456\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "4107.0"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Print the median, upper quartile, lower quartile and mean of the readmission gap\n",
+    "# (in that order); the cell's displayed value is the maximum.\n",
+    "days_to_readmission = admissions_df['DAYS_TO_READMISSION']\n",
+    "for quantile_level in (0.5, 0.75, 0.25):\n",
+    "    print(days_to_readmission.quantile(quantile_level))\n",
+    "print(days_to_readmission.mean())\n",
+    "days_to_readmission.max()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ROW_ID</th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>CHARTDATE</th>\n",
+       "      <th>CHARTTIME</th>\n",
+       "      <th>STORETIME</th>\n",
+       "      <th>CATEGORY</th>\n",
+       "      <th>DESCRIPTION</th>\n",
+       "      <th>CGID</th>\n",
+       "      <th>ISERROR</th>\n",
+       "      <th>TEXT</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>174</td>\n",
+       "      <td>22532</td>\n",
+       "      <td>167853.0</td>\n",
+       "      <td>2151-08-04</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2151-7-16**]       Dischar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>175</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>107527.0</td>\n",
+       "      <td>2118-06-14</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2118-6-2**]       Discharg...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>176</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>167118.0</td>\n",
+       "      <td>2119-05-25</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2119-5-4**]              D...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>177</td>\n",
+       "      <td>13702</td>\n",
+       "      <td>196489.0</td>\n",
+       "      <td>2124-08-18</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2124-7-21**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>178</td>\n",
+       "      <td>26880</td>\n",
+       "      <td>135453.0</td>\n",
+       "      <td>2162-03-25</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Discharge summary</td>\n",
+       "      <td>Report</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Admission Date:  [**2162-3-3**]              D...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME  \\\n",
+       "0     174       22532  167853.0  2151-08-04       NaN       NaN   \n",
+       "1     175       13702  107527.0  2118-06-14       NaN       NaN   \n",
+       "2     176       13702  167118.0  2119-05-25       NaN       NaN   \n",
+       "3     177       13702  196489.0  2124-08-18       NaN       NaN   \n",
+       "4     178       26880  135453.0  2162-03-25       NaN       NaN   \n",
+       "\n",
+       "            CATEGORY DESCRIPTION  CGID  ISERROR  \\\n",
+       "0  Discharge summary      Report   NaN      NaN   \n",
+       "1  Discharge summary      Report   NaN      NaN   \n",
+       "2  Discharge summary      Report   NaN      NaN   \n",
+       "3  Discharge summary      Report   NaN      NaN   \n",
+       "4  Discharge summary      Report   NaN      NaN   \n",
+       "\n",
+       "                                                TEXT  \n",
+       "0  Admission Date:  [**2151-7-16**]       Dischar...  \n",
+       "1  Admission Date:  [**2118-6-2**]       Discharg...  \n",
+       "2  Admission Date:  [**2119-5-4**]              D...  \n",
+       "3  Admission Date:  [**2124-7-21**]              ...  \n",
+       "4  Admission Date:  [**2162-3-3**]              D...  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2083180, 11)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Discharge summary', 'Echo', 'ECG', 'Nursing', 'Physician ',\n",
+       "       'Rehab Services', 'Case Management ', 'Respiratory ', 'Nutrition',\n",
+       "       'General', 'Social Work', 'Pharmacy', 'Consult', 'Radiology',\n",
+       "       'Nursing/other'], dtype=object)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "notes_df['CATEGORY'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(59652, 11)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes = notes_df[notes_df['CATEGORY'] == \"Discharge summary\"]\n",
+    "discharge_notes.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There are 6926 surplus discharge notes, i.e. notes whose admission (HADM_ID, the admission ID) already has another discharge note."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6926"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes.duplicated(['HADM_ID']).sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Take the last row per admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep one discharge summary per admission: the last one in file order.\n",
+    "# drop_duplicates(keep='last') yields the same rows and shape on every pandas version, whereas\n",
+    "# groupby(...).nth(-1).reset_index() changed in pandas 2.0 (nth became a filter that keeps the\n",
+    "# original index, so reset_index() would add a stray 'index' column).\n",
+    "discharge_notes_ordered = (\n",
+    "    discharge_notes\n",
+    "    .dropna(subset=['HADM_ID'])  # groupby also ignored notes without an admission id\n",
+    "    .drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'], keep='last')\n",
+    "    .sort_values(['SUBJECT_ID', 'HADM_ID'])  # same row order as the grouped result\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes_ordered.duplicated(['HADM_ID']).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(52726, 11)"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "discharge_notes_ordered.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Merge Notes and Admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "admissions_notes = pd.merge(\n",
+    "    admissions_df[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_TO_READMISSION','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']],\n",
+    "    discharge_notes_ordered[['SUBJECT_ID', 'HADM_ID', 'TEXT']], \n",
+    "    on = ['SUBJECT_ID', 'HADM_ID'], how='left'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DAYS_TO_READMISSION</th>\n",
+       "      <th>NEXT_ADMITTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>TEXT</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2</td>\n",
+       "      <td>163353</td>\n",
+       "      <td>2138-07-17 19:04:00</td>\n",
+       "      <td>2138-07-21 15:48:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NEWBORN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>3</td>\n",
+       "      <td>145834</td>\n",
+       "      <td>2101-10-20 19:08:00</td>\n",
+       "      <td>2101-10-31 13:58:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2101-10-20**]     Discharg...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4</td>\n",
+       "      <td>185777</td>\n",
+       "      <td>2191-03-16 00:28:00</td>\n",
+       "      <td>2191-03-23 18:41:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2191-3-16**]     Discharge...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>5</td>\n",
+       "      <td>178980</td>\n",
+       "      <td>2103-02-02 04:31:00</td>\n",
+       "      <td>2103-02-04 12:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NEWBORN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>6</td>\n",
+       "      <td>107064</td>\n",
+       "      <td>2175-05-30 07:15:00</td>\n",
+       "      <td>2175-06-15 16:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date: [**2175-5-30**]        Dischar...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58971</th>\n",
+       "      <td>99985</td>\n",
+       "      <td>176670</td>\n",
+       "      <td>2181-01-27 02:47:00</td>\n",
+       "      <td>2181-02-12 17:05:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2181-1-27**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58972</th>\n",
+       "      <td>99991</td>\n",
+       "      <td>151118</td>\n",
+       "      <td>2184-12-24 08:30:00</td>\n",
+       "      <td>2185-01-05 12:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2184-12-24**]             ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58973</th>\n",
+       "      <td>99992</td>\n",
+       "      <td>197084</td>\n",
+       "      <td>2144-07-25 18:03:00</td>\n",
+       "      <td>2144-07-28 17:56:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2144-7-25**]              ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58974</th>\n",
+       "      <td>99995</td>\n",
+       "      <td>137810</td>\n",
+       "      <td>2147-02-08 08:00:00</td>\n",
+       "      <td>2147-02-11 13:15:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2147-2-8**]              D...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58975</th>\n",
+       "      <td>99999</td>\n",
+       "      <td>113369</td>\n",
+       "      <td>2117-12-30 07:15:00</td>\n",
+       "      <td>2118-01-04 16:30:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2117-12-30**]             ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>58976 rows × 9 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME  \\\n",
+       "0               2   163353 2138-07-17 19:04:00 2138-07-21 15:48:00   \n",
+       "1               3   145834 2101-10-20 19:08:00 2101-10-31 13:58:00   \n",
+       "2               4   185777 2191-03-16 00:28:00 2191-03-23 18:41:00   \n",
+       "3               5   178980 2103-02-02 04:31:00 2103-02-04 12:15:00   \n",
+       "4               6   107064 2175-05-30 07:15:00 2175-06-15 16:00:00   \n",
+       "...           ...      ...                 ...                 ...   \n",
+       "58971       99985   176670 2181-01-27 02:47:00 2181-02-12 17:05:00   \n",
+       "58972       99991   151118 2184-12-24 08:30:00 2185-01-05 12:15:00   \n",
+       "58973       99992   197084 2144-07-25 18:03:00 2144-07-28 17:56:00   \n",
+       "58974       99995   137810 2147-02-08 08:00:00 2147-02-11 13:15:00   \n",
+       "58975       99999   113369 2117-12-30 07:15:00 2118-01-04 16:30:00   \n",
+       "\n",
+       "       DAYS_TO_READMISSION NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME  \\\n",
+       "0                      NaN            NaT        NEWBORN       NaT   \n",
+       "1                      NaN            NaT      EMERGENCY       NaT   \n",
+       "2                      NaN            NaT      EMERGENCY       NaT   \n",
+       "3                      NaN            NaT        NEWBORN       NaT   \n",
+       "4                      NaN            NaT       ELECTIVE       NaT   \n",
+       "...                    ...            ...            ...       ...   \n",
+       "58971                  NaN            NaT      EMERGENCY       NaT   \n",
+       "58972                  NaN            NaT       ELECTIVE       NaT   \n",
+       "58973                  NaN            NaT      EMERGENCY       NaT   \n",
+       "58974                  NaN            NaT       ELECTIVE       NaT   \n",
+       "58975                  NaN            NaT       ELECTIVE       NaT   \n",
+       "\n",
+       "                                                    TEXT  \n",
+       "0                                                    NaN  \n",
+       "1      Admission Date:  [**2101-10-20**]     Discharg...  \n",
+       "2      Admission Date:  [**2191-3-16**]     Discharge...  \n",
+       "3                                                    NaN  \n",
+       "4      Admission Date: [**2175-5-30**]        Dischar...  \n",
+       "...                                                  ...  \n",
+       "58971  Admission Date:  [**2181-1-27**]              ...  \n",
+       "58972  Admission Date:  [**2184-12-24**]             ...  \n",
+       "58973  Admission Date:  [**2144-7-25**]              ...  \n",
+       "58974  Admission Date:  [**2147-2-8**]              D...  \n",
+       "58975  Admission Date:  [**2117-12-30**]             ...  \n",
+       "\n",
+       "[58976 rows x 9 columns]"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "admissions_notes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "About 10.6% of admissions have no discharge note."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.1059753119913185"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum(admissions_notes['TEXT'].isnull()) / len(admissions_notes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "About 54% of NEWBORN admissions are missing a discharge note (versus roughly 4-5% for every other admission type)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ADMISSION_TYPE\n",
+       "ELECTIVE     0.048663\n",
+       "EMERGENCY    0.037983\n",
+       "NEWBORN      0.536691\n",
+       "URGENT       0.042665\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Share of admissions with no discharge note, per admission type.\n",
+    "# The mean of a boolean mask is the fraction of True values, so a single grouped\n",
+    "# mean replaces the two groupby passes and the per-group `apply` with a lambda.\n",
+    "(\n",
+    "    admissions_notes['TEXT']\n",
+    "    .isnull()\n",
+    "    .groupby(admissions_notes['ADMISSION_TYPE'])\n",
+    "    .mean()\n",
+    "    .rename('MISSING_NOTE_FRACTION')\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Remove NEWBORN admissions and create the target variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep every admission whose type is not NEWBORN (copy so the new column is safe to add).\n",
+    "adm_notes = admissions_notes.loc[admissions_notes['ADMISSION_TYPE'].ne('NEWBORN')].copy()\n",
+    "# Target: 1 when the next admission came in under 30 days; a missing value compares False -> 0.\n",
+    "adm_notes['READM_WITHIN_30'] = adm_notes['DAYS_TO_READMISSION'].lt(30).astype('int')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3004"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum(adm_notes['READM_WITHIN_30'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(51113, 10)"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adm_notes.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exclude patients that died during the admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rows with no recorded DEATHTIME, i.e. the patient did not die during the admission.\n",
+    "rows_not_death = adm_notes['DEATHTIME'].isnull()\n",
+    "# Shuffle all rows (frac=1); random_state pins the order so a re-run gives the same frame.\n",
+    "df_adm_notes_not_death = (\n",
+    "    adm_notes.loc[rows_not_death]\n",
+    "    .sample(frac=1, random_state=42)\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2963"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum(df_adm_notes_not_death['READM_WITHIN_30'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "45321"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df_adm_notes_not_death['READM_WITHIN_30'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Creation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exclude patients that died during the admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep admissions with no recorded DEATHTIME, then shuffle every row (frac=1).\n",
+    "# random_state pins the shuffle so the row order is identical on every re-run.\n",
+    "adm_notes = (\n",
+    "    adm_notes.loc[adm_notes['DEATHTIME'].isnull()]\n",
+    "    .sample(frac=1, random_state=42)\n",
+    "    .reset_index(drop=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SUBJECT_ID</th>\n",
+       "      <th>HADM_ID</th>\n",
+       "      <th>ADMITTIME</th>\n",
+       "      <th>DISCHTIME</th>\n",
+       "      <th>DAYS_TO_READMISSION</th>\n",
+       "      <th>NEXT_ADMITTIME</th>\n",
+       "      <th>ADMISSION_TYPE</th>\n",
+       "      <th>DEATHTIME</th>\n",
+       "      <th>TEXT</th>\n",
+       "      <th>READM_WITHIN_30</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6090</td>\n",
+       "      <td>175043</td>\n",
+       "      <td>2170-05-03 07:15:00</td>\n",
+       "      <td>2170-05-06 13:40:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2170-5-3**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>27901</td>\n",
+       "      <td>189210</td>\n",
+       "      <td>2101-06-21 07:15:00</td>\n",
+       "      <td>2101-07-13 15:00:00</td>\n",
+       "      <td>50.0</td>\n",
+       "      <td>2101-09-01 20:44:00</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2101-6-21**]              ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>69531</td>\n",
+       "      <td>102759</td>\n",
+       "      <td>2156-08-18 23:41:00</td>\n",
+       "      <td>2156-08-26 16:45:00</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>2156-09-03 21:11:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2156-8-18**]              ...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>8620</td>\n",
+       "      <td>148993</td>\n",
+       "      <td>2190-02-05 17:13:00</td>\n",
+       "      <td>2190-02-09 17:53:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2190-2-5**]     Discharge ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>24226</td>\n",
+       "      <td>118785</td>\n",
+       "      <td>2113-04-04 07:30:00</td>\n",
+       "      <td>2113-04-14 11:20:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Name:  [**Known lastname 10030**],[**Known fir...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45316</th>\n",
+       "      <td>17882</td>\n",
+       "      <td>157780</td>\n",
+       "      <td>2146-05-01 16:33:00</td>\n",
+       "      <td>2146-05-09 16:20:00</td>\n",
+       "      <td>412.0</td>\n",
+       "      <td>2147-06-25 17:42:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2146-5-1**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45317</th>\n",
+       "      <td>2184</td>\n",
+       "      <td>171742</td>\n",
+       "      <td>2154-04-21 19:25:00</td>\n",
+       "      <td>2154-04-25 11:49:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2154-4-21**]     Discharge...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45318</th>\n",
+       "      <td>28240</td>\n",
+       "      <td>151747</td>\n",
+       "      <td>2195-06-21 07:27:00</td>\n",
+       "      <td>2195-06-26 14:33:00</td>\n",
+       "      <td>154.0</td>\n",
+       "      <td>2195-11-28 00:15:00</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2195-6-21**]              ...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45319</th>\n",
+       "      <td>25201</td>\n",
+       "      <td>124241</td>\n",
+       "      <td>2149-06-02 08:00:00</td>\n",
+       "      <td>2149-06-11 13:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>ELECTIVE</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2149-6-2**]              D...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45320</th>\n",
+       "      <td>20855</td>\n",
+       "      <td>108604</td>\n",
+       "      <td>2161-07-31 17:00:00</td>\n",
+       "      <td>2161-08-20 16:00:00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>EMERGENCY</td>\n",
+       "      <td>NaT</td>\n",
+       "      <td>Admission Date:  [**2161-7-31**]       Dischar...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>45321 rows × 10 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME  \\\n",
+       "0            6090   175043 2170-05-03 07:15:00 2170-05-06 13:40:00   \n",
+       "1           27901   189210 2101-06-21 07:15:00 2101-07-13 15:00:00   \n",
+       "2           69531   102759 2156-08-18 23:41:00 2156-08-26 16:45:00   \n",
+       "3            8620   148993 2190-02-05 17:13:00 2190-02-09 17:53:00   \n",
+       "4           24226   118785 2113-04-04 07:30:00 2113-04-14 11:20:00   \n",
+       "...           ...      ...                 ...                 ...   \n",
+       "45316       17882   157780 2146-05-01 16:33:00 2146-05-09 16:20:00   \n",
+       "45317        2184   171742 2154-04-21 19:25:00 2154-04-25 11:49:00   \n",
+       "45318       28240   151747 2195-06-21 07:27:00 2195-06-26 14:33:00   \n",
+       "45319       25201   124241 2149-06-02 08:00:00 2149-06-11 13:00:00   \n",
+       "45320       20855   108604 2161-07-31 17:00:00 2161-08-20 16:00:00   \n",
+       "\n",
+       "       DAYS_TO_READMISSION      NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME  \\\n",
+       "0                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "1                     50.0 2101-09-01 20:44:00       ELECTIVE       NaT   \n",
+       "2                      8.0 2156-09-03 21:11:00      EMERGENCY       NaT   \n",
+       "3                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "4                      NaN                 NaT       ELECTIVE       NaT   \n",
+       "...                    ...                 ...            ...       ...   \n",
+       "45316                412.0 2147-06-25 17:42:00      EMERGENCY       NaT   \n",
+       "45317                  NaN                 NaT      EMERGENCY       NaT   \n",
+       "45318                154.0 2195-11-28 00:15:00      EMERGENCY       NaT   \n",
+       "45319                  NaN                 NaT       ELECTIVE       NaT   \n",
+       "45320                  NaN                 NaT      EMERGENCY       NaT   \n",
+       "\n",
+       "                                                    TEXT  READM_WITHIN_30  \n",
+       "0      Admission Date:  [**2170-5-3**]              D...                0  \n",
+       "1      Admission Date:  [**2101-6-21**]              ...                0  \n",
+       "2      Admission Date:  [**2156-8-18**]              ...                1  \n",
+       "3      Admission Date:  [**2190-2-5**]     Discharge ...                0  \n",
+       "4      Name:  [**Known lastname 10030**],[**Known fir...                0  \n",
+       "...                                                  ...              ...  \n",
+       "45316  Admission Date:  [**2146-5-1**]              D...                0  \n",
+       "45317  Admission Date:  [**2154-4-21**]     Discharge...                0  \n",
+       "45318  Admission Date:  [**2195-6-21**]              ...                0  \n",
+       "45319  Admission Date:  [**2149-6-2**]              D...                0  \n",
+       "45320  Admission Date:  [**2161-7-31**]       Dischar...                0  \n",
+       "\n",
+       "[45321 rows x 10 columns]"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adm_notes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adm_notes.to_csv(DIR + 'readmission.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Cleaning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Natural Language"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "string.punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(texts):\n",
+    "    \"\"\"Normalise a pandas Series of note texts.\n",
+    "\n",
+    "    Missing entries become a single space, newlines and carriage returns are\n",
+    "    turned into spaces, text is lower-cased, and ASCII punctuation and digits\n",
+    "    are deleted (not replaced). Returns a plain list of strings in input order.\n",
+    "    \"\"\"\n",
+    "    # One table does both jobs: '\\n' and '\\r' map to a space, punctuation and digits are dropped.\n",
+    "    strip_table = str.maketrans('\\n\\r', '  ', string.punctuation + '0123456789')\n",
+    "\n",
+    "    cleaned = []\n",
+    "    for note in texts.fillna(' '):\n",
+    "        cleaned.append(note.lower().translate(strip_table))\n",
+    "    return cleaned"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stop_words = stopwords.words('english')\n",
+    "stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "porter = PorterStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_stem(text):\n",
+    "    \"\"\"Tokenise `text`, drop tokens found in `stop_words`, and Porter-stem the rest.\n",
+    "\n",
+    "    Uses the notebook-level `stop_words` list and `porter` stemmer.\n",
+    "    Returns the stemmed tokens as a list, in their original order.\n",
+    "    \"\"\"\n",
+    "    return [\n",
+    "        porter.stem(token)\n",
+    "        for token in word_tokenize(text)\n",
+    "        if token not in stop_words\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2963\n",
+      "45321\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(sum(adm_notes['READM_WITHIN_30'] == 1) )\n",
+    "print(len(adm_notes['READM_WITHIN_30']))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}