# %% [markdown]
# # Loading the data and required libraries

# %%
# Standard library
import datetime
import os
import re
import string

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# %%
# Folder that holds NOTEEVENTS.csv and ADMISSIONS.csv.
# Set the READMISSION_DATA_DIR environment variable to run the notebook on
# another machine; the previous hard-coded location is kept as the default.
DIR = os.environ.get("READMISSION_DATA_DIR", "E:/Coding/Summer 2023/data/")

# Paths are built by plain string concatenation (DIR + "FILE.csv"),
# so DIR has to end with a path separator.
if not DIR.endswith(("/", "\\")):
    DIR += "/"

# %%
# low_memory=False reads each file in one pass so column dtypes are inferred
# consistently; memory_map=True maps the notes file into memory while parsing.
notes_df = pd.read_csv(DIR + "NOTEEVENTS.csv", low_memory=False, memory_map=True)
admissions_df = pd.read_csv(DIR + "ADMISSIONS.csv", low_memory=False)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ROW_IDSUBJECT_IDHADM_IDADMITTIMEDISCHTIMEDEATHTIMEADMISSION_TYPEADMISSION_LOCATIONDISCHARGE_LOCATIONINSURANCELANGUAGERELIGIONMARITAL_STATUSETHNICITYEDREGTIMEEDOUTTIMEDIAGNOSISHOSPITAL_EXPIRE_FLAGHAS_CHARTEVENTS_DATA
021221653152196-04-09 12:26:002196-04-10 15:54:00NaNEMERGENCYEMERGENCY ROOM ADMITDISC-TRAN CANCER/CHLDRN HPrivateNaNUNOBTAINABLEMARRIEDWHITE2196-04-09 10:06:002196-04-09 13:24:00BENZODIAZEPINE OVERDOSE01
122231522232153-09-03 07:15:002153-09-08 19:10:00NaNELECTIVEPHYS REFERRAL/NORMAL DELIHOME HEALTH CAREMedicareNaNCATHOLICMARRIEDWHITENaNNaNCORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS...01
223231243212157-10-18 19:34:002157-10-25 14:00:00NaNEMERGENCYTRANSFER FROM HOSP/EXTRAMHOME HEALTH CAREMedicareENGLCATHOLICMARRIEDWHITENaNNaNBRAIN MASS01
324241618592139-06-06 16:14:002139-06-09 12:48:00NaNEMERGENCYTRANSFER FROM HOSP/EXTRAMHOMEPrivateNaNPROTESTANT QUAKERSINGLEWHITENaNNaNINTERIOR MYOCARDIAL INFARCTION01
425251296352160-11-02 02:06:002160-11-05 14:55:00NaNEMERGENCYEMERGENCY ROOM ADMITHOMEPrivateNaNUNOBTAINABLEMARRIEDWHITE2160-11-02 01:01:002160-11-02 04:27:00ACUTE CORONARY SYNDROME01
\n", "
" ], "text/plain": [ " ROW_ID SUBJECT_ID HADM_ID ADMITTIME DISCHTIME \\\n", "0 21 22 165315 2196-04-09 12:26:00 2196-04-10 15:54:00 \n", "1 22 23 152223 2153-09-03 07:15:00 2153-09-08 19:10:00 \n", "2 23 23 124321 2157-10-18 19:34:00 2157-10-25 14:00:00 \n", "3 24 24 161859 2139-06-06 16:14:00 2139-06-09 12:48:00 \n", "4 25 25 129635 2160-11-02 02:06:00 2160-11-05 14:55:00 \n", "\n", " DEATHTIME ADMISSION_TYPE ADMISSION_LOCATION \\\n", "0 NaN EMERGENCY EMERGENCY ROOM ADMIT \n", "1 NaN ELECTIVE PHYS REFERRAL/NORMAL DELI \n", "2 NaN EMERGENCY TRANSFER FROM HOSP/EXTRAM \n", "3 NaN EMERGENCY TRANSFER FROM HOSP/EXTRAM \n", "4 NaN EMERGENCY EMERGENCY ROOM ADMIT \n", "\n", " DISCHARGE_LOCATION INSURANCE LANGUAGE RELIGION \\\n", "0 DISC-TRAN CANCER/CHLDRN H Private NaN UNOBTAINABLE \n", "1 HOME HEALTH CARE Medicare NaN CATHOLIC \n", "2 HOME HEALTH CARE Medicare ENGL CATHOLIC \n", "3 HOME Private NaN PROTESTANT QUAKER \n", "4 HOME Private NaN UNOBTAINABLE \n", "\n", " MARITAL_STATUS ETHNICITY EDREGTIME EDOUTTIME \\\n", "0 MARRIED WHITE 2196-04-09 10:06:00 2196-04-09 13:24:00 \n", "1 MARRIED WHITE NaN NaN \n", "2 MARRIED WHITE NaN NaN \n", "3 SINGLE WHITE NaN NaN \n", "4 MARRIED WHITE 2160-11-02 01:01:00 2160-11-02 04:27:00 \n", "\n", " DIAGNOSIS HOSPITAL_EXPIRE_FLAG \\\n", "0 BENZODIAZEPINE OVERDOSE 0 \n", "1 CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS... 
# %%
admissions_df.head()

# %%
admissions_df.shape

# %% [markdown]
# Types of admissions

# %%
admissions_df['ADMISSION_TYPE'].unique()

# %% [markdown]
# Check for missing values on the admission times

# %%
admissions_df['ADMITTIME'].isnull().sum()

# %% [markdown]
# Conversion of times to datetime type

# %%
# The three timestamp columns are read from the CSV as text; cast each one
# to datetime so they can be sorted and subtracted later on.
for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):
    admissions_df[time_col] = admissions_df[time_col].astype('datetime64[ns]')

# %% [markdown]
# Sort by subject and admission time and reset the data frame index.
# %%
# Put every patient's admissions in chronological order so that shift(-1)
# below returns the following admission of the same patient.
admissions_df = admissions_df.sort_values(['SUBJECT_ID', 'ADMITTIME'])
admissions_df = admissions_df.reset_index(drop=True)

# %%
# Time and type of the next admission of the same patient
# (missing for a patient's last admission).
admissions_df['NEXT_ADMITTIME'] = admissions_df.groupby('SUBJECT_ID')['ADMITTIME'].shift(-1)
admissions_df['NEXT_ADMISSION_TYPE'] = admissions_df.groupby('SUBJECT_ID')['ADMISSION_TYPE'].shift(-1)

# %%
# A following ELECTIVE admission is not counted as a readmission: blank it
# out so the back-fill below can pick up a later admission instead.
next_is_elective = admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE'
admissions_df.loc[next_is_elective, 'NEXT_ADMITTIME'] = pd.NaT
admissions_df.loc[next_is_elective, 'NEXT_ADMISSION_TYPE'] = np.nan  # np.NaN was removed in NumPy 2.0

# %% [markdown]
# Fill NA's with the next valid value. Previously sorted.

# %%
next_cols = ['NEXT_ADMITTIME', 'NEXT_ADMISSION_TYPE']
# GroupBy.bfill() replaces the deprecated fillna(method='bfill'); grouping by
# patient keeps values from leaking from one patient into another.
admissions_df[next_cols] = admissions_df.groupby('SUBJECT_ID')[next_cols].bfill()

# %% [markdown]
# Obtain days to readmission: from discharge to next readmission

# %%
# .dt.days keeps whole days only; rows without a next admission stay NaN.
admissions_df['DAYS_TO_READMISSION'] = (
    admissions_df['NEXT_ADMITTIME'] - admissions_df['DISCHTIME']
).dt.days

# %% [markdown]
# Number of readmissions

# %%
admissions_df['DAYS_TO_READMISSION'].notnull().sum()

# %% [markdown]
# Distribution of days to readmission

# %%
sns.set(rc={'figure.figsize':(7,3.5), 'axes.grid':True})
sns.set_style("whitegrid", {'axes.grid' : False})
# histplot returns an Axes; keep it under its own name and label the plot so
# the figure can be read on its own.
ax = sns.histplot(admissions_df['DAYS_TO_READMISSION'], kde=False, bins=15)
ax.set(
    xlabel='Days from discharge to next non-elective admission',
    ylabel='Number of admissions',
    title='Distribution of days to readmission',
)
fig = ax.get_figure()

# %%
# Median, upper quartile, lower quartile and mean, followed by the maximum
# as the cell's displayed value.
days_to_readmission = admissions_df['DAYS_TO_READMISSION']
print(days_to_readmission.quantile(0.5))
print(days_to_readmission.quantile(0.75))
print(days_to_readmission.quantile(0.25))
print(days_to_readmission.mean())
days_to_readmission.max()

# %% [markdown]
# ## Notes
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ROW_IDSUBJECT_IDHADM_IDCHARTDATECHARTTIMESTORETIMECATEGORYDESCRIPTIONCGIDISERRORTEXT
017422532167853.02151-08-04NaNNaNDischarge summaryReportNaNNaNAdmission Date: [**2151-7-16**] Dischar...
117513702107527.02118-06-14NaNNaNDischarge summaryReportNaNNaNAdmission Date: [**2118-6-2**] Discharg...
217613702167118.02119-05-25NaNNaNDischarge summaryReportNaNNaNAdmission Date: [**2119-5-4**] D...
317713702196489.02124-08-18NaNNaNDischarge summaryReportNaNNaNAdmission Date: [**2124-7-21**] ...
417826880135453.02162-03-25NaNNaNDischarge summaryReportNaNNaNAdmission Date: [**2162-3-3**] D...
# %%
notes_df.head()

# %%
notes_df.shape

# %%
notes_df['CATEGORY'].unique()

# %%
discharge_notes = notes_df[notes_df['CATEGORY'] == "Discharge summary"]
discharge_notes.shape

# %% [markdown]
# 6926 discharge notes belong to an admission that already has another
# discharge note earlier in the table (HADM_ID - ID of Admissions).
# `duplicated` flags every occurrence after the first, so this counts surplus
# notes rather than admissions.

# %%
discharge_notes.duplicated(['HADM_ID']).sum()

# %% [markdown]
# Take the last row per admission

# %%
# drop_duplicates(keep='last') keeps the last row, in file order, for each
# (SUBJECT_ID, HADM_ID) pair. Unlike groupby(...).nth(-1).reset_index(), the
# result has the same columns on every pandas version. Rows with a missing key
# are dropped first because groupby ignored them as well.
merge_keys = ['SUBJECT_ID', 'HADM_ID']
discharge_notes_ordered = (
    discharge_notes
    .dropna(subset=merge_keys)
    .drop_duplicates(subset=merge_keys, keep='last')
    .reset_index(drop=True)
)

# %%
discharge_notes_ordered.duplicated(['HADM_ID']).sum()

# %%
discharge_notes_ordered.shape

# %% [markdown]
# ## Merge Notes and Admissions

# %%
# Left join: every admission is kept, TEXT stays NaN when there is no
# discharge note. validate='many_to_one' makes pandas raise if the notes
# still contain more than one row per key.
admissions_notes = pd.merge(
    admissions_df[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_TO_READMISSION','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']],
    discharge_notes_ordered[['SUBJECT_ID', 'HADM_ID', 'TEXT']],
    on=['SUBJECT_ID', 'HADM_ID'], how='left', validate='many_to_one'
)
# The join must neither drop nor duplicate admissions.
assert len(admissions_notes) == len(admissions_df)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SUBJECT_IDHADM_IDADMITTIMEDISCHTIMEDAYS_TO_READMISSIONNEXT_ADMITTIMEADMISSION_TYPEDEATHTIMETEXT
021633532138-07-17 19:04:002138-07-21 15:48:00NaNNaTNEWBORNNaTNaN
131458342101-10-20 19:08:002101-10-31 13:58:00NaNNaTEMERGENCYNaTAdmission Date: [**2101-10-20**] Discharg...
241857772191-03-16 00:28:002191-03-23 18:41:00NaNNaTEMERGENCYNaTAdmission Date: [**2191-3-16**] Discharge...
351789802103-02-02 04:31:002103-02-04 12:15:00NaNNaTNEWBORNNaTNaN
461070642175-05-30 07:15:002175-06-15 16:00:00NaNNaTELECTIVENaTAdmission Date: [**2175-5-30**] Dischar...
..............................
58971999851766702181-01-27 02:47:002181-02-12 17:05:00NaNNaTEMERGENCYNaTAdmission Date: [**2181-1-27**] ...
58972999911511182184-12-24 08:30:002185-01-05 12:15:00NaNNaTELECTIVENaTAdmission Date: [**2184-12-24**] ...
58973999921970842144-07-25 18:03:002144-07-28 17:56:00NaNNaTEMERGENCYNaTAdmission Date: [**2144-7-25**] ...
58974999951378102147-02-08 08:00:002147-02-11 13:15:00NaNNaTELECTIVENaTAdmission Date: [**2147-2-8**] D...
58975999991133692117-12-30 07:15:002118-01-04 16:30:00NaNNaTELECTIVENaTAdmission Date: [**2117-12-30**] ...
\n", "

58976 rows × 9 columns

# %%
admissions_notes

# %% [markdown]
# 10% of admissions without discharge notes.

# %%
admissions_notes['TEXT'].isnull().mean()

# %% [markdown]
# 53% of NEWBORN admissions are missing a discharge note

# %%
# Share of admissions without a note, per admission type.
admissions_notes['TEXT'].isnull().groupby(admissions_notes['ADMISSION_TYPE']).mean()

# %% [markdown]
# Remove NEWBORN admissions and create the target variable

# %%
adm_notes = admissions_notes[admissions_notes['ADMISSION_TYPE'] != 'NEWBORN'].copy()
# A missing DAYS_TO_READMISSION compares as False, so admissions without a
# following admission get the label 0.
adm_notes['READM_WITHIN_30'] = (adm_notes['DAYS_TO_READMISSION'] < 30).astype('int')

# %%
adm_notes['READM_WITHIN_30'].sum()

# %%
adm_notes.shape

# %% [markdown]
# Exclude patients that died during the admission

# %%
# Fixed seed so the shuffled row order (and the exported CSV) is the same on
# every run.
SHUFFLE_SEED = 42

rows_not_death = adm_notes['DEATHTIME'].isnull()
df_adm_notes_not_death = (
    adm_notes.loc[rows_not_death]
    .sample(frac=1, random_state=SHUFFLE_SEED)  # frac=1 shuffles all rows
    .reset_index(drop=True)
)

# %%
df_adm_notes_not_death['READM_WITHIN_30'].sum()

# %%
len(df_adm_notes_not_death['READM_WITHIN_30'])

# %% [markdown]
# # Data Creation

# %% [markdown]
# Continue with the shuffled admissions of patients that did not die during
# the admission.

# %%
# Reuse the filtered, shuffled frame instead of filtering and shuffling
# adm_notes a second time; re-running this cell gives the same result.
adm_notes = df_adm_notes_not_death.copy()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SUBJECT_IDHADM_IDADMITTIMEDISCHTIMEDAYS_TO_READMISSIONNEXT_ADMITTIMEADMISSION_TYPEDEATHTIMETEXTREADM_WITHIN_30
060901750432170-05-03 07:15:002170-05-06 13:40:00NaNNaTELECTIVENaTAdmission Date: [**2170-5-3**] D...0
1279011892102101-06-21 07:15:002101-07-13 15:00:0050.02101-09-01 20:44:00ELECTIVENaTAdmission Date: [**2101-6-21**] ...0
2695311027592156-08-18 23:41:002156-08-26 16:45:008.02156-09-03 21:11:00EMERGENCYNaTAdmission Date: [**2156-8-18**] ...1
386201489932190-02-05 17:13:002190-02-09 17:53:00NaNNaTELECTIVENaTAdmission Date: [**2190-2-5**] Discharge ...0
4242261187852113-04-04 07:30:002113-04-14 11:20:00NaNNaTELECTIVENaTName: [**Known lastname 10030**],[**Known fir...0
.................................
45316178821577802146-05-01 16:33:002146-05-09 16:20:00412.02147-06-25 17:42:00EMERGENCYNaTAdmission Date: [**2146-5-1**] D...0
4531721841717422154-04-21 19:25:002154-04-25 11:49:00NaNNaTEMERGENCYNaTAdmission Date: [**2154-4-21**] Discharge...0
45318282401517472195-06-21 07:27:002195-06-26 14:33:00154.02195-11-28 00:15:00EMERGENCYNaTAdmission Date: [**2195-6-21**] ...0
45319252011242412149-06-02 08:00:002149-06-11 13:00:00NaNNaTELECTIVENaTAdmission Date: [**2149-6-2**] D...0
45320208551086042161-07-31 17:00:002161-08-20 16:00:00NaNNaTEMERGENCYNaTAdmission Date: [**2161-7-31**] Dischar...0
\n", "

45321 rows × 10 columns

\n", "
" ], "text/plain": [ " SUBJECT_ID HADM_ID ADMITTIME DISCHTIME \\\n", "0 6090 175043 2170-05-03 07:15:00 2170-05-06 13:40:00 \n", "1 27901 189210 2101-06-21 07:15:00 2101-07-13 15:00:00 \n", "2 69531 102759 2156-08-18 23:41:00 2156-08-26 16:45:00 \n", "3 8620 148993 2190-02-05 17:13:00 2190-02-09 17:53:00 \n", "4 24226 118785 2113-04-04 07:30:00 2113-04-14 11:20:00 \n", "... ... ... ... ... \n", "45316 17882 157780 2146-05-01 16:33:00 2146-05-09 16:20:00 \n", "45317 2184 171742 2154-04-21 19:25:00 2154-04-25 11:49:00 \n", "45318 28240 151747 2195-06-21 07:27:00 2195-06-26 14:33:00 \n", "45319 25201 124241 2149-06-02 08:00:00 2149-06-11 13:00:00 \n", "45320 20855 108604 2161-07-31 17:00:00 2161-08-20 16:00:00 \n", "\n", " DAYS_TO_READMISSION NEXT_ADMITTIME ADMISSION_TYPE DEATHTIME \\\n", "0 NaN NaT ELECTIVE NaT \n", "1 50.0 2101-09-01 20:44:00 ELECTIVE NaT \n", "2 8.0 2156-09-03 21:11:00 EMERGENCY NaT \n", "3 NaN NaT ELECTIVE NaT \n", "4 NaN NaT ELECTIVE NaT \n", "... ... ... ... ... \n", "45316 412.0 2147-06-25 17:42:00 EMERGENCY NaT \n", "45317 NaN NaT EMERGENCY NaT \n", "45318 154.0 2195-11-28 00:15:00 EMERGENCY NaT \n", "45319 NaN NaT ELECTIVE NaT \n", "45320 NaN NaT EMERGENCY NaT \n", "\n", " TEXT READM_WITHIN_30 \n", "0 Admission Date: [**2170-5-3**] D... 0 \n", "1 Admission Date: [**2101-6-21**] ... 0 \n", "2 Admission Date: [**2156-8-18**] ... 1 \n", "3 Admission Date: [**2190-2-5**] Discharge ... 0 \n", "4 Name: [**Known lastname 10030**],[**Known fir... 0 \n", "... ... ... \n", "45316 Admission Date: [**2146-5-1**] D... 0 \n", "45317 Admission Date: [**2154-4-21**] Discharge... 0 \n", "45318 Admission Date: [**2195-6-21**] ... 0 \n", "45319 Admission Date: [**2149-6-2**] D... 0 \n", "45320 Admission Date: [**2161-7-31**] Dischar... 
# %%
adm_notes

# %%
adm_notes.to_csv(DIR + 'readmission.csv', index=False)

# %% [markdown]
# # Data Cleaning

# %% [markdown]
# ## Natural Language

# %%
string.punctuation

# %%
def clean_text(texts):
    """Normalise raw note texts before tokenisation.

    Missing entries become a single space. Every text is lower-cased, and each
    punctuation character, digit, line feed and carriage return is replaced by
    one space.

    Parameters
    ----------
    texts : pandas.Series
        Raw texts; missing values are allowed.

    Returns
    -------
    list of str
        One cleaned string per input row, in input order.
    """
    # Map to a space instead of deleting: deletion glued neighbouring words
    # together when only punctuation separated them ("chf,diabetes" became
    # "chfdiabetes"). Runs of spaces are harmless for the tokenizer.
    to_blank = string.punctuation + '0123456789' + '\n\r'
    table = str.maketrans(dict.fromkeys(to_blank, ' '))

    texts = texts.fillna(' ')
    return [text.lower().translate(table) for text in texts]

# %%
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

# %%
# English stop words plus words from the note headers and the
# de-identification placeholders.
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

# %%
porter = PorterStemmer()

# %%
def tokenize_stem(text):
    """Tokenise a cleaned text, drop stop words and stem what is left.

    Parameters
    ----------
    text : str
        A text as produced by ``clean_text``.

    Returns
    -------
    list of str
        Porter stems of the tokens that are not in ``stop_words``.
    """
    # Build a set per call: membership tests become O(1) while the global
    # stop_words stays a list and later edits to it are still picked up.
    blocked = set(stop_words)
    # Stop words are removed before stemming, so they are matched in their
    # unstemmed form.
    return [porter.stem(word) for word in word_tokenize(text) if word not in blocked]

# %%
# Positive labels and total number of rows in the final data set.
print((adm_notes['READM_WITHIN_30'] == 1).sum())
print(len(adm_notes['READM_WITHIN_30']))