607 lines (606 with data), 22.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import string\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# File names\n",
"DATA_PATH = \"~/path/to/data.csv\"\n",
"# Output path without extension; the last cell appends \".csv\" when writing.\n",
"WRITE_DATA_TO = \"\"\n",
"\n",
"# Column names\n",
"TEXT_COLNAME = \"Text\"\n",
"OUTCOME = \"ReadmissionInLessThan30Days\"\n",
"VISIT_ID = \"ChartGUID\"\n",
"CLIENT_ID = \"ClientGUID\"\n",
"\n",
"# Identifier columns are read as strings. The dtype mapping is keyed by the\n",
"# constants above so that renaming a column in one place keeps it a string.\n",
"dataframe = pd.read_csv(DATA_PATH, dtype={VISIT_ID: str, CLIENT_ID: str})\n",
"text = dataframe[TEXT_COLNAME]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove fluff "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Regular expressions for boilerplate (\"fluff\") that is stripped from the notes.\n",
"# Every fragment that contains a backslash is a raw string, so regex escapes such\n",
"# as \\s and \\d reach the regex engine verbatim.\n",
"\n",
"# Dates often appear as part of fluff, e.g. \"admission date: XXX\" or \"dictated: XXX\".\n",
"MONTH = \"[jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|\"\n",
"MONTH += \"[jJ]uly|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember\"\n",
"MONTH = \"(\" + MONTH + \"|[jJ]an|[fF]eb|[mM]ar|[aA]pr|[jJ]un|[jJ]ul|[aA]ug|[sS]ept|[oO]ct|[nN]ov|[dD]ec)\"\n",
"DATENOY = \"(\" + MONTH + r\"\\s)(\\d+)\" # june 5\n",
"DATEWDS = \"((\" + MONTH + r\"\\s)(\\d+)(\\s|\\,)\\s(\\d+))\" # june 5, 1995\n",
"DATEWDS_EU = r\"((\\d+\\s*)\" + MONTH + r\"(\\s|\\,)\\s(\\d+))\" # 5 june, 1995\n",
"DATENMS = r\"((\\d{1,2}(\\/|-))+\\d{1,2}(\\/\\s?|-)\\d{2,4})\" # 06/05/1995, 06-05-1995\n",
"# Longer date forms come first so that DATENOY cannot match a prefix of a full date.\n",
"DATE = \"(\" + DATEWDS + \"|\" + DATEWDS_EU + \"|\" + DATENMS + \"|\" + DATENOY + \")\"\n",
"\n",
"\n",
"# Preamble fluff\n",
"sarmem_re = r'(smh( hospital)?|sarasota memorial hospital( -)?|sarasota, fl(\\.)?)'\n",
"dee_re = r'(discharge summary|emergency admission h&p|ecc to admi(t|ssion) h&p)'\n",
"patient_re = r'((pt:|patient name:)(\\s?)(\\w+),(\\s?)(\\w+)(\\s\\w(\\.)?\\s)?)'\n",
"visitid_re = r'(patient visit id[#: ]+\\d+)'\n",
"pinfo_re = r'((dob: ' + DATE + r')|(mrn: \\d+)|(acct: \\d+))'\n",
"admdate_re = r'((date of admission|admission date|admitting date|admit date)\\s*:\\s*' + DATE + ')'\n",
"dischdate_re = r'((date of discharge|discharge date)\\s*:\\s*' + DATE + ')'\n",
"# 'date of discharge: ' followed by something that is not a number. The lookahead\n",
"# checks the next character without removing it from the note.\n",
"dischnodate_re = r'(date of discharge: (?=[^\\d]))'\n",
"hospitalist_re = r'((medical )?hospitalist(s)?( group| program)?(\\.)?)'\n",
"paw_re = r'(patient:[a-z, \\d:]*account:[ \\d]+work type: discharge summary(\\.)?)'\n",
"pid_re = r'((p)?atient\\s*)?(visit(ation)?\\s*)?id(entification)?\\s*(number|#)(:)?\\s*(#)?(\\s*\\d+)?'\n",
"\n",
"preamble_re = '(' + '|'.join([\n",
"    sarmem_re, dee_re, patient_re, pinfo_re,\n",
"    admdate_re, dischdate_re, dischnodate_re,\n",
"    hospitalist_re, paw_re, pid_re,\n",
"]) + ')'\n",
"\n",
"# Body fluff\n",
"thisadm_re = ' this admission' # usually 'history of this admission:', 'operation performed this admission:', etc.\n",
"disdate_re = 'discharge date:'\n",
"dos_re = r'(date of service\\s*:\\s*' + DATE + ')'\n",
"dod_re = r'(date of dictation\\s*:\\s*' + DATE + ')'\n",
"doe_re = r'(date of evaluation\\s*:\\s*' + DATE + ')'\n",
"# dod_re and doe_re were defined but never used before; they are part of the body fluff too.\n",
"body_re = '(' + '|'.join([thisadm_re, disdate_re, dos_re, dod_re, doe_re]) + ')'\n",
"\n",
"# End fluff: once a dictation marker or a cc: list is found, the rest of the line is dropped too.\n",
"dictated_re = r'((\\sd:( )?' + '(error|' + DATE + '))|dictated for:)'\n",
"cc_re = r'(\\scc:)'\n",
"end_re = '((' + dictated_re + '|' + cc_re + ').*)'\n",
"\n",
"# Combine all fluff into one expression:\n",
"fluff_re = '(' + preamble_re + '|' + body_re + '|' + end_re + ')'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Remove fluff from the text.\n",
"# regex=True is spelled out because Series.str.replace treats the pattern as a\n",
"# literal string by default since pandas 2.0, which would remove nothing here.\n",
"text = text.str.replace(fluff_re, \"\", regex=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Find section headers\n",
"\n",
"We count occurrences of two consecutive words, each three letters or longer and separated by a space or a hyphen, where the second word is followed by a colon (:).\n",
"\n",
"If the first word is followed by a period (.), it belongs to the previous sentence, so we replace it with '.' and treat the second word as a one-word header.\n",
"\n",
"### Bug: We don't handle one-word headers at the start of the text, i.e., not preceded by '. '"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Candidate headers: two words of 3+ characters, the second one ending in a colon.\n",
"# Groups: 0 = first word, 1 = optional period after it, 2 = separator, 3 = second word with colon.\n",
"# The pattern is a raw string so that \\w and \\. are regex escapes, not string escapes.\n",
"headers = text.str.extractall(r'( \\w{3,})(\\.)?( |-)(\\w{3,}:)')\n",
"\n",
"# When the first word ends a sentence it is not part of the header: keep '.' in its place.\n",
"hpairs = pd.concat([headers[0].where(headers[1] != '.', '.'), headers[3]], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>3</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>match</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">0</th>\n",
" <th>0</th>\n",
" <td>medical</td>\n",
" <td>history:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>preadmission</td>\n",
" <td>medications:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>laboratory</td>\n",
" <td>data:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>physical</td>\n",
" <td>examination:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>vital</td>\n",
" <td>signs:</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 3\n",
" match \n",
"0 0 medical history:\n",
" 1 preadmission medications:\n",
" 2 laboratory data:\n",
" 3 physical examination:\n",
" 4 vital signs:"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hpairs.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
" hospital course: 10325\n",
" discharge diagnoses: 6442\n",
" discharge medications: 5439\n",
" present illness: 4177\n",
". disposition: 2615\n",
" physical examination: 2338\n",
" vital signs: 2122\n",
". abdomen: 1940\n",
". extremities: 1909\n",
" discharge instructions: 1783\n",
" discharge diagnosis: 1753\n",
" laboratory data: 1559\n",
". lungs: 1391\n",
". medications: 1389\n",
". procedures: 1378\n",
". diet: 1355\n",
". followup: 1351\n",
" final diagnoses: 1199\n",
". heart: 1175\n",
". consultations: 1128\n",
". neck: 1070\n",
". activity: 1023\n",
". plan: 1022\n",
". procedure: 965\n",
". consultants: 917\n",
" admitting diagnosis: 835\n",
" medical history: 810\n",
" for admission: 810\n",
". heent: 801\n",
" admission diagnosis: 792\n",
" ... \n",
" data demonstrated: 1\n",
" report today: 1\n",
" physician time: 1\n",
" carotid ultrasound: 1\n",
" given for: 1\n",
" final laboratory: 1\n",
" and intolerances: 1\n",
" cardiac rehab: 1\n",
" the include: 1\n",
". liver: 1\n",
" cardiology consultation: 1\n",
" vital sign: 1\n",
" admission problems: 1\n",
" lobe pneumonia: 1\n",
" june 30th: 1\n",
" test procedures: 1\n",
" discharge instrucitons: 1\n",
" operative pathology: 1\n",
" enterococcus cholangitis: 1\n",
". lymphoid: 1\n",
" sheet plan: 1\n",
" discharge day: 1\n",
" original laboratory: 1\n",
" admittign phsyician: 1\n",
" postoperative laboratory: 1\n",
" postoperative incidences: 1\n",
" this institution: 1\n",
" tentative diagnosis: 1\n",
" hospital consult: 1\n",
" work plan: 1\n",
"dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Here are all the headers we find.\n",
"(hpairs[hpairs.columns[0]] + ' ' + hpairs[hpairs.columns[1]]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# We need to select from the headers above:\n",
"def parseHeader(h0, h1, h2, h3, h4, h5):\n",
"    \"\"\"Join the words before 'present illness:' that come after the last period.\n",
"\n",
"    h0, h2 and h4 are the three words preceding the header, in text order.\n",
"    h1, h3 and h5 are '.' when the corresponding word is followed by a period.\n",
"    Returns '' when the word right before the header ends a sentence.\n",
"    \"\"\"\n",
"    if h5 == '.':\n",
"        return \"\"\n",
"    if h3 == '.':\n",
"        return h4\n",
"    if h1 == '.':\n",
"        return h2 + ' ' + h4\n",
"    return h0 + ' ' + h2 + ' ' + h4\n",
"\n",
"# otypes is given explicitly so that the vectorized function also works when there\n",
"# are no matches: without it np.vectorize raises ValueError on empty input.\n",
"parseHeader = np.vectorize(parseHeader, otypes=[object])\n",
"\n",
"# Up to three words (each with an optional trailing period) in front of 'present illness:'.\n",
"# Note that this rebinds `headers`, which held the two-word matches until here.\n",
"headers = text.str.extractall(r'(\\s+\\w+)(\\.)?(\\s+\\w+)(\\.)?(\\s+\\w+)(\\.)?\\s+present\\s+illness:')\n",
"htuples = pd.Series(parseHeader(headers[0], headers[1], headers[2], headers[3], headers[4], headers[5]))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((3356,), (11,))"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"htuples.shape, htuples[htuples.str.find('history')==-1].shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Handmade list of section headers of text, based on the output above.\n",
"\n",
"There are about 60 \"header names\", but some of them seem to be synonyms, like 'brief history:' and '. history:'.\n",
"So, I've made a dictionary of header_name : header pairs that maps each header name, as it appears in the text, to a standardized header.\n",
"\n",
"Matching on single spaces seems to work as well as matching on \\\\s+, but it would be better to first replace every run of whitespace in the text with a single space."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Maps a header name exactly as it appears in the text to its standardized header.\n",
"# Several names may share one standardized header; their text ends up in the same column.\n",
"# A leading '. ' marks a one-word header that starts a new sentence.\n",
"# NOTE(review): '. neurologic', '. activities' and '. condition' have no trailing colon,\n",
"# unlike every other key -- confirm that this is intended.\n",
"std_headers = {\n",
"    'hospital course:': 'course',\n",
"    'discharge diagnoses:': 'discharge diagnoses',\n",
"    'discharge medications:': 'discharge medications',\n",
"    'present illness:': 'present illness',\n",
"    '. disposition:': 'disposition', # discharge disposition and instructions\n",
"    'physical examination:': 'physical examination', # often a list, different from h&p\n",
"    'discharge diagnosis:': 'discharge diagnoses',\n",
"    'vital signs:': 'vital signs',\n",
"    '. abdomen:': 'abdomen',\n",
"    '. extremities:': 'extremities',\n",
"    'discharge instructions:': 'disposition',\n",
"    'laboratory data:': 'laboratory data',\n",
"    '. procedures:': 'procedures',\n",
"    'final diagnoses:': 'discharge diagnoses',\n",
"    '. lungs:': 'lungs',\n",
"    '. medications:': 'discharge medications', # looks more common than 'current meds'\n",
"    '. diet:': 'discharge diet', # recommended at discharge\n",
"    '. heart:': 'heart',\n",
"    '. followup:': 'disposition',\n",
"    '. neck:': 'neck',\n",
"    '. plan:': 'disposition',\n",
"    'medical history:': 'history',\n",
"    '. consultations:': 'consultations', # useful? quality of care indicator?\n",
"    'for admission:': 'history', # reason for admission, looks reasonable\n",
"    'admission diagnosis:': 'admission diagnoses',\n",
"    'admitting diagnosis:': 'admission diagnoses',\n",
"    '. procedure:': 'procedures',\n",
"    '. activity:': 'activity', # recommended activity level, could be 'disposition'\n",
"    '. consultants:': 'consultations', # useful? quality of care indicator?\n",
"    'final diagnosis:': 'discharge diagnoses',\n",
"    '. history:': 'history',\n",
"    'procedures performed:': 'procedures',\n",
"    'admission diagnoses:': 'admission diagnoses',\n",
"    'admitting diagnoses:': 'admission diagnoses',\n",
"    '. heent:': 'heent', # head, ears, eyes, nose, and throat\n",
"    'discharge condition:': 'discharge condition', # usually 'improved' or 'stable'\n",
"    '. allergies:': 'allergies',\n",
"    'procedure performed:': 'procedures', # same section as 'procedures performed:' and '. procedure:'\n",
"    'observation details:': 'history', # haven't checked, looks like physical exam\n",
"    'brief history:': 'history',\n",
"    '. chest:': 'chest',\n",
"    '. complications:': 'complications',\n",
"    '. cardiovascular:': 'heart',\n",
"    'principal diagnosis:': 'admission diagnoses', # haven't checked if this is reasonable\n",
"    '. neurologic': 'neurologic',\n",
"    'social history:': 'social history',\n",
"    'and physical:': 'history',\n",
"    'chief complaint:': 'history', # sometimes 'history of chief complaint', sometimes like admission diagnosis\n",
"    'core measures:': 'core measures', # seems to be risk of heart attack at discharge, different from 'heart' and 'cardiac'\n",
"    'discharge disposition:': 'disposition', # discharge disposition and instructions\n",
"    'following medications:': 'discharge medications', # looks reasonable, 'discharged with the ...'\n",
"    'discharge diet:': 'discharge diet',\n",
"    '. cardiac:': 'heart',\n",
"    '. general:': 'general', # part of physical exam\n",
"    '. prognosis:': 'prognosis', # often without colon, as in \"poor prognosis\"\n",
"    'principal diagnoses:': 'admission diagnoses', # haven't checked if this is reasonable\n",
"    '. activities': 'activity', # recommended at discharge, could be 'disposition'\n",
"    '. condition': 'discharge condition',\n",
"    'laboratory studies:': 'laboratory data',\n",
"    'discharge plan:': 'disposition',\n",
"    'postoperative diagnosis:': 'discharge diagnoses',\n",
"    'preoperative diagnosis:': 'admission diagnoses',\n",
"    # plus some one-word entries that begin the text after fluff was removed\n",
"    ' diagnoses:': 'admission diagnoses',\n",
"    ' diagnosis:': 'admission diagnoses',\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# One column per header name: the position of its first occurrence in each row of text\n",
"# (-1 when the header name is not present).\n",
"hdr_indices = pd.DataFrame({hdrname: text.str.find(hdrname) for hdrname in std_headers})"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def makeSections(t, h):\n",
"    \"\"\"Split one text into a dictionary of header : text pairs.\n",
"\n",
"    Input:\n",
"      t: text\n",
"      h: array of indices, indexed by header names.\n",
"         h[n] = index in t of first occurrence of n (negative when n is absent)\n",
"\n",
"    Global: std_headers\n",
"\n",
"    Output:\n",
"      a dictionary of header : text pairs. Text before the first header is stored\n",
"      under ''. Pieces whose header names map to the same header are joined with '. '.\n",
"      When no header name is present, the whole text is returned under ''.\n",
"    \"\"\"\n",
"    # (position, header name) of every header name present in the text, in text order.\n",
"    # int() because positions may arrive as floats, which cannot be used as slice bounds.\n",
"    found = sorted((int(h[name]), name) for name in std_headers if h[name] >= 0)\n",
"\n",
"    # if there are no headers in the text, return the whole text\n",
"    if not found:\n",
"        return {'': t}\n",
"\n",
"    # Drop matches that start inside the characters of the previous header name:\n",
"    # ' diagnoses:' also occurs inside 'discharge diagnoses:', and treating that inner\n",
"    # match as a header would file the section text under the wrong header.\n",
"    kept = []\n",
"    covered = 0\n",
"    for idx, name in found:\n",
"        if idx < covered:\n",
"            continue\n",
"        kept.append((idx, name))\n",
"        covered = idx + len(name)\n",
"\n",
"    sections = {}\n",
"\n",
"    def add(hdr, piece):\n",
"        # a header seen before gets the new piece appended as another sentence\n",
"        if hdr in sections:\n",
"            sections[hdr] += '. ' + piece\n",
"        else:\n",
"            sections[hdr] = piece\n",
"\n",
"    # keep any text before the first header under the empty header\n",
"    start, hdr = 0, ''\n",
"    for idx, name in kept:\n",
"        add(hdr, t[start:idx])\n",
"        start, hdr = idx + len(name), std_headers[name]\n",
"    # the last section runs to the end of the text\n",
"    add(hdr, t[start:])\n",
"\n",
"    return sections"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Split every row of text into its sections.\n",
"# Rows are paired by position instead of looping over range(text.count()): count()\n",
"# leaves out missing values, so the last rows were skipped whenever a text was missing,\n",
"# and looking rows up by number only works with a default 0..n-1 index.\n",
"textSectionsList = [\n",
"    makeSections(note, positions)\n",
"    for note, (_, positions) in zip(text, hdr_indices.iterrows())\n",
"]\n",
"\n",
"# One row per text, one column per header; the index of `text` is kept so that the\n",
"# assignments below line up row for row.\n",
"textSections = pd.DataFrame(textSectionsList, index=text.index)\n",
"\n",
"# Re-link text with ChartGUID:\n",
"textSections[VISIT_ID] = dataframe[VISIT_ID]\n",
"textSections[OUTCOME] = dataframe[OUTCOME]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>abdomen</th>\n",
" <th>activity</th>\n",
" <th>admission diagnoses</th>\n",
" <th>allergies</th>\n",
" <th>chest</th>\n",
" <th>complications</th>\n",
" <th>consultations</th>\n",
" <th>core measures</th>\n",
" <th>course</th>\n",
" <th>...</th>\n",
" <th>neck</th>\n",
" <th>neurologic</th>\n",
" <th>physical examination</th>\n",
" <th>present illness</th>\n",
" <th>procedures</th>\n",
" <th>procedures performed</th>\n",
" <th>prognosis</th>\n",
" <th>social history</th>\n",
" <th>vital signs</th>\n",
" <th>ChartGUID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [, abdomen, activity, admission diagnoses, allergies, chest, complications, consultations, core measures, course, discharge condition, discharge diagnoses, discharge diet, discharge medications, disposition, extremities, general, heart, heent, history, laboratory data, lungs, neck, neurologic, physical examination, present illness, procedures, procedures performed, prognosis, social history, vital signs, ChartGUID]\n",
"Index: []\n",
"\n",
"[0 rows x 32 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Showing data frame structure (content omitted to preserve anonymity)\n",
"textSections.head(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write the sections, one row per text, to WRITE_DATA_TO with a .csv extension (index not written).\n",
"textSections.to_csv(\"{}.csv\".format(WRITE_DATA_TO), index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}