In [None]:
import pandas as pd
import string
import numpy as np

In [None]:
# File names
DATA_PATH     = "~/path/to/data.csv"   # input CSV (placeholder path)
WRITE_DATA_TO = ""                     # output path stem; ".csv" is appended when the result is written

# Column names
TEXT_COLNAME  = "Text"                          # free-text column that gets split into sections
OUTCOME       = "ReadmissionInLessThan30Days"   # outcome column carried over to the output
VISIT_ID      = "ChartGUID"                     # visit identifier carried over to the output

# Read the identifier columns as strings. The visit-id dtype is keyed on
# VISIT_ID so that changing the constant above keeps the two in sync.
dataframe = pd.read_csv(DATA_PATH, dtype={VISIT_ID: str, "ClientGUID": str})
text      = dataframe[TEXT_COLNAME]

# Remove fluff 

In [12]:
# Dates often appear as part of fluff, e.g. "admission date: XXX" or "dictated: XXX".
#
# Every pattern fragment below is a raw string, so regex escapes such as \s and \d
# are passed through verbatim instead of being parsed as (invalid) Python string
# escapes. The resulting pattern text is unchanged.
MONTH   = r"[jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|"
MONTH  += r"[jJ]uly|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember"
# Full month names come first in the alternation so they are tried before the abbreviations.
MONTH   = r"(" + MONTH + r"|[jJ]an|[fF]eb|[mM]ar|[aA]pr|[jJ]un|[jJ]ul|[aA]ug|[sS]ept|[oO]ct|[nN]ov|[dD]ec)"
DATENOY = r"(" + MONTH + r"\s)(\d+)"                            # june 5
DATEWDS = r"((" + MONTH + r"\s)(\d+)(\s|\,)\s(\d+))"            # june 5, 1995
DATEWDS_EU = r"((\d+\s*)" + MONTH + r"(\s|\,)\s(\d+))"          # 5 june, 1995
DATENMS = r"((\d{1,2}(\/|-))+\d{1,2}(\/\s?|-)\d{2,4})"          # 06/05/1995, 06-05-1995
# DATENOY is listed last so the longer "month day, year" form is tried first.
DATE    = r"(" + DATEWDS + r"|" + DATEWDS_EU + r"|" + DATENMS + r"|" + DATENOY + r")"


# Preamble fluff
sarmem_re = r'(smh( hospital)?|sarasota memorial hospital( -)?|sarasota, fl(\.)?)'
dee_re = r'(discharge summary|emergency admission h&p|ecc to admi(t|ssion) h&p)'
patient_re = r'((pt:|patient name:)(\s?)(\w+),(\s?)(\w+)(\s\w(\.)?\s)?)'
visitid_re = r'(patient visit id[#: ]+\d+)'
pinfo_re = r'((dob: ' + DATE + r')|(mrn: \d+)|(acct: \d+))'
admdate_re = r'((date of admission|admission date|admitting date|admit date)\s*:\s*' + DATE + r')'
dischdate_re = r'((date of discharge|discharge date)\s*:\s*' + DATE + r')'
dischnodate_re = r'(date of discharge: [^\d])'
hospitalist_re = r'((medical )?hospitalist(s)?( group| program)?(\.)?)'
paw_re = r'(patient:[a-z, \d:]*account:[ \d]+work type: discharge summary(\.)?)'
pid_re = r'((p)?atient\s*)?(visit(ation)?\s*)?id(entification)?\s*(number|#)(:)?\s*(#)?(\s*\d+)?'

# NOTE(review): visitid_re is defined above but is not part of preamble_re — confirm this is intended.
preamble_re = sarmem_re + '|' + dee_re + '|' + patient_re + '|' + pinfo_re
preamble_re += '|' + admdate_re + '|' + dischdate_re + '|' + dischnodate_re
preamble_re += '|' + hospitalist_re + '|' + paw_re + '|' + pid_re
preamble_re = '(' + preamble_re + ')'

# Body fluff
thisadm_re = ' this admission' # usually 'history of this admission:', 'operation performed this admission:', etc.
disdate_re = 'discharge date:'
dos_re = r'(date of service\s*:\s*' + DATE + r')'
dod_re = r'(date of dictation\s*:\s*' + DATE + r')'
doe_re = r'(date of evaluation\s*:\s*' + DATE + r')'
# NOTE(review): dod_re and doe_re are defined but not included in body_re — confirm this is intended.
body_re = '(' + thisadm_re + '|' + disdate_re + '|' + dos_re + ')'

# End fluff: everything from the dictation marker or the "cc:" line to the end of the line is dropped.
dictated_re = r'((\sd:( )?' + '(error|' + DATE + '))|dictated for:)'
cc_re = r'(\scc:)'
end_re = '((' + dictated_re + '|' + cc_re + ').*)'

# Combine all fluff into one expression:
fluff_re = '(' + preamble_re + '|' + body_re + '|' + end_re + ')'

In [13]:
# Remove fluff from the text.
# fluff_re is a regular expression, so regex matching is requested explicitly:
# recent pandas versions treat the pattern as a literal string by default,
# which would leave the text untouched.
text = text.str.replace(fluff_re, "", regex=True)

# Find section headers

We count pairs of adjacent words, each three or more characters long, where the second word is followed by a colon (:).

If the first word is followed by a period (.), it belongs to the previous sentence, so we drop it and record '.' in its place.

### Bug: We don't handle one-word headers at the start of the text, i.e., not preceded by '. '

In [14]:
# Capture groups: 0 = first word (with leading space), 1 = optional period after it,
# 2 = separator (space or hyphen), 3 = second word including its trailing colon.
# The pattern is a raw string so \w and \. reach the regex engine verbatim.
headers = text.str.extractall(r'( \w{3,})(\.)?( |-)(\w{3,}:)')
# Where the first word was followed by a period, replace it with '.'; then pair it with the second word.
hpairs  = pd.concat([headers[0].where(headers[1] != '.','.'), headers[3]],axis=1)

In [16]:
# Peek at the first five (first word, second word) header pairs.
hpairs.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,medical,history:
0,1,preadmission,medications:
0,2,laboratory,data:
0,3,physical,examination:
0,4,vital,signs:


In [17]:
# Tally how often each "<first word> <second word>" header candidate occurs.
first_col, second_col = hpairs.columns[0], hpairs.columns[1]
(hpairs[first_col] + ' ' + hpairs[second_col]).value_counts()

 hospital course:             10325
 discharge diagnoses:          6442
 discharge medications:        5439
 present illness:              4177
. disposition:                 2615
 physical examination:         2338
 vital signs:                  2122
. abdomen:                     1940
. extremities:                 1909
 discharge instructions:       1783
 discharge diagnosis:          1753
 laboratory data:              1559
. lungs:                       1391
. medications:                 1389
. procedures:                  1378
. diet:                        1355
. followup:                    1351
 final diagnoses:              1199
. heart:                       1175
. consultations:               1128
. neck:                        1070
. activity:                    1023
. plan:                        1022
. procedure:                    965
. consultants:                  917
 admitting diagnosis:           835
 medical history:               810
 for admission:             

In [19]:
# We need to select from the headers above:
def _parse_header_words(h0, h1, h2, h3, h4, h5):
    """Join the words that belong to one header, stopping at sentence breaks.

    h0, h2, h4 are three consecutive words; h1, h3, h5 are the optional
    period captured after each of them ('.' when present).

    Returns the words that follow the last period:
      - ""                     when h5 is '.'
      - h4                     when h3 is '.'
      - h2 + ' ' + h4          when h1 is '.'
      - h0 + ' ' + h2 + ' ' + h4 otherwise
    """
    if h5 == '.':
        return ""
    if h3 == '.':
        return h4
    if h1 == '.':
        return h2 + ' ' + h4
    return h0 + ' ' + h2 + ' ' + h4


# Element-wise version of the rule above. The output type is declared up front
# because np.vectorize otherwise has to call the function on the first element
# to infer it, which raises ValueError when the inputs are empty.
parseHeader = np.vectorize(_parse_header_words, otypes=[str])
    
# Capture the three words (each with an optional trailing period) that precede
# "present illness:". Even-numbered groups are the words, odd-numbered groups the periods.
# The pattern is a raw string so \s, \w and \. reach the regex engine verbatim.
headers = text.str.extractall(r'(\s+\w+)(\.)?(\s+\w+)(\.)?(\s+\w+)(\.)?\s+present\s+illness:')
htuples = pd.Series(parseHeader(headers[0], headers[1], headers[2], headers[3], headers[4], headers[5]))

In [21]:
# Number of header tuples overall vs. those whose text does not contain 'history'.
lacks_history = htuples.str.find('history') == -1
htuples.shape, htuples[lacks_history].shape

((3356,), (11,))

### Handmade list of section headers of text, based on the output above.

There are about 60 "header names" but some of them seem to be synonyms, like 'brief history:' and '. history:'.
So, I've made a dictionary of header_name : header pairs.

Matching on single spaces seems to work as well as matching on `\s+`, but it would be better to first replace every whitespace run (`\s+`) in the text with a single space (" ").

In [22]:
# Lookup of header spelling (as it appears in the text) -> canonical section name.
# Spellings are listed under the section they feed; the comprehension flattens the
# groups into a single header_name : header dictionary.
std_headers = {
    spelling: section
    for section, spellings in (
        ('course', [
            'hospital course:',
        ]),
        ('discharge diagnoses', [
            'discharge diagnoses:',
            'discharge diagnosis:',
            'final diagnoses:',
            'final diagnosis:',
            'postoperative diagnosis:',
        ]),
        ('discharge medications', [
            'discharge medications:',
            '. medications:',            # looks more common than 'current meds'
            'following medications:',    # looks reasonable, 'discharged with the ...'
        ]),
        ('present illness', [
            'present illness:',
        ]),
        ('disposition', [
            '. disposition:',            # discharge disposition and instructions
            'discharge instructions:',
            '. followup:',
            '. plan:',
            'discharge disposition:',    # discharge disposition and instructions
            'discharge plan:',
        ]),
        ('physical examination', [
            'physical examination:',     # often a list, different from h&p
        ]),
        ('vital signs', [
            'vital signs:',
        ]),
        ('abdomen', [
            '. abdomen:',
        ]),
        ('extremities', [
            '. extremities:',
        ]),
        ('laboratory data', [
            'laboratory data:',
            'laboratory studies:',
        ]),
        ('procedures', [
            '. procedures:',
            '. procedure:',
            'procedures performed:',
        ]),
        # NOTE(review): the singular spelling feeds its own section rather than
        # 'procedures' — confirm whether that is intended.
        ('procedures performed', [
            'procedure performed:',
        ]),
        ('lungs', [
            '. lungs:',
        ]),
        ('discharge diet', [
            '. diet:',                   # recommended at discharge
            'discharge diet:',
        ]),
        ('heart', [
            '. heart:',
            '. cardiovascular:',
            '. cardiac:',
        ]),
        ('neck', [
            '. neck:',
        ]),
        ('history', [
            'medical history:',
            'for admission:',            # reason for admission, looks reasonable
            '. history:',
            'observation details:',      # haven't checked, looks like physical exam
            'brief history:',
            'and physical:',
            'chief complaint:',          # sometimes 'history of chief complaint', sometimes like admission diagnosis
        ]),
        ('consultations', [
            '. consultations:',          # useful? quality of care indicator?
            '. consultants:',            # useful? quality of care indicator?
        ]),
        ('admission diagnoses', [
            'admission diagnosis:',
            'admitting diagnosis:',
            'admission diagnoses:',
            'admitting diagnoses:',
            'principal diagnosis:',      # haven't checked if this is reasonable
            'principal diagnoses:',      # haven't checked if this is reasonable
            'preoperative diagnosis:',
            # plus some one-word entries that begin the text after fluff was removed
            '  diagnoses:',
            '  diagnosis:',
        ]),
        ('activity', [
            '. activity:',               # recommended activity level, could be 'disposition'
            '. activities',              # recommended at discharge, could be 'disposition'
        ]),
        ('heent', [
            '. heent:',                  # head, ears, eyes, nose, and throat
        ]),
        ('discharge condition', [
            'discharge condition:',      # usually 'improved' or 'stable'
            '. condition',
        ]),
        ('allergies', [
            '. allergies:',
        ]),
        ('chest', [
            '. chest:',
        ]),
        ('complications', [
            '. complications:',
        ]),
        ('neurologic', [
            '. neurologic',
        ]),
        ('social history', [
            'social history:',
        ]),
        ('core measures', [
            'core measures:',            # seems to be risk of heart attack at discharge, different from 'heart' and 'cardiac'
        ]),
        ('general', [
            '. general:',                # part of physical exam
        ]),
        ('prognosis', [
            '. prognosis:',              # often without colon, as in "poor prognosis"
        ]),
    )
    for spelling in spellings
}

In [24]:
# Collect the locations of header names in each row of text (-1 for not present).
# One column per header name, built in a single pass.
hdr_indices = pd.DataFrame(
    {hdrname: text.str.find(hdrname) for hdrname in std_headers}
)

In [25]:
def makeSections(t, h, headers=None):
    """Produce a dictionary of header : text pairs for one piece of text.

    Input:

      t: text
      h: lookup indexed by header names. h[n] = index in t of the first
         occurrence of n; a negative value (or NaN) means n is not present.
      headers: optional dictionary of header_name : header pairs. When omitted,
         the global std_headers is used.

    Output:

      a dictionary of header : text pairs. Text that precedes the first header
      is kept under the empty-string key. When several header names map to the
      same header, their texts are joined with '. '. If no header name is
      present, the whole text is returned under the empty-string key.
    """
    s = std_headers if headers is None else headers

    # key used for any text before the first header
    emptyhdr = ''

    # (index, name) for every header name present in the text; indices are
    # coerced to int because they may arrive as floats (e.g. from a float-typed
    # pandas column), and floats cannot be used to slice a string
    found = []
    for name in s:
        idx = h[name]
        if idx >= 0:
            found.append((int(idx), name))

    # if there are no headers in the text, return the whole text
    if not found:
        return {emptyhdr: t}

    found.sort()

    # Each section starts right after its header name and stops where the next
    # header begins; the leading pseudo-header covers the text before the first
    # real header, and the final section runs to the end of the text.
    starts = [(0, emptyhdr)] + found
    stops = [idx for (idx, _) in found] + [None]

    d = {}
    for (start, name), stop in zip(starts, stops):
        hdr = s.get(name, emptyhdr)
        piece = t[start + len(name):stop]
        if hdr in d:
            d[hdr] += '. ' + piece
        else:
            d[hdr] = piece

    return d

In [26]:
# Split every note into its sections. Rows are visited by position over the
# full length of `text`: Series.count() skips missing values, so using it as
# the loop bound would silently drop trailing rows whenever a note is missing.
textSectionsList = [
    makeSections(text.iloc[pos], hdr_indices.iloc[pos])
    for pos in range(len(text))
]

# Reuse the notes' index so the columns added below line up row for row.
textSections = pd.DataFrame(textSectionsList, index=text.index)

# Re-link text with ChartGUID:
textSections[VISIT_ID] = dataframe[VISIT_ID]
textSections[OUTCOME]  = dataframe[OUTCOME]

In [28]:
# Showing data frame structure only: an empty row slice keeps the column
# headers and shows no content (content omitted to preserve anonymity).
textSections.iloc[:0]

Unnamed: 0,Unnamed: 1,abdomen,activity,admission diagnoses,allergies,chest,complications,consultations,core measures,course,...,neck,neurologic,physical examination,present illness,procedures,procedures performed,prognosis,social history,vital signs,ChartGUID


In [None]:
# Write the sectioned notes to "<WRITE_DATA_TO>.csv", leaving out the row index.
output_path = WRITE_DATA_TO + ".csv"
textSections.to_csv(output_path, index=False)