In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Overview

## Clean Text
Our goal is to 'clean up' the discharge text so that it is easier for our model to find meaningful signal. To this end, we remove "fluff" from the text. I loosely define "fluff" as:
1. text that appears in the same context across most of the notes (e.g. "sarasota memorial hospital")
2. text that our model has no good way to handle (e.g. "50 mg" - our model doesn't understand numbers)

In the first case, we simply remove the string altogether.
In the second case, we replace with a string - e.g. "50 mg" becomes "quantity".
 
Approach:
1. Find dates in the text, replace them with the string 'DATE'
2. Find and remove fluff expressions with 'DATE' in them, e.g. 'Date of admission DATE', 'd: DATE t: DATE'
3. Find quantities in the text, replace them with the string 'QUANTITY'
4. Find frequency instructions ('per day') in the text, replace them with the string 'FREQUENCY'.
5. Find and remove fluff like 'sarasota memorial hospital' or 'admitting diagnosis'.  
6. Find and remove other expressions (e.g. 'visit id XXXXXX').

Note that this cleaning must happen after the text has been split into sections, because this code would otherwise strip away the section headers.

## Train/test split
Once text prep is done, we do a train/test split, and write train/test data to separate data files.

# Clean text

In [None]:
# Input location: the CSV written out by the notebook `TextSections`.
DATA_PATH = ""
# Output prefix: the write cells below append ".csv" / "_train.csv" / ... to it.
WRITE_DATA_TO = ""

# Column names
OUTCOME = "ReadmissionInLessThan30Days"
VISIT_ID = "ChartGUID"

# Names of the per-section text columns that get cleaned.
SEC_NAMES = [
    'abdomen',
    'activity',
    'admission diagnoses',
    'allergies',
    'chest',
    'complications',
    'consultations',
    'core measures',
    'course',
    'discharge condition',
    'discharge diagnoses',
    'discharge diet',
    'discharge medications',
    'disposition',
    'extremities',
    'general',
    'heart',
    'heent',
    'history',
    'laboratory data',
    'lungs',
    'neck',
    'neurologic',
    'physical examination',
    'present illness',
    'procedures',
    'procedures performed',
    'prognosis',
    'social history',
    'vital signs',
]

dataframe = pd.read_csv(DATA_PATH)

In [None]:
##################################################################################
# Dates
##################################################################################
# Regex fragments (plain strings) that are combined into DATE below.
# Raw strings are used wherever a fragment contains a backslash so that regex
# escapes such as \s and \d are not parsed as (invalid) Python string escapes.
# The resulting pattern text is unchanged.
MONTH   = "[jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|"
MONTH  += "[jJ]uly|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember"
# Full month names followed by the accepted abbreviations.
MONTH   = "(" + MONTH + "|[jJ]an|[fF]eb|[mM]ar|[aA]pr|[jJ]un|[jJ]ul|[aA]ug|[sS]ept|[oO]ct|[nN]ov|[dD]ec)"
DATENOY = "(" + MONTH + r"\s)(\d+)"                            # june 5
DATEWDS = "((" + MONTH + r"\s)(\d+)(\s|\,)\s(\d+))"            # june 5, 1995
DATEWDS_EU = r"((\d+\s*)" + MONTH + r"(\s|\,)\s(\d+))"         # 5 june, 1995
DATENMS = r"((\d{1,2}(\/|-))+\d{1,2}(\/\s?|-)\d{2,4})"         # 06/05/1995, 06-05-1995
# Forms that include a year are tried before the year-less "june 5" form.
DATE    = "(" + DATEWDS + "|" + DATEWDS_EU + "|" + DATENMS + "|" + DATENOY + ")"



##################################################################################
# Numbers
##################################################################################
# Digits: "2.5" / "1/2" style values first, then plain integers.
# Raw strings keep the regex escapes from being parsed as Python string escapes.
NUMBER = r"((\d+)(\.|\/)(\d+))|(\d+)"

# Spelled-out numbers; each word becomes its own "(word)" alternative.
_NUMBER_WORDS = [
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety", "hundred",
]
NUMBER = NUMBER + "".join("|(" + word + ")" for word in _NUMBER_WORDS)

# A number only counts when preceded by whitespace or "(" and followed by
# whitespace, ".", "," or "%"; those surrounding characters are part of the match.
NUMBER = r"((\s)|(\())(" + NUMBER + r")((\s)|(\.)|(\,)|(%))"

##################################################################################
# TCU/CRU - what to do?
##################################################################################
TCU = '((transitional care unit)|(tcu))'
CRU = '((cru)|(comprehensive rehab unit)|(comprehensive rehabilitation unit))'

# Word boundaries keep the short abbreviations from matching inside ordinary
# words (without them "crutches" would lose its "cru" and become "tches").
CRU_OR_TCU = re.compile(r"\b(" + TCU + "|" + CRU + r")\b")

################################################################
# Admitted/Discharged
################################################################
# Header text such as "date of admission:" or "date of discharge", matched both
# with a trailing DATE token (*_PATTERN) and without one (*_PATTERN_EMPTYDATE).
# Raw strings keep \s from being parsed as a Python string escape, and the
# source text lives in its own name so each public name is only ever bound to
# a compiled pattern.
_ADM_HEADER           = r'((((date of admission)|(admission date))(:?))|(admitted:))(\s?)'
ADM_PATTERN           = re.compile(_ADM_HEADER + "DATE")
ADM_PATTERN_EMPTYDATE = re.compile(_ADM_HEADER)

# NOTE(review): a bare "discharged" (colon optional) is matched here, whereas the
# admission side requires "admitted:" with the colon - confirm this is intended.
_DCH_HEADER           = r'((date of discharge)|(discharge date)|(discharged))(:?)(\s?)'
DCH_PATTERN           = re.compile(_DCH_HEADER + "DATE")
DCH_PATTERN_EMPTYDATE = re.compile(_DCH_HEADER)

################################################################
# SMH/Discharge summary
################################################################
# Boilerplate strings. The longest hospital variant is listed first so it is
# tried before the shorter "sarasota memorial hospital" alternative.
SMH_FLUFF     = re.compile("(sarasota memorial hospital - sarasota, fl)|(sarasota memorial hospital)|(smh hospitalist program)")
DCH_FLUFF     = re.compile("(discharge summary)")
# Raw strings keep \[, \d and \s from being parsed as Python string escapes.
END_FLUFF     = re.compile(r"\[signature\]")
# "10:30 am" style first, then "10:30" / "10:30:05".
TIME          = re.compile(r"(\d{1,2}:\d{2}\s[aApP][mM])|(\d{1,2}:\d{2}(:\d{2})?)")

################################################################
# Admitting diagnosis
################################################################
# "diagnosis"/"diagnoses", optionally preceded by a qualifier ("admitting",
# "final discharge", ...) and optionally followed by a colon.
# Raw strings keep \s and \d from being parsed as Python string escapes.
_DIAG_WORD    = "((diagnosis)|(diagnoses))"
DIAG          = re.compile(r"((((admit)|(admitting)|(final discharge)|(final)|(discharge)|(primary)|(admission)|(principal)|(principle))\s" + _DIAG_WORD + ")|(" + _DIAG_WORD + "))(:?)")
# e.g. "patient id # 12345", "patient visit identification number: 12345"
PTVISIT       = re.compile(r"((patient visit)|(patient))\s((identification)|(id))\s((number)|#)(:?)((\s#)?)(\s?)(\d+)")
# Identifier-like numbers: "inpatient 1234567 ", "mrn: 1234567",
# "account: 123456789", or a bare 7-digit number surrounded by whitespace.
# The bare form uses a lookahead for the trailing whitespace so that removing
# the match leaves one separator behind instead of gluing the neighbouring
# words together ("seen 1234567 today" -> "seen today", not "seentoday").
GUID          = re.compile(
                    r"((inpatient)((-|:)?)(\s?)(\d{7}\s))|"
                    r"((mrn:\s)(\d{7}))|"
                    r"((account|confirmation|jobid)(:?)(\s?)(\d{6,12}))|"
                    r"\s\d{7}(?=\s)"
                )

################################################################
# Dictated/transcribed
################################################################
# "d: DATE" / "t: DATE" stamps. The leading word boundary stops the pattern from
# matching the tail of an ordinary word that ends in d or t (e.g. "started: DATE").
# Raw strings keep \b and \s from being parsed as Python string escapes.
DICT_TRANS    = re.compile(r"\b(d|t):(\s?)DATE")
DICT_TRANS_BY = re.compile(r"((dictated by)|(transcribed by))(\s?)")

################################################################
# Patient name
################################################################
# "patient name: <word>, <word>" (the two words are presumably last and first
# name - confirm against the notes). Raw string keeps \s and \w from being
# parsed as Python string escapes.
PT_NAME    = re.compile(r"(patient name:)(\s?)(\w+),(\s?)(\w+)")

################################################################
# Case #/history #
################################################################
# "case #", "history number", "case num", ... Word boundaries make "number" and
# "num" match only as whole words, so ordinary text such as "history numbness"
# is left intact instead of being cut down to "bness".
CASE_HISTORY = re.compile(r"\b((case)|(history))(\s?)(#|number\b|num\b)")

##################################################################################
# Med instructions
##################################################################################
# The literal token "NUMBER" (as inserted by encode_numbers) followed by a dose unit.
# Raw strings keep the regex escapes from being parsed as Python string escapes.
AMOUNT    = re.compile(r"(NUMBER)(\s?)((micrograms)|(microgram)|(mgs)|(mg)|(milligrams)|(milligram)|(milliequivalents))")

# Dosing-frequency / route abbreviations.
# http://www.medicinenet.com/script/main/art.asp?articlekey=6954
# Alternatives are tried left to right, so the short "q\sd" form is listed
# AFTER "q d\." and "q day"; when it came first, "q day" matched only "q d"
# and left a stray "ay" behind. "q\.i\.d" also accepts its trailing period,
# like the neighbouring "b\.i\.d\." and "t\.i\.d\." entries.
_FREQUENCY_TERMS = [
    r"by mouth", r"on odd days", r"times a day", r"times per day",
    r"at dinner", r"at breakfast", r"at bedtime",
    r"q\. day", r"q d\.", r"q a\.m\.", r"q p\.m\.", r"q am", r"q pm", r"q day", r"q\sd",
    r"q\.day", r"q\.hs", r"meq", r"q\.d\.", r"b\.i\.d\.", r"t\.i\.d\.", r"q\.i\.d\.?",
    r"q\dh", r"p\.r\.n\.", r"p\.o\.", r"daily",
]
FREQUENCY = re.compile("(" + "|".join("(" + term + ")" for term in _FREQUENCY_TERMS) + ")")

# http://www.medicinenet.com/script/main/art.asp?articlekey=6954
INSTRUCT  = re.compile(r"((as directed)|(\spo\s)|(\sp\.o\.\s))")

##################################################################################
# Text lists (1. XXX, 2. YYY, 3. ZZZ...)
##################################################################################
# Raw strings keep \. and \s from being parsed as Python string escapes.
# From "1." up to the first ". " that is followed by a lowercase letter.
TEXT_LIST = r"(1\..+?[a-z])\. ([a-z])"
LIST_START = r" 1\.\s"
# ". <n>. " between items, n = 1..99. The previous class [1-9]{1,2} could not
# match item numbers that contain a zero (10, 20, ...).
LIST_DELIM = r"\. [1-9][0-9]?\. "
##################################################################################
# Process!
##################################################################################
def encode_dates(x):
    """Return `x` with every match of the DATE pattern replaced by the token "DATE"."""
    return re.sub(DATE, "DATE", x)

def remove_admission_and_discharge(x):
    """Strip admission/discharge header text from `x`.

    For each of admission and discharge, the header-plus-DATE form is removed
    first and the header-only form second.
    """
    header_patterns = (
        ADM_PATTERN,
        ADM_PATTERN_EMPTYDATE,
        DCH_PATTERN,
        DCH_PATTERN_EMPTYDATE,
    )
    for header in header_patterns:
        x = re.sub(header, "", x)
    return x

def remove_quantity(x):
    """Return `x` with every AMOUNT match replaced by " QUANTITY" (leading space included)."""
    return AMOUNT.sub(" QUANTITY", x)

def remove_instruction(x):
    """Return `x` with every INSTRUCT match replaced by " INSTRUCTION " (padded with spaces)."""
    return INSTRUCT.sub(" INSTRUCTION ", x)

def remove_frequency(x):
    """Return `x` with every FREQUENCY match replaced by the token "FREQUENCY"."""
    return FREQUENCY.sub("FREQUENCY", x)

# Compile the assembled NUMBER source once; from here on NUMBER is a pattern object.
NUMBER = re.compile(NUMBER)
def encode_numbers(x):
    """Return `x` with every NUMBER match replaced by " NUMBER " (padded with spaces)."""
    return NUMBER.sub(" NUMBER ", x)

def remove_fluff(x):
    """Remove boilerplate from `x` and normalise diagnosis headers.

    The substitutions run in the order listed. Every pattern is deleted
    outright except DIAG, whose matches become the token "DIAGNOSIS".
    """
    substitutions = (
        (SMH_FLUFF, ""),
        (DCH_FLUFF, ""),
        (END_FLUFF, ""),
        (TIME, ""),
        (DIAG, "DIAGNOSIS"),
        (DICT_TRANS, ""),
        (DICT_TRANS_BY, ""),
        (CASE_HISTORY, ""),
        (PTVISIT, ""),
        (GUID, ""),
        (PT_NAME, ""),
    )
    for pattern, replacement in substitutions:
        x = re.sub(pattern, replacement, x)
    return x

def remove_end_of_note(x):
    """Strip two kinds of trailer text from `x`.

    1. "DATE ____" immediately followed by word characters.
    2. "cc:" and everything after it up to the end of that line ("." does not
       match a newline).

    Raw strings keep \\w from being parsed as a Python string escape.
    """
    x = re.sub(r"(DATE ____)(\w+)", "", x)
    x = re.sub(r"(cc:)(.+)", "", x)
    return x

def encode_lists(x):
    """Mark up numbered lists in `x` with STARTLIST / COMMA / ENDLIST tokens.

    Order matters: the end of the list is tagged first (TEXT_LIST), then the
    leading " 1. " (LIST_START), then the ". <n>. " separators (LIST_DELIM).
    """
    steps = (
        (TEXT_LIST, r"\1 ENDLIST \2"),
        (LIST_START, " STARTLIST "),
        (LIST_DELIM, " COMMA "),
    )
    for pattern, token in steps:
        x = re.sub(pattern, token, x)
    return x

def remove_midsentence_periods(x):
    """Drop periods that do not end a sentence (titles and single-letter initials).

    " dr. ", " mr. ", " ms. ", " mrs. " lose their period, " m.d. " becomes
    " md ", and a single lowercase letter followed by a period (a name
    initial) is removed entirely. Only lowercase, space-delimited forms match.
    """
    # Raw strings keep \. from being parsed as a Python string escape.
    titles = (
        (r" dr\. ", " dr "),
        (r" mr\. ", " mr "),
        (r" ms\. ", " ms "),
        (r" mrs\. ", " mrs "),
        (r" m\.d\. ", " md "),
    )
    for pattern, replacement in titles:
        x = re.sub(pattern, replacement, x)
    # Name initials: the trailing space is only looked at, not consumed, so it
    # can serve as the leading space of the next initial. This lets consecutive
    # initials ("j. r. smith") all be removed; for an isolated initial the
    # result is the same as replacing " j. " with " ".
    x = re.sub(r" [a-z]\.(?= )", "", x)
    return x

def encode_end_of_sentences(x):
    """Replace sentence ends in `x` with " STOP ".

    A sentence end is a period followed by whitespace, or the ENDLIST token
    followed by whitespace. Periods that do not end a sentence should be
    culled before calling this.
    """
    # Raw string keeps \. and \s from being parsed as Python string escapes.
    return re.sub(r"(\.\s)|(ENDLIST\s)", " STOP ", x)

# Whitespace-delimited "a" / "an" / "the" / "at", or a run of two whitespace
# characters. Raw string keeps \s from being parsed as a Python string escape.
STOPWORDS = r"(\s((a)|(an)|(the)|(at))\s)|(\s\s)"

import time
def process_text(x):
    """Run the full cleaning pipeline on one section's text.

    Parameters
    ----------
    x : object
        Section text. Non-string values are converted with ``str``; a missing
        value (``None`` or a float NaN) yields an empty string instead of the
        literal text "None"/"nan".

    Returns
    -------
    str
        The cleaned text. Placeholder tokens (QUANTITY, FREQUENCY, INSTRUCTION,
        NUMBER, DATE, COMMA) are inserted by the intermediate steps and then
        deleted; sentence ends are marked with " STOP ".
    """
    # `x != x` is true only for NaN; without this guard str() would turn a
    # missing cell into the word "nan" and feed it through as real text.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    x = str(x)
    x = encode_dates(x)                    # dates -> DATE (needed by the header/fluff patterns)
    x = remove_admission_and_discharge(x)
    x = remove_fluff(x)
    x = remove_frequency(x)
    x = remove_instruction(x)
    x = encode_lists(x)                    # runs before encode_numbers: list patterns match digits
    x = encode_numbers(x)
    x = remove_quantity(x)                 # runs after encode_numbers: AMOUNT matches the NUMBER token
    x = re.sub(CRU_OR_TCU, "", x)
    x = remove_midsentence_periods(x)
    x = remove_end_of_note(x)
    # Drop the placeholder tokens themselves.
    x = re.sub("(QUANTITY)|(FREQUENCY)|(INSTRUCTION)|(NUMBER)|(DATE)|(COMMA)", "", x)
    x = encode_end_of_sentences(x)
    x = re.sub(STOPWORDS, " ", x)
    return x

In [None]:
# Clean every section column. The loaded frame is `dataframe` and the later
# cells read `textSections`, so the cleaned copy is bound to that name here
# (the previous loop referenced an undefined `text`, failing on a fresh kernel).
# Working on a copy keeps the raw frame intact and makes this cell re-runnable.
textSections = dataframe.copy()
for section in SEC_NAMES:
    textSections[section] = textSections[section].apply(process_text)

In [None]:
# Persist the complete cleaned dataset (train and test rows together).
textSections.to_csv(f"{WRITE_DATA_TO}.csv", index=False)

# Train/test split

In [None]:
# Stratify on the outcome so both splits keep the same readmission rate, and
# fix the seed so Restart & Run All reproduces the same split every time.
SPLIT_SEED = 0
train, test = train_test_split(
    textSections,
    stratify=textSections[OUTCOME],
    random_state=SPLIT_SEED,
)

In [None]:
# Write each split to its own file. The test split previously went to
# "_train.csv" as well, overwriting the training data just written.
train.to_csv(WRITE_DATA_TO + "_train.csv", index=False)
test.to_csv(WRITE_DATA_TO + "_test.csv", index=False)