NLP_CRT / Git / [8d2107] /extract

Models:
philipB/
NLP_CRT
Downloads: 1
[8d2107]: / extract_data.py
History
Download this file
242 lines (222 with data), 11.6 kB

from language_processing import parse_m_d_y, parse_m_y, parse_date, format_date, split_sentences
import re
import datetime
from datetime import timedelta

def get_operation_date(patient_data):
    if patient_data['Procedure']:
        status = patient_data['Procedure']['Implant Status']
        match = parse_m_d_y(status)
        '''
        if not match:
           match = parse_m_y(status)
           oneMo = parse_m_d_y(patient_data['Procedure']['1 Mo. Appt'])
           threeMo = parse_m_d_y(patient_data['Procedure']['3 Mo. Appt'])
        
        #checks if match makes sentence given the 1 month and 3 month checkups
        if match:
            if oneMo and threeMo:    
                lower_range = max(threeMo - timedelta(days = 30*24), oneMo - timedelta(days = 30*6))
                upper_range = min(threeMo - timedelta(days = 30*2), oneMo) 
            elif not oneMo and threeMo:
                lower_range = threeMo - timedelta(days = 30*24)
                upper_range = threeMo - timedelta(days = 30*2) 
            elif oneMo and not threeMo:
                lower_range = oneMo - timedelta(days = 30*6)
                upper_range = oneMo
            else:
                lower_range = datetime.date(datetime.MINYEAR, 1, 1)
                upper_range = datetime.date(datetime.MAXYEAR, 1, 1)
            if lower_range > match or upper_range < match:
                match = None

        #if no good date found, subtrack from one month and three month checkups
        if not match:
            if oneMo:
                match = oneMo - timedelta(days = 30)
            elif threeMo:
                match = threeMo - timedelta(days = 90)
            else:
                return None
        
        '''
        return match
    else:
        return None


def get_doc_rel_dates(patient_data, dates = None, count_elements = True):
    if dates == None:
        dates = dict()
    procedure_date = get_operation_date(patient_data)
    if procedure_date == None:
        return dates 
    else: 
        for doc_type in patient_data:
            date_key = get_date_key(doc_type)
            if date_key != None:
                docs = patient_data[doc_type]
                if type(docs) != type(list()):
                    docs = [docs]
                for doc in docs:
                    if doc != None:
                        date = parse_date(doc[date_key])
                        if date != None:
                            multiplier = 1
                            if count_elements:
                                if is_note_doc(doc_type):
                                    multiplier = len(split_sentences(doc['free_text']))
                                else:
                                    multiplier = len(doc.keys())
                            if doc_type in dates:
                                dates[doc_type] += [date - procedure_date]*multiplier
                            else:
                                dates[doc_type] = [date - procedure_date]*multiplier
    return dates


'''
description
    returns a list of (date, EF_value) tuples generated from the notes of a patient from get_data([i])
inputs
    patient_data: dictionary of docs as returned by get_data([i])[0]
    car_only: boolean flag that only looks at Car notes
output
    list of (date, EF_value) tuples that can be post-processed into counts, a plot of estimated EF, or estimate for EF prefore and after delta for procedure
    the date is 0-centered around procedure date
''' 
def get_ef_values(patient_data, car_only = True): 
    keywords = ['ef[{of}{0, 1}: \t]*([0-9]*\.{0,1}[0-9]*)[ ]*%', 'ejection fraction[{of}{0, 1}: \t]*([0-9]*\.{0,1}[0-9]*)[ ]*%']
    keywords = ['(?:ef|ejection fraction)\s*(?:of|is)?[:\s]*([0-9]*\.?[0-9]*)\s*%']
    results = []
    procedure_date = get_operation_date(patient_data)
    if procedure_date == None: #throw out patient if no procedure date
        return results
    else: 
        for doc_type in patient_data: #loop over each doc type, eg Enc, Lno
            if is_note_doc(doc_type) and (not car_only or doc_type == 'Car'): #only look at note docs, eg Car, Lno
                date_key = get_date_key(doc_type)
                if date_key != None: #only look at notes with a date key provided (should be all of them)
                    docs = patient_data[doc_type]
                    if type(docs) != type(list()): #just in case the value is not a list, make it one so we can iterate over it
                        docs = [docs]
                    for doc in docs: #for each document of that type for a given patient
                        if doc != None: #assuming the list is not empty
                            date = parse_date(doc[date_key]) #this stores the date of the note
                            if date != None: #if there is a date value
                                note = doc['free_text'].lower() #get the note raw_text
                                delta_days = (date - procedure_date).days
                                ##### MODIFY THIS PART ###### -- Has been modified by mtraub
                                for key in keywords: #for each keyword, search over the document and get matched MODIFY THIS
                                    pattern = re.compile(key)
                                    results += [ (delta_days, float(x)) for x in re.findall(pattern, note) if len(x) > 0 and x != "."]
    return results

'''
ADDED BY JOSH TO VALIDATE
'''
def get_ef_value_notes(patient_data, car_only = True):

    keywords = ['ef[{of}{0, 1}: \t]*([0-9]*\.{0,1}[0-9]*)[ ]*%', 'ejection fraction[{of}{0, 1}: \t]*([0-9]*\.{0,1}[0-9]*)[ ]*%']
    keywords = ['(?:ef|ejection fraction)\s*(?:of|is)?[:\s]*([0-9]*\.?[0-9]*)\s*%']
    results = []
    procedure_date = get_operation_date(patient_data)
    if procedure_date == None: #throw out patient if no procedure date
        return results
    else: 
        for doc_type in patient_data: #loop over each doc type, eg Enc, Lno
            if is_note_doc(doc_type) and (not car_only or doc_type == 'Car'): #only look at note docs, eg Car, Lno
                date_key = get_date_key(doc_type)
                if date_key != None: #only look at notes with a date key provided (should be all of them)
                    docs = patient_data[doc_type]
                    if type(docs) != type(list()): #just in case the value is not a list, make it one so we can iterate over it
                        docs = [docs]
                    for doc in docs: #for each document of that type for a given patient
                        if doc != None: #assuming the list is not empty
                            date = parse_date(doc[date_key]) #this stores the date of the note
                            if date != None: #if there is a date value
                                note = doc['free_text'].lower() #get the note raw_text
                                delta_days = (date - procedure_date).days
                                ##### MODIFY THIS PART ###### -- Has been modified by mtraub
                                for key in keywords: #for each keyword, search over the document and get matched MODIFY THIS
                                    pattern = re.compile(key)
                                    results += [ (delta_days, float(x), note) for x in re.findall(pattern, note) if len(x) > 0 and x != "."]
    return results



def get_doc_keywords(patient_data, keywords, counts = None, by_doc_type = False):
    if counts == None:
        counts = dict()
    for doc_type in patient_data:
        if is_note_doc(doc_type):
            docs = patient_data[doc_type]
            for doc in docs:        
                note = doc['free_text'].lower()
                for key in keywords:
                    pattern = re.compile(key)
                    if by_doc_type:
                        if not doc_type in counts:
                            counts[doc_type] = dict()
                        if key in counts[doc_type]:
                            counts[doc_type][key] += [len(re.findall(pattern, note))]
                        else:
                            counts[doc_type][key] = [len(re.findall(pattern, note))]
                    else:
                        if key in counts:
                            counts[key] += [len(re.findall(pattern, note))]
                        else:
                            counts[key] = [len(re.findall(pattern, note))]
                    
    return counts

def get_doc_rel_dates(patient_data, dates = None, count_elements = True):
    if dates == None:
        dates = dict()
    procedure_date = get_operation_date(patient_data)
    if procedure_date == None:
        return dates 
    else: 
        for doc_type in patient_data:
            date_key = get_date_key(doc_type)
            if date_key != None:
                docs = patient_data[doc_type]
                if type(docs) != type(list()):
                    docs = [docs]
                for doc in docs:
                    if doc != None:
                        date = parse_date(doc[date_key])
                        if date != None:
                            multiplier = 1
                            if count_elements:
                                if is_note_doc(doc_type):
                                    multiplier = len(split_sentences(doc['free_text']))
                                else:
                                    multiplier = len(doc.keys())
                            if doc_type in dates:
                                dates[doc_type] += [date - procedure_date]*multiplier
                            else:
                                dates[doc_type] = [date - procedure_date]*multiplier
    return dates

def is_note_doc(doc_type):
    return doc_type.upper() in ["LNO", "CAR", "RAD", "PAT", "OPN", "DIS", "MIC", "PUL"]

def get_date_key(doc_type):
    keys = {u'Enc': u'Discharge_Date', u'Pat': u'date', u'Mic': u'date', u'Pul': u'date', u'Med': u'Medication_Date', u'Lab': u'Seq_Date_Time', u'Phy': u'Date', u'Opn': u'date', u'Lme': u'LMR_Medication_Date_Time', u'Rdt': u'Date', u'Lvs': u'LMR_Vital_Date_Time', u'Trn': u'Transaction_Date_Time', u'Car': u'Report_Date_Time', u'Lhm': u'LMR_Health_Maintenance_Date_Time', u'Dia': u'Date', u'Lpr': u'LMR_Problem_Date', u'Dis': u'date', u'Rad': u'Report_Date_Time', u'Prc': u'Date', u'Lno': u'LMRNote_Date'}
    if doc_type in keys:
        return keys[doc_type]
    else:
        return None

'''
description
    parses the notes header
input
    header_string: the first line of the notes document
output
    dictionary of values with keys ['date', 'doctor', 'hospital']
'''
def parse_note_header(head_string, doc_type):
    if not is_note_doc(doc_type):
        return dict()
    result = {'Date' : None, 'Doctor' : None, 'Hospital' : None, 'Procedure' : None}
    head_split = head_string.split("|")
    #print head_split
    result['Hospital'] = head_split[1]
    doc_type = doc_type.upper()
    if doc_type == "LNO":    
        result['Date'] = head_split[3].split()[0]
        result['Doctor'] = head_split[6]
        result['Procedure'] = head_split[10]
    elif doc_type in [ "DIS" , "CAR",  "RAD" , "PAT" , "OPN" ]:
        result['Date'] = head_split[5].split()[0]
        result['Procedure'] = head_split[6]
    elif doc_type in [  "MIC" , "PUL" ]:
        result['Date'] = head_split[4].split()[0]
        result['Procedure'] = head_split[5]
    
    if result['Date'] != None:
        result['Date'] = format_date(result['Date'])    
    return result