a b/structured_data_extractor.py
1
import sys
2
import loader
3
import extract_data
4
import numpy as np
5
import build_graphs
6
7
def get_supplemental_details(field_name, num_patients=907):
    """Print how many patients carry a non-None value for a top-level field.

    For every index i in range(num_patients) the patient "FAKE_EMPI_<i>" is
    loaded.  If the patient has the field, its length (or 0 when the value is
    None) is printed next to the index.  A final "RESULTS: <count>/<total>"
    line reports how many patients had a non-None value out of how many were
    processed without error.

    Note that the field must be a top-level field (i.e. 'Car', 'Lno', etc.).
    This was used to test how many patients had the 'Supplemental' field.

    Args:
        field_name: name of the top-level patient field to look for.
        num_patients: number of consecutive FAKE_EMPI indices to scan,
            starting at 0.  Defaults to 907.

    Returns:
        None.  All results are printed.
    """
    total = 0
    field_count = 0
    for i in range(num_patients):
        try:
            p = loader.get_patient_by_EMPI("FAKE_EMPI_" + str(i))
            if field_name in p.keys():
                if p[field_name] is not None:
                    print(str(i) + ": " + str(len(p[field_name])))
                    field_count += 1
                else:
                    print(str(i) + ": " + str(0))
            total += 1
        except Exception:
            # Deliberately best-effort: any failure while loading or
            # inspecting a patient is reported and the scan moves on.
            print(str(i) + " DOES NOT EXIST")
    print("RESULTS: " + str(field_count) + "/" + str(total))
27
28
def get_diagnoses(empi):
    """Return the diagnosis timeline for the patient identified by empi.

    The timeline is a list of tuples of the form
    (diagnosis date, Code_Type, code, diagnosis name), ordered with
    list.sort(): by date first, then by the remaining tuple fields.
    A single date may, and often does, hold several diagnoses, and the same
    diagnosis can be reported again on every visit.

    An empty list is returned when the patient has no 'Dia' field.
    """
    patient = loader.get_patient_by_EMPI(empi)
    if 'Dia' not in patient.keys():
        return []
    timeline = [
        (
            extract_data.parse_date(entry['Date']),
            entry['Code_Type'],
            entry['Code'],
            entry['Diagnosis_Name'],
        )
        for entry in patient['Dia']
    ]
    timeline.sort()
    return timeline
40
41
def get_date_to_diagnoses(empi):
    """Group a patient's diagnoses by the date they were recorded.

    Returns a dict mapping each diagnosis date to the list of
    (Code_Type, code, diagnosis name) tuples recorded on that date, in the
    order they appear in the timeline returned by get_diagnoses.
    """
    grouped = {}
    for entry in get_diagnoses(empi):
        # entry[0] is the date; the rest of the tuple describes the diagnosis.
        grouped.setdefault(entry[0], []).append(entry[1:])
    return grouped
53
54
def get_diagnosis_to_dates(empi):
    """Group a patient's diagnosis dates by diagnosis.

    Returns a dict mapping each diagnosis to the list of dates on which it
    was recorded for the patient.  A diagnosis key is the timeline tuple
    without its date, i.e. (Code_Type, code, diagnosis name).  Dates are
    appended in timeline order.
    """
    dates_by_diagnosis = {}
    for entry in get_diagnoses(empi):
        recorded_on, key = entry[0], entry[1:]
        dates_by_diagnosis.setdefault(key, []).append(recorded_on)
    return dates_by_diagnosis
67
68
def get_chronic_diagnoses(empi, threshold_days):
    """Return the diagnoses whose recordings span more than threshold_days.

    For each diagnosis of the patient, the first and last entries of its
    date list (as produced by get_diagnosis_to_dates) are compared; the
    diagnosis is kept when the gap between them is strictly greater than
    threshold_days, measured in seconds.
    """
    dates_by_diagnosis = get_diagnosis_to_dates(empi)
    min_gap_seconds = threshold_days * 24 * 60 * 60
    return [
        diagnosis
        for diagnosis, dates in dates_by_diagnosis.items()
        if (dates[-1] - dates[0]).total_seconds() > min_gap_seconds
    ]
81
82
def get_encounters_details(empi):
    """Print a per-encounter summary of the patient's 'Enc' field.

    Used in testing the Enc field to understand what subfields exist and
    what values they take.  For each encounter the non-empty fields of
    interest and the number of non-empty Diagnosis_1..Diagnosis_9 subfields
    are printed, followed by the overall inpatient / outpatient counts.
    """
    patient = loader.get_patient_by_EMPI(empi)
    fields_of_interest = ('Admit_Date', 'Inpatient_Outpatient', 'Discharge_Date', 'LOS_Days', 'DRG')
    for enc in patient['Enc']:
        print('ENCOUNTER ' + enc['Encounter_number'] + ':')
        for name in fields_of_interest:
            value = enc[name]
            if value:
                print(name + ' = ' + str(value))
        extra_count = sum(1 for n in range(1, 10) if enc['Diagnosis_' + str(n)])
        print('Extra Diagnoses = ' + str(extra_count))
        print('')
    # Anything that is not exactly 'Inpatient' is counted as an outpatient.
    settings = [enc['Inpatient_Outpatient'] for enc in patient['Enc']]
    inpatient_total = sum(1 for setting in settings if setting == 'Inpatient')
    outpatient_total = len(settings) - inpatient_total
    print(str(inpatient_total) + ' Inpatients')
    print(str(outpatient_total) + ' Outpatients')
106
107
def get_encounters(empi):
    """Return the patient's encounters sorted by Admit Date.

    Each encounter is a tuple of the form
    (admit date, inpatient/outpatient string, discharge date, LOS days,
    number of non-empty Diagnosis_1..Diagnosis_9 subfields).  Encounters
    without an Admit_Date are skipped, and an empty LOS_Days is reported
    as 0.  Sorting uses the Admit Date only (since Discharge Date is not
    always recorded).

    An empty list is returned when the patient has no 'Enc' field.
    """
    patient = loader.get_patient_by_EMPI(empi)
    if 'Enc' not in patient.keys():
        return []
    rows = []
    for enc in patient['Enc']:
        extra_count = sum(1 for n in range(1, 10) if enc['Diagnosis_' + str(n)])
        if not enc['Admit_Date']:
            continue
        admitted = extract_data.parse_date(enc['Admit_Date'])
        setting = str(enc['Inpatient_Outpatient'])
        discharged = extract_data.parse_date(enc['Discharge_Date'])
        los_days = int(enc['LOS_Days']) if enc['LOS_Days'] else 0
        rows.append((admitted, setting, discharged, los_days, extra_count))
    rows.sort(key=lambda row: row[0])  # order on Admit_Date only
    return rows
122
123
def get_labs_before_date(empi, date):
    """Summarize a patient's labs recorded strictly before a date.

    Args:
        empi: identifier of the patient to load.
        date: cutoff; only labs whose parsed Seq_Date_Time is earlier than
            this value are considered.  Labs with an empty Seq_Date_Time
            are ignored.

    Returns:
        A tuple (lab_counts, lab_lows, lab_highs, lab_latest) of four
        dictionaries keyed by lab Group_Id:
          * lab_counts: total number of qualifying labs,
          * lab_lows: number of labs whose Abnormal_Flag is 'L',
          * lab_highs: number of labs whose Abnormal_Flag is 'H',
          * lab_latest: (date, Abnormal_Flag) of the most recent
            qualifying lab.
        lab_lows and lab_highs only contain groups with at least one such
        flag.  All four are empty when the patient has no 'Lab' field.
    """
    p = loader.get_patient_by_EMPI(empi)
    lab_counts = {}
    lab_lows = {}
    lab_highs = {}
    lab_latest = {}
    if 'Lab' not in p.keys():
        return lab_counts, lab_lows, lab_highs, lab_latest
    for lab in p['Lab']:
        if not lab['Seq_Date_Time']:
            continue
        # Parse the timestamp once and reuse it for filtering and recency.
        lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
        if not lab_date < date:
            continue
        group_id = lab['Group_Id']
        flag = lab['Abnormal_Flag']
        lab_counts[group_id] = lab_counts.get(group_id, 0) + 1
        # Keep the most recent test; on equal dates the first one seen wins.
        if group_id not in lab_latest or lab_date > lab_latest[group_id][0]:
            lab_latest[group_id] = (lab_date, flag)
        if flag == 'L':
            lab_lows[group_id] = lab_lows.get(group_id, 0) + 1
        elif flag == 'H':
            lab_highs[group_id] = lab_highs.get(group_id, 0) + 1
    return lab_counts, lab_lows, lab_highs, lab_latest
159
160
def get_recent_lab_values(empi, date):
    """Return the most recent result of each lab recorded before a date.

    Args:
        empi: identifier of the patient to load.
        date: cutoff; only labs whose parsed Seq_Date_Time is earlier than
            this value are considered.  Labs with an empty Seq_Date_Time
            are ignored.

    Returns:
        A dictionary mapping each lab Group_Id to a (date, Result) tuple
        for the latest qualifying lab of that group.  The dictionary is
        empty when the patient has no 'Lab' field.
    """
    p = loader.get_patient_by_EMPI(empi)
    lab_latest = {}
    if 'Lab' not in p.keys():
        return lab_latest
    for lab in p['Lab']:
        if not lab['Seq_Date_Time']:
            continue
        # Parse the timestamp once and reuse it for filtering and recency.
        lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
        if not lab_date < date:
            continue
        group_id = lab['Group_Id']
        # Keep the most recent test; on equal dates the first one seen wins.
        if group_id not in lab_latest or lab_date > lab_latest[group_id][0]:
            lab_latest[group_id] = (lab_date, lab['Result'])
    return lab_latest
174
175
176
177
def get_lab_history_before_date(empi, date, time_thresholds_months):
    """Summarize, per time period, whether each lab was mostly high or low.

    Every lab recorded strictly before `date` is placed in the first period
    whose threshold (in months before `date`) is not exceeded by the lab's
    age; labs older than every threshold are dropped.  The period search
    walks the thresholds in order, so they are expected to be ascending.

    Args:
        empi: identifier of the patient to load.
        date: reference date; only earlier labs are considered.
        time_thresholds_months: sequence of period boundaries, in months
            before `date`.

    Returns:
        A dictionary mapping each lab Group_Id to a list of
        len(time_thresholds_months) entries.  Entry i is 'H' or 'L' when
        that flag was the most frequent in period i, and None when normal
        (unflagged) results were the most frequent or when no lab was
        recorded in the period.  Ties are resolved in the order 'H', 'L',
        None.  For example, 'BUN' => ['H', None, 'L'] indicates a
        transition from low (L) to high (H) leading up to the date.
        Only groups with at least one lab inside the thresholds appear.
    """
    p = loader.get_patient_by_EMPI(empi)
    # A month is taken as one twelfth of a 365-day year.
    seconds_in_month = 365 * 24 * 60 * 60 / 12
    values = ['H', 'L', None]
    num_periods = len(time_thresholds_months)
    # Group_Id -> 2-D count array: rows are time periods, columns follow
    # `values` (counts of 'H', 'L' and unflagged results respectively).
    lab_history_counts = {}
    if 'Lab' in p.keys():
        for lab in p['Lab']:
            if not lab['Seq_Date_Time']:
                continue
            # Parse the timestamp once and reuse it below.
            lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
            if not lab_date < date:
                continue
            age_seconds = (date - lab_date).total_seconds()
            time_index = 0
            while (time_index < num_periods
                   and age_seconds > time_thresholds_months[time_index] * seconds_in_month):
                time_index += 1
            if time_index >= num_periods:
                continue  # older than the largest threshold
            flag = lab['Abnormal_Flag']
            value_index = values.index(flag if flag in ('H', 'L') else None)
            group_id = lab['Group_Id']
            if group_id not in lab_history_counts:
                lab_history_counts[group_id] = np.zeros([num_periods, len(values)])
            lab_history_counts[group_id][time_index][value_index] += 1
    lab_history = {}
    for lab_name, counts in lab_history_counts.items():
        # argmax of an all-zero row is 0, which would wrongly report 'H'
        # for a period without any labs; such periods are reported as None.
        lab_history[lab_name] = [
            values[row.argmax()] if row.any() else None
            for row in counts
        ]
    return lab_history
213
214
if __name__ == "__main__":
    # Command-line entry point:
    #   <script> diagnosis <EMPI>   print the diagnosis timeline and a summary
    #   <script> encounter <EMPI>   print the encounters sorted by admit date
    #   <script> labs <EMPI>        print the latest lab values before the
    #                               patient's operation date
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <diagnosis|encounter|labs> <EMPI>")
    command = sys.argv[1]
    empi = sys.argv[2]
    if command == 'diagnosis':
        diagnoses = get_diagnoses(empi)
        for d in diagnoses:
            print(d)
        date_to_diagnoses = get_date_to_diagnoses(empi)
        print("~~~~~~~~~~~~~~~~")
        # A patient without diagnoses has no start/end date to report.
        if diagnoses:
            print("Start Date: " + str(diagnoses[0][0]))
            print("End Date: " + str(diagnoses[-1][0]))
        print("Num. of Entries: " + str(len(diagnoses)))
        print("Num. of Visits: " + str(len(date_to_diagnoses)))
    elif command == 'encounter':
        encounters = get_encounters(empi)
        for enc in encounters:
            print(enc)
    elif command == 'labs':
        operation_date = build_graphs.get_operation_date(loader.get_patient_by_EMPI(empi))
        lab_values = get_recent_lab_values(empi, operation_date)
        for lab in lab_values:
            print(str(lab) + ": " + str(lab_values[lab]))
    else:
        sys.exit("unknown command: " + command)