|
a |
|
b/structured_data_extractor.py |
|
|
1 |
import sys |
|
|
2 |
import loader |
|
|
3 |
import extract_data |
|
|
4 |
import numpy as np |
|
|
5 |
import build_graphs |
|
|
6 |
|
|
|
7 |
def get_supplemental_details(field_name, num_patients=907):
    """Print, for each patient, how many entries a top-level field holds.

    Patients are looked up as "FAKE_EMPI_<i>" for i in range(num_patients).
    For every patient that loads, one line "<i>: <len>" is printed, followed
    by a final "RESULTS: <patients with the field>/<patients loaded>" line.

    Args:
        field_name: name of a top-level patient field ('Car', 'Lno', etc.).
            This was used to test how many patients had the 'Supplemental'
            field.
        num_patients: number of consecutive FAKE_EMPI indices to probe.
    """
    total = 0
    field_count = 0
    for i in range(num_patients):
        try:
            p = loader.get_patient_by_EMPI("FAKE_EMPI_" + str(i))
            value = p[field_name] if field_name in p.keys() else None
            # NOTE(review): the original nesting was ambiguous; a zero is
            # printed both when the field is absent and when it is None.
            if value is not None:
                print(str(i) + ": " + str(len(value)))
                field_count += 1
            else:
                print(str(i) + ": " + str(0))
            total += 1
        except Exception:
            # Any failure while loading or inspecting the record is reported
            # as a missing patient and does not count towards the total.
            print(str(i) + " DOES NOT EXIST")
    print("RESULTS: " + str(field_count) + "/" + str(total))
|
|
27 |
|
|
|
28 |
def get_diagnoses(empi):
    """Return the diagnosis timeline for the patient identified by empi.

    The timeline is a list of tuples of the form
    (diagnosis date, Code_Type, code, diagnosis name) built from the
    patient's 'Dia' records and sorted in ascending tuple order, i.e. by date
    first. A given date may, and often does, carry several diagnoses, and the
    same diagnosis can be reported again on every visit. The list is empty
    when the patient has no 'Dia' field.
    """
    patient = loader.get_patient_by_EMPI(empi)
    if 'Dia' not in patient.keys():
        return []
    timeline = [
        (
            extract_data.parse_date(record['Date']),
            record['Code_Type'],
            record['Code'],
            record['Diagnosis_Name'],
        )
        for record in patient['Dia']
    ]
    return sorted(timeline)
|
|
40 |
|
|
|
41 |
def get_date_to_diagnoses(empi):
    """Group the patient's diagnosis timeline by date.

    Returns a dict mapping each diagnosis date to the list of
    (Code_Type, code, diagnosis name) tuples recorded on that date, in
    timeline order.
    """
    by_date = {}
    for entry in get_diagnoses(empi):
        # entry[0] is the date; the remaining fields describe the diagnosis.
        by_date.setdefault(entry[0], []).append(entry[1:])
    return by_date
|
|
53 |
|
|
|
54 |
def get_diagnosis_to_dates(empi):
    """Group the patient's diagnosis timeline by diagnosis.

    Returns a dict mapping each diagnosis to the list of dates on which it
    was recorded, in timeline order. A diagnosis key is the tuple
    (Code_Type, code, diagnosis name), i.e. a timeline entry without its date.
    """
    dates_by_diagnosis = {}
    for entry in get_diagnoses(empi):
        recorded_on, key = entry[0], entry[1:]
        dates_by_diagnosis.setdefault(key, []).append(recorded_on)
    return dates_by_diagnosis
|
|
67 |
|
|
|
68 |
def get_chronic_diagnoses(empi, threshold_days):
    """Return the patient's diagnoses that span a long enough period.

    A diagnosis is kept when the time between its first and last entry in the
    per-diagnosis date list is strictly greater than threshold_days. The
    result is a list of diagnosis keys as produced by get_diagnosis_to_dates.
    """
    dates_by_diagnosis = get_diagnosis_to_dates(empi)
    limit_seconds = threshold_days * 24 * 60 * 60
    return [
        diagnosis
        for diagnosis, dates in dates_by_diagnosis.items()
        if (dates[-1] - dates[0]).total_seconds() > limit_seconds
    ]
|
|
81 |
|
|
|
82 |
def get_encounters_details(empi):
    """Print a summary of every 'Enc' record of the patient.

    Used in testing the Enc field to understand what subfields exist and what
    values they take. For each encounter the non-empty fields of interest and
    the number of populated Diagnosis_1..Diagnosis_9 slots are printed; the
    inpatient / outpatient totals follow at the end.
    """
    patient = loader.get_patient_by_EMPI(empi)
    fields_of_interest = ('Admit_Date', 'Inpatient_Outpatient', 'Discharge_Date', 'LOS_Days', 'DRG')
    for encounter in patient['Enc']:
        print('ENCOUNTER ' + encounter['Encounter_number'] + ':')
        for name in fields_of_interest:
            field_value = encounter[name]
            if field_value:
                print(name + ' = ' + str(field_value))
        populated_slots = sum(
            1 for slot in range(1, 10) if encounter['Diagnosis_' + str(slot)]
        )
        print('Extra Diagnoses = ' + str(populated_slots))
        print('')

    # Anything that is not exactly 'Inpatient' is counted as an outpatient.
    inpatient_flags = [
        encounter['Inpatient_Outpatient'] == 'Inpatient'
        for encounter in patient['Enc']
    ]
    print(str(inpatient_flags.count(True)) + ' Inpatients')
    print(str(inpatient_flags.count(False)) + ' Outpatients')
|
|
106 |
|
|
|
107 |
def get_encounters(empi):
    """Return the patient's encounters ordered by admit date.

    Each encounter with a non-empty Admit_Date becomes a tuple
    (admit date, Inpatient_Outpatient as str, discharge date, LOS_Days as int
    or 0 when empty, number of populated Diagnosis_1..Diagnosis_9 slots).
    Sorting uses the admit date only (since Discharge Date is not always
    recorded). The list is empty when the patient has no 'Enc' field.
    """
    patient = loader.get_patient_by_EMPI(empi)
    if 'Enc' not in patient.keys():
        return []
    rows = []
    for record in patient['Enc']:
        populated_slots = sum(
            1 for slot in range(1, 10) if record['Diagnosis_' + str(slot)]
        )
        if not record['Admit_Date']:
            continue
        admitted = extract_data.parse_date(record['Admit_Date'])
        setting = str(record['Inpatient_Outpatient'])
        discharged = extract_data.parse_date(record['Discharge_Date'])
        stay_days = int(record['LOS_Days']) if record['LOS_Days'] else 0
        rows.append((admitted, setting, discharged, stay_days, populated_slots))
    return sorted(rows, key=lambda row: row[0])  # just sort on Admit_Date
|
|
122 |
|
|
|
123 |
def get_labs_before_date(empi, date):
    """Summarize the patient's labs taken strictly before date.

    Only 'Lab' records with a non-empty Seq_Date_Time earlier than date are
    considered.

    Args:
        empi: patient identifier passed to loader.get_patient_by_EMPI.
        date: cutoff; must be comparable with extract_data.parse_date output.

    Returns:
        Four dicts keyed by lab Group_Id:
        total counts, counts flagged 'L' (below the norm), counts flagged 'H'
        (above the norm), and the latest (lab date, Abnormal_Flag) tuple.
        A group only appears in the low / high dicts if it has at least one
        such result. All four are empty when the patient has no 'Lab' field.
    """
    p = loader.get_patient_by_EMPI(empi)
    lab_counts = {}
    lab_lows = {}
    lab_highs = {}
    lab_latest = {}
    if 'Lab' not in p.keys():
        return lab_counts, lab_lows, lab_highs, lab_latest

    for lab in p['Lab']:
        if not lab['Seq_Date_Time']:
            continue
        # Parse once and reuse for both the cutoff test and the latest check.
        lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
        if not (lab_date < date):
            continue

        group = lab['Group_Id']
        flag = lab['Abnormal_Flag']
        lab_counts[group] = lab_counts.get(group, 0) + 1

        # Keep the most recent test; on equal dates the first one seen wins.
        if group not in lab_latest or lab_date > lab_latest[group][0]:
            lab_latest[group] = (lab_date, flag)

        if flag == 'L':
            lab_lows[group] = lab_lows.get(group, 0) + 1
        elif flag == 'H':
            lab_highs[group] = lab_highs.get(group, 0) + 1
    return lab_counts, lab_lows, lab_highs, lab_latest
|
|
159 |
|
|
|
160 |
def get_recent_lab_values(empi, date):
    """Return the most recent result of each lab group before date.

    Only 'Lab' records with a non-empty Seq_Date_Time earlier than date are
    considered.

    Args:
        empi: patient identifier passed to loader.get_patient_by_EMPI.
        date: cutoff; must be comparable with extract_data.parse_date output.

    Returns:
        A dict mapping lab Group_Id to the (lab date, Result) tuple of the
        latest qualifying record; empty when the patient has no 'Lab' field.
    """
    p = loader.get_patient_by_EMPI(empi)
    lab_latest = {}
    if 'Lab' not in p.keys():
        return lab_latest

    for lab in p['Lab']:
        if not lab['Seq_Date_Time']:
            continue
        # Parse once and reuse for both the cutoff test and the latest check.
        lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
        if not (lab_date < date):
            continue
        group = lab['Group_Id']
        # Keep the most recent test; on equal dates the first one seen wins.
        if group not in lab_latest or lab_date > lab_latest[group][0]:
            lab_latest[group] = (lab_date, lab['Result'])
    return lab_latest
|
|
174 |
|
|
|
175 |
|
|
|
176 |
|
|
|
177 |
def get_lab_history_before_date(empi, date, time_thresholds_months):
    """Summarize, per time period, whether each lab was mostly high or low.

    Only 'Lab' records with a non-empty Seq_Date_Time earlier than date are
    considered. Each lab is placed in the first period whose threshold (in
    months before date) is not exceeded by the lab's age; labs older than the
    last threshold are ignored.

    Args:
        empi: patient identifier passed to loader.get_patient_by_EMPI.
        date: reference date the periods are measured back from.
        time_thresholds_months: period boundaries in months before date.
            NOTE(review): the bucketing assumes ascending order - confirm
            against callers.

    Returns:
        A dict mapping lab Group_Id to a list of
        len(time_thresholds_months) entries, each 'H', 'L' or None: the most
        frequent flag in that period, where None covers every flag other than
        'H' / 'L'. Ties resolve in the order 'H', 'L', None. A period without
        any lab is reported as None. For example, 'BUN' => ['H', None, 'L']
        indicates a transition from low (L) to high (H) leading up to date.
    """
    p = loader.get_patient_by_EMPI(empi)
    seconds_in_month = 365 * 24 * 60 * 60 / 12
    values = ['H', 'L', None]
    num_periods = len(time_thresholds_months)

    # Per lab group: a 2-D array of counts.
    #   first dimension  = time period
    #   second dimension = counts of 'H', 'L', and None
    # e.g. [[15, 1, 2], ...] means 'H' was most frequent (15 times) in the
    # first period.
    lab_history_counts = {}
    if 'Lab' in p.keys():
        for lab in p['Lab']:
            if not lab['Seq_Date_Time']:
                continue
            # Parse once and reuse for the cutoff test and the age below.
            lab_date = extract_data.parse_date(lab['Seq_Date_Time'])
            if not (lab_date < date):
                continue

            value = lab['Abnormal_Flag'] if lab['Abnormal_Flag'] in ['H', 'L'] else None
            value_index = values.index(value)

            age_seconds = (date - lab_date).total_seconds()
            time_index = 0
            while (time_index < num_periods
                   and age_seconds > time_thresholds_months[time_index] * seconds_in_month):
                time_index += 1
            if time_index >= num_periods:
                continue  # older than the last threshold

            group = lab['Group_Id']
            if group not in lab_history_counts:
                lab_history_counts[group] = np.zeros([num_periods, len(values)])
            lab_history_counts[group][time_index][value_index] += 1

    lab_history = {}
    for lab_name, counts in lab_history_counts.items():
        # argmax of an all-zero row is 0, which would wrongly report 'H' for
        # a period without any lab, so such rows map to None explicitly.
        lab_history[lab_name] = [
            values[row.argmax()] if row.any() else None for row in counts
        ]
    return lab_history
|
|
213 |
|
|
|
214 |
if __name__ == "__main__":
    # Command-line entry point: <command> <EMPI>, where command is one of
    # 'diagnosis', 'encounter' or 'labs'.
    # get_supplemental_details('Supplemental')
    usage = "usage: " + sys.argv[0] + " <diagnosis|encounter|labs> <EMPI>\n"
    if len(sys.argv) < 3:
        sys.stderr.write(usage)
        sys.exit(1)
    command = sys.argv[1]
    empi = sys.argv[2]
    if command == 'diagnosis':
        diagnoses = get_diagnoses(empi)
        for d in diagnoses:
            print(d)
        date_to_diagnoses = get_date_to_diagnoses(empi)
        # A patient without diagnoses has no first / last entry to report.
        start_date = diagnoses[0][0] if diagnoses else None
        end_date = diagnoses[-1][0] if diagnoses else None
        print("~~~~~~~~~~~~~~~~")
        print("Start Date: " + str(start_date))
        print("End Date: " + str(end_date))
        print("Num. of Entries: " + str(len(diagnoses)))
        print("Num. of Visits: " + str(len(date_to_diagnoses)))
        # chronic_diagnoses = get_chronic_diagnoses(empi, 90)
        # print("Chronic Diagnoses: " + str(chronic_diagnoses))
    elif command == 'encounter':
        encounters = get_encounters(empi)
        for enc in encounters:
            print(enc)
        # get_encounters_details(empi)
    elif command == 'labs':
        # Alternative report, kept for reference:
        # lab_counts, lab_lows, lab_highs, lab_latest = get_labs_before_date(empi, extract_data.parse_date('11/16/2015'))
        # for lab in lab_counts:
        #     print(lab)
        #     print('COUNT: ' + str(lab_counts[lab]))
        #     print('LOWS: ' + str(lab_lows[lab]) if lab in lab_lows else 'LOWS: 0')
        #     print('HIGHS: ' + str(lab_highs[lab]) if lab in lab_highs else 'HIGHS: 0')
        #     print('LATEST: ' + str(lab_latest[lab]))
        #     print('')
        operation_date = build_graphs.get_operation_date(loader.get_patient_by_EMPI(empi))
        lab_values = get_recent_lab_values(empi, operation_date)
        for lab in lab_values:
            print(str(lab) + ": " + str(lab_values[lab]))
    else:
        sys.stderr.write(usage)
        sys.exit(1)