a b/tables.py
1
from collections import defaultdict, Counter
2
from datetime import date, datetime, timedelta
3
import numpy as np
4
5
from extract_data import get_operation_date, get_ef_values
6
from language_processing import parse_m_d_y
7
from loader import get_data
8
9
def get_baseline_lab_value(p, lab_types, procedure_date):
    """Return the first numeric result of a matching lab taken on the procedure date.

    Args:
        p: Patient record; ``p['Lab']`` is iterated as a sequence of lab
            entries exposing ``'Test_Description'``, ``'Seq_Date_Time'``
            (formatted ``MM/DD/YYYY HH:MM``) and ``'Result'``.
        lab_types: Collection of test descriptions that count as a match.
        procedure_date: ``datetime.date`` the lab's date must equal.

    Returns:
        The result converted to ``float``, or ``None`` when no matching lab
        on that date carries a numeric result.

    Raises:
        ValueError: If a matching lab's ``'Seq_Date_Time'`` does not follow
            the expected format.
    """
    for lab in p['Lab']:
        if lab['Test_Description'] not in lab_types:
            continue
        # Named lab_date so the imported ``date`` class is not shadowed.
        lab_date = datetime.strptime(lab['Seq_Date_Time'], "%m/%d/%Y %H:%M").date()
        if lab_date != procedure_date:
            continue
        try:
            return float(lab['Result'])
        except (KeyError, TypeError, ValueError):
            # A missing or non-numeric result (free text, None, ...) is
            # skipped so a later lab from the same day can still supply
            # the value instead of aborting the whole search.
            continue
    return None
19
20
def filter_out_post_procedure(documents, procedure_date, date_key):
    """Keep the documents dated on or before the procedure, ordered by date offset.

    Args:
        documents: Iterable of records; ``doc[date_key]`` is handed to
            ``parse_m_d_y`` to obtain the record's date.
        procedure_date: Date the parsed dates are compared against (the
            difference must expose ``.days``).
        date_key: Key under which each record stores its date.

    Returns:
        A list of ``(day_offset, document)`` tuples with ``day_offset <= 0``,
        sorted ascending by ``day_offset`` (most distant first, closest to
        the procedure last). Records with equal offsets keep their input
        order.
    """
    dated_docs = []
    for doc in documents:
        # Named doc_date so the imported ``date`` class is not shadowed.
        doc_date = parse_m_d_y(doc[date_key])
        p_delta = (doc_date - procedure_date).days
        if p_delta <= 0:
            dated_docs.append((p_delta, doc))
    # Sort on the offset only: comparing whole tuples would fall through to
    # comparing the documents themselves whenever two offsets tie, which is
    # arbitrary for dicts on Python 2 and a TypeError on Python 3.
    dated_docs.sort(key=lambda pair: pair[0])
    return dated_docs
28
29
30
def get_n_preprocedure_dia(diagnoses, procedure_date, n):
    """Return up to ``n`` diagnoses closest to (and not after) the procedure.

    Args:
        diagnoses: Iterable of diagnosis records carrying a ``'Date'`` entry.
        procedure_date: Date of the procedure; later diagnoses are dropped.
        n: Maximum number of diagnoses to return.

    Returns:
        A list of at most ``n`` diagnosis records, taken from the end of the
        offset-sorted pre-procedure list (i.e. the ones nearest the
        procedure). Empty when ``n`` is zero or negative.
    """
    if n <= 0:
        # ``d_list[-0:]`` is the whole list, so a request for zero items
        # must be answered explicitly.
        return []
    d_list = filter_out_post_procedure(diagnoses, procedure_date, 'Date')
    # Slicing already copes with n > len(d_list); a list (not ``map``) keeps
    # the result reusable on both Python 2 and 3.
    return [doc for (_, doc) in d_list[-n:]]
34
35
class Range(object):
    """Inclusive range of ICD-9 codes that compares equal to any code inside it.

    Because ``==`` is overridden as a containment check, a ``Range`` can sit
    in a plain list next to individual codes and ``code in that_list`` will
    match when ``code`` falls between ``low`` and ``high``.
    """

    def __init__(self, low, high):
        """Store both bounds (numbers or code strings) in numeric form."""
        self.low = self.icd9_to_float(low)
        self.high = self.icd9_to_float(high)

    def __repr__(self):
        return "Range(%r, %r)" % (self.low, self.high)

    def icd9_to_float(self, icd9):
        """Map an ICD-9 code to a float so codes can be ordered.

        Purely numeric codes convert directly. Otherwise the first character
        is treated as a letter prefix and the remainder as the number, giving
        ``ord(letter) * 1000 + number`` so prefixed codes sort apart from
        numeric ones.

        Raises:
            ValueError: If the part after the first character is not numeric.
            TypeError: If ``icd9`` is neither a number nor a string.
        """
        try:
            return float(icd9)
        except ValueError:
            extension = float(icd9[1:])
            letter = icd9[0]
            return ord(letter)*1000 + extension

    def __eq__(self, other):
        """Return True when ``other`` is a code lying within [low, high]."""
        try:
            value = self.icd9_to_float(other)
        except (TypeError, ValueError):
            # Anything that is not an ICD-9-like code (None, another Range,
            # a malformed string) simply is not in the range; raising here
            # would break ``code in [...]`` membership tests.
            return False
        return self.low <= value <= self.high

    def __ne__(self, other):
        return not self.__eq__(other)
51
52
# CPT codes per category. NOTE(review): the only consumer visible in this
# file is a disabled (string-quoted) branch of the statistics loop.
cpt = {
    'crt_out': [33224, 33225, 33226],
}

# ICD-9 codes per category. Entries are floats for numeric codes, strings for
# letter-prefixed codes, and Range objects for inclusive spans; membership is
# tested with ``code in icds[key]``.
icds = {
    'crt_in': [00.50, 00.51],
    'ischemic': [
        410.0, 410.01, 410.02,
        410.1, 410.10, 410.11, 410.12,
        410.2, 410.20, 410.21, 410.22,
        410.3, 410.30, 410.31, 410.32,
        410.4, 410.40, 410.41, 410.42,
        410.5, 410.50, 410.51, 410.52,
        410.6, 410.60, 410.61, 410.62,
        410.7, 410.70, 410.71, 410.72,
        410.8, 410.80, 410.81, 410.82,
        410.9, 410.90, 410.91, 410.92,
        411.0, 411.1, 411.8, 411.81, 411.89,
        412.0,
        413.0, 413.1, 413.9,
        414.0, 414.00, 414.01, 414.02, 414.03, 414.04, 414.05, 414.06, 414.07,
        414.1, 414.10, 414.11, 414.12, 414.19,
        414.2, 414.3, 414.4, 414.8, 414.9,
    ],
    'non-ischemic': [425.4],
    'arrhythmia': [427.1, 427.4, 427.41, 427.42, 427.5, 427.9],
    'lbbb': [426.3, 426.2, 426.51, 426.52, 426.53],
    'av_block': [426.0],
    'afib': [427.31],
    'cpd': [
        Range(490, 492.8),
        Range(493.00, 493.92),
        Range(494, 494.1),
        Range(495.0, 505),
        506.4,
    ],
    'diabetes': [
        Range(250.00, 250.33),
        Range(250.40, 250.93),
    ],
    'renal_disease': [
        403.01, 403.11, 403.91,
        404.02, 404.03, 404.12, 404.13, 404.92, 404.93,
        585, 586,
        'V42.0', 'V45.1',
        Range('V56.0', 'V56.2'),
        'V56.8',
    ],
}
67
68
def get_ef_delta(patient_data, after_threshold=365):
    """Pick a baseline and a follow-up EF value and return their difference.

    ``get_ef_values(patient_data)`` is expected to yield ``(rel_date,
    ef_value)`` pairs (presumably day offsets relative to the procedure --
    confirm against ``extract_data``). The baseline is the latest pair with
    ``rel_date <= 0``; the follow-up is the pair with ``rel_date > 0`` whose
    offset lies closest to ``after_threshold`` (the earlier one wins a tie).

    Args:
        patient_data: Patient record passed through to ``get_ef_values``.
        after_threshold: Target offset for the follow-up measurement.
            Defaults to 365, the value that used to be hard-coded.

    Returns:
        ``(after - before, before, after, before_date, after_date)``, or a
        tuple of five ``None`` values when either measurement is missing.
    """
    before = before_date = None
    after = after_date = None
    dist_from_thresh = float('inf')
    # Sorting puts the pairs in ascending rel_date order, so the last
    # non-positive offset seen is the one nearest the procedure.
    for (rel_date, ef_value) in sorted(get_ef_values(patient_data)):
        if rel_date <= 0:
            before, before_date = ef_value, rel_date
            continue
        dist = abs(rel_date - after_threshold)
        if dist < dist_from_thresh:
            after, after_date, dist_from_thresh = ef_value, rel_date, dist
    if before is None or after is None:
        return (None, None, None, None, None)
    return (after - before, before, after, before_date, after_date)
91
92
# Collect statistics
# Counters for each stage of the cohort filter, plus per-patient columns in
# ``stats`` (one list per column, one entry per patient that passes every
# filter below).
total = 1056
has_procedure = 0
has_baseline = 0
has_followup = 0
no_baseline = []
stats = defaultdict(list)

# NOTE(review): range(total - 1) stops one index short of ``total`` --
# confirm whether the last patient is skipped on purpose.
for i in range(total - 1):
    p = get_data([i])[0]
    print(str(i) + " - " + p['EMPI'])

    procedure_date = get_operation_date(p)
    if not procedure_date:
        continue
    has_procedure += 1

    (ef_delta, baseline_ef, followup_ef, baseline_date, followup_date) = get_ef_delta(p)
    if not baseline_ef:
        no_baseline.append(p['EMPI'])
        continue
    # Baseline must be more recent than offset -60.
    if not (baseline_date > -60):
        continue
    has_baseline += 1
    # Follow-up must fall strictly between offsets 100 and 500.
    if not (followup_date > 100 and followup_date < 500):
        continue
    has_followup += 1

    stats['procedure_date'].append(procedure_date)
    stats['baseline_days'].append(baseline_date)
    stats['followup_days'].append(followup_date)
    stats['baseline_lvef'].append(baseline_ef)
    stats['lvef_followup'].append(followup_ef)
    stats['lvef_change'].append(ef_delta)

    stats['sex'].append(p['Gender'])
    pre_procedure_encounters = filter_out_post_procedure(p['Enc'], procedure_date, 'Admit_Date')
    stats['n_enc'].append(len(pre_procedure_encounters))

    death_raw = p['Date_Of_Death']
    if death_raw:
        died = (parse_m_d_y(death_raw) - procedure_date) < timedelta(365)
    else:
        died = False
    stats['died_in_year'].append(died)

    # Flag every ICD-9 category hit by one of the last 76 pre-procedure
    # diagnoses. (Matching CPT codes against ``cpt`` is currently disabled.)
    categories_present = set()
    for d in get_n_preprocedure_dia(p['Dia'], procedure_date, 76):
        if d['Code_Type'] != 'ICD9':
            continue
        try:
            code = float(d['Code'])
        except ValueError:
            # Letter-prefixed codes stay as strings.
            code = d['Code']
        for key in icds:
            if code in icds[key]:
                categories_present.add(key)
    for key in icds:
        stats[key].append(key in categories_present)

    baseline_labs = (
        ('baseline_creatinine', ['Plasma Creatinine', 'Creatinine']),
        ('baseline_sodium', ['Plasma Sodium']),
        ('baseline_hgb', ['HGB']),
    )
    for stat_name, lab_names in baseline_labs:
        stats[stat_name].append(get_baseline_lab_value(p, lab_names, procedure_date))
158
159
def _fraction(values, key=True):
    """Return the share of entries in ``values`` equal to ``key`` as a float."""
    counts = Counter(values)
    return counts[key] / float(sum(counts.values()))


def _print_mean_sd(label, values):
    """Print ``label: mean (standard deviation)`` for ``values``."""
    print(label + ": " + str(np.mean(values)) + " (" + str(np.std(values)) + ")")


# Cohort funnel.
print("Total: " + str(total))
print("Has Procedure: " + str(has_procedure))
print("Has Baseline: " + str(has_baseline))
print("Has Follow up: " + str(has_followup))
print("No Baseline:")
print(no_baseline)

print("Demographics:")
print("Num: " + str(len(stats['procedure_date'])))
print("Male: " + str(_fraction(stats['sex'], "Male")))

print("\nMGH Care:")
# Interquartile range: 75th percentile minus 25th percentile.
iqr = np.subtract(*np.percentile(stats['n_enc'], [75, 25]))
print("Median Pre-Procedure Encounters: " + str(np.median(stats['n_enc'])) + " (" + str(iqr) + ")")

print("\nDiagnoses:")
for label, column in (
    ("Ischemic", 'ischemic'),
    ("Non-Ischemic", 'non-ischemic'),
    ("lbbb", 'lbbb'),
    ("arrhythmia", 'arrhythmia'),
    ("av_block", 'av_block'),
    ("afib", 'afib'),
    ("crt_in", 'crt_in'),
):
    print(label + ": " + str(_fraction(stats[column])))

print("\nComorbidities:")
for column in ('cpd', 'diabetes', 'renal_disease'):
    print(column + ": " + str(_fraction(stats[column])))

print("\nBaseline Data:")
# Falsy entries (None and 0) are excluded from these averages.
for label, column in (
    ("LVEF", 'baseline_lvef'),
    ("Creatinine", 'baseline_creatinine'),
    ("Sodium", 'baseline_sodium'),
    ("HGB", 'baseline_hgb'),
):
    _print_mean_sd(label, [value for value in stats[column] if value])

print("\nMedications:")

print("\nYear:")
procedure_dates = stats['procedure_date']
start_2009 = date(2009, 1, 1)
start_2013 = date(2013, 1, 1)
print("Earliest: " + str(sorted(procedure_dates)[:10]))
print("Latest: " + str(max(procedure_dates)))
print("Pre-2009: " + str(_fraction([when < start_2009 for when in procedure_dates])))
print("2009-2012: " + str(_fraction(
    [when >= start_2009 and when < start_2013 for when in procedure_dates])))
print("post-2012: " + str(_fraction([when >= start_2013 for when in procedure_dates])))

print("\nTable 2")
base_lvef_days = [value for value in stats['baseline_days'] if value is not None]
_print_mean_sd("Baseline Days", base_lvef_days)
lvef_days = [value for value in stats['followup_days'] if value is not None]
_print_mean_sd("Followup Days", lvef_days)
lvef_followup = [value for value in stats['lvef_followup'] if value is not None]
_print_mean_sd("Followup LVEF", lvef_followup)
# ``lvef_change`` is read again by the responder tally later in the script.
lvef_change = [value for value in stats['lvef_change'] if value is not None]
_print_mean_sd("LVEF Change", lvef_change)
237
def change_to_response(x):
    """Classify an LVEF change ``x`` into a response category.

    Returns "Non-Responder" when ``x < 5``, "Responder" when ``x < 15``,
    and "Super-Responder" otherwise.
    """
    if x < 5:
        return "Non-Responder"
    return "Responder" if x < 15 else "Super-Responder"
244
# Share of patients in each response category, then one-year mortality.
lvef_response = Counter(change_to_response(delta) for delta in lvef_change)
n_classified = float(sum(lvef_response.values()))
for category in ("Non-Responder", "Responder", "Super-Responder"):
    print(category + ": " + str(lvef_response[category] / n_classified))

died_in_year = Counter(stats['died_in_year'])
n_patients = float(sum(died_in_year.values()))
print("Died within 1 year: " + str(died_in_year[True] / n_patients))