b/tables.py
+from collections import defaultdict, Counter
+from datetime import date, datetime, timedelta
+import numpy as np
+from extract_data import get_operation_date, get_ef_values
+from language_processing import parse_m_d_y
+from loader import get_data
+def get_baseline_lab_value(p, lab_types, procedure_date):
+    for lab in p['Lab']:
+        if lab['Test_Description'] in lab_types:
+            date = datetime.strptime(lab['Seq_Date_Time'], "%m/%d/%Y %H:%M").date()
+            if date == procedure_date:
+                try:
+                    return float(lab['Result'])
+                except:
+                    return None
+    return None
+def filter_out_post_procedure(documents, procedure_date, date_key):
+    doc_list = []
+    for doc in documents:
+        date = parse_m_d_y(doc[date_key])
+        p_delta = (date - procedure_date).days
+        if p_delta <= 0:
+            doc_list.append((p_delta, doc))
+    return sorted(doc_list)
+def get_n_preprocedure_dia(diagnoses, procedure_date, n):
+    d_list = filter_out_post_procedure(diagnoses, procedure_date, 'Date')
+    n = min(n, len(d_list))
+    return map(lambda x: x[1], d_list[-1*n:])
+class Range:
+    def __init__(self, low, high):
+        self.low = self.icd9_to_float(low)
+        self.high = self.icd9_to_float(high)
+    def icd9_to_float(self, icd9):
+        try:
+            return float(icd9)
+        except ValueError:
+            extension = float(icd9[1:])
+            letter = icd9[0]
+            return ord(letter)*1000 + extension
+    def __eq__(self, other):
+        other = self.icd9_to_float(other)
+        return other >= self.low and other <= self.high
+    def __ne__(self, other):
+        return not self.__eq__(other)
+cpt = {
+    'crt_out': [33224, 33225, 33226]
+}
+icds = {
+    'crt_in': [00.50, 00.51],
+    'ischemic': [410.0, 410.01, 410.02, 410.1, 410.10, 410.11, 410.12, 410.2, 410.20, 410.21, 410.22, 410.3, 410.30, 410.31, 410.32, 410.4, 410.40, 410.41, 410.42, 410.5, 410.50, 410.51, 410.52, 410.6, 410.60, 410.61, 410.62, 410.7, 410.70, 410.71, 410.72, 410.8, 410.80, 410.81, 410.82, 410.9, 410.90, 410.91, 410.92, 411.0, 411.1, 411.8, 411.81, 411.89, 412.0, 413.0, 413.1, 413.9, 414.0, 414.00, 414.01, 414.02, 414.03, 414.04, 414.05, 414.06, 414.07, 414.1, 414.10, 414.11, 414.12, 414.19, 414.2, 414.3, 414.4, 414.8, 414.9],
+    'non-ischemic': [425.4],
+    'arrhythmia': [427.1, 427.4, 427.41, 427.42, 427.5, 427.9],
+    'lbbb': [426.3, 426.2, 426.51, 426.52, 426.53],
+    'av_block': [426.0],
+    'afib': [427.31],
+    'cpd': [Range(490, 492.8), Range(493.00, 493.92), Range(494, 494.1), Range(495.0, 505), 506.4],
+    'diabetes': [Range(250.00, 250.33), Range(250.40, 250.93)],
+    'renal_disease': [403.01, 403.11, 403.91, 404.02, 404.03, 404.12, 404.13, 404.92, 404.93, 585, 586, 'V42.0', 'V45.1', Range('V56.0', 'V56.2'), 'V56.8']
+}
+def get_ef_delta(patient_data):
+    after_threshold = 365
+    ef_values = get_ef_values(patient_data)
+    sorted_ef = sorted(ef_values)
+    before = None
+    before_date = None
+    after = None
+    after_date = None
+    dist_from_thresh = float('inf')
+    for (rel_date, ef_value) in sorted_ef:
+        if rel_date <= 0:
+            before = ef_value
+            before_date = rel_date
+        else:
+            dist = abs(rel_date - after_threshold)
+            if dist < dist_from_thresh:
+                after = ef_value
+                after_date = rel_date
+                dist_from_thresh = dist
+    if before is not None and after is not None:
+        return (after - before, before, after, before_date, after_date)
+    else:
+        return (None, None, None, None, None)
+# Collect statistics
+has_procedure = 0
+has_baseline = 0
+no_baseline = []
+has_followup = 0
+stats = defaultdict(list)
+total = 1056
+for i in range(total - 1):
+    p = get_data([i])[0]
+    print str(i) + " - " + p['EMPI']
+    procedure_date = get_operation_date(p)
+    if procedure_date:
+        has_procedure += 1
+        (ef_delta, baseline_ef, followup_ef, baseline_date, followup_date) = get_ef_delta(p)
+        if not baseline_ef:
+            no_baseline.append(p['EMPI'])
+        if baseline_ef and baseline_date > -60:
+            has_baseline += 1
+            if followup_date > 100 and followup_date < 500:
+                has_followup += 1
+                stats['procedure_date'].append(procedure_date)
+                stats['baseline_days'].append(baseline_date)
+                stats['followup_days'].append(followup_date)
+                stats['baseline_lvef'].append(baseline_ef)
+                stats['lvef_followup'].append(followup_ef)
+                stats['lvef_change'].append(ef_delta)
+                stats['sex'].append(p['Gender'])
+                stats['n_enc'].append(len(filter_out_post_procedure(p['Enc'], procedure_date, 'Admit_Date')))
+                if p['Date_Of_Death']:
+                    death_date = parse_m_d_y(p['Date_Of_Death'])
+                    stats['died_in_year'].append((death_date - procedure_date) < timedelta(365))
+                else:
+                    stats['died_in_year'].append(False)
+                dia = get_n_preprocedure_dia(p['Dia'], procedure_date, 76)
+                icd_present = defaultdict(lambda : False)
+                for d in dia:
+                    if d['Code_Type'] == 'ICD9':
+                        try:
+                            code = float(d['Code'])
+                        except ValueError:
+                            code = d['Code']
+                        for key in icds.keys():
+                            if code in icds[key]:
+                                icd_present[key] = True
+                    """
+                    elif d['Code_Type'] == 'CPT':
+                        try:
+                            code = float(d['Code'])
+                            for key in cpt.keys():
+                                if code in cpt[key]:
+                                    icd_present[key] = True
+                        except:
+                            pass
+                    """
+                for key in icds.keys():
+                    stats[key].append(icd_present[key])
+                stats['baseline_creatinine'].append(get_baseline_lab_value(p, ['Plasma Creatinine', 'Creatinine'], procedure_date))
+                stats['baseline_sodium'].append(get_baseline_lab_value(p, ['Plasma Sodium'], procedure_date))
+                stats['baseline_hgb'].append(get_baseline_lab_value(p, ['HGB'], procedure_date))
+print "Total: " + str(total)
+print "Has Procedure: " + str(has_procedure)
+print "Has Baseline: " + str(has_baseline)
+print "Has Follow up: " + str(has_followup)
+print "No Baseline:"
+print no_baseline
+print "Demographics:"
+print "Num: " + str(len(stats['procedure_date']))
+sex = Counter(stats['sex'])
+print "Male: " + str(sex["Male"]/float(sum(sex.values())))
+print "\nMGH Care:"
+iqr = np.subtract(*np.percentile(stats['n_enc'], [75, 25]))
+print "Median Pre-Procedure Encounters: " + str(np.median(stats['n_enc'])) + " (" + str(iqr) + ")"
+print "\nDiagnoses:"
+ischemic = Counter(stats['ischemic'])
+print "Ischemic: " + str(ischemic[True]/float(sum(ischemic.values())))
+nonischemic = Counter(stats['non-ischemic'])
+print "Non-Ischemic: " + str(nonischemic[True]/float(sum(nonischemic.values())))
+lbbb = Counter(stats['lbbb'])
+print "lbbb: " + str(lbbb[True]/float(sum(lbbb.values())))
+arrhythmia = Counter(stats['arrhythmia'])
+print "arrhythmia: " + str(arrhythmia[True]/float(sum(arrhythmia.values())))
+av_block = Counter(stats['av_block'])
+print "av_block: " + str(av_block[True]/float(sum(av_block.values())))
+afib = Counter(stats['afib'])
+print "afib: " + str(afib[True]/float(sum(afib.values())))
+crt_in = Counter(stats['crt_in'])
+print "crt_in: " + str(crt_in[True]/float(sum(crt_in.values())))
+#crt_out = Counter(stats['crt_out'])
+#print "crt_out: " + str(crt_out[True]/float(sum(crt_out.values())))
+print "\nComorbidities:"
+cpd = Counter(stats['cpd'])
+print "cpd: " + str(cpd[True]/float(sum(cpd.values())))
+diabetes = Counter(stats['diabetes'])
+print "diabetes: " + str(diabetes[True]/float(sum(diabetes.values())))
+renal_disease = Counter(stats['renal_disease'])
+print "renal_disease: " + str(renal_disease[True]/float(sum(renal_disease.values())))
+#iqr = np.subtract(*np.percentile(x, [75, 25]))
+print "\nBaseline Data:"
+lvef_array = filter(lambda x: bool(x), stats['baseline_lvef'])
+print "LVEF: " + str(np.mean(lvef_array)) + " (" + str(np.std(lvef_array)) + ")"
+creatinine_array = filter(lambda x: bool(x), stats['baseline_creatinine'])
+print "Creatinine: " + str(np.mean(creatinine_array)) + " (" + str(np.std(creatinine_array)) + ")"
+sodium_array = filter(lambda x: bool(x), stats['baseline_sodium'])
+print "Sodium: " + str(np.mean(sodium_array)) + " (" + str(np.std(sodium_array)) + ")"
+hgb_array = filter(lambda x: bool(x), stats['baseline_hgb'])
+print "HGB: " + str(np.mean(hgb_array)) + " (" + str(np.std(hgb_array)) + ")"
+print "\nMedications:"
+print "\nYear:"
+print "Earliest: " + str(sorted(stats['procedure_date'])[:10])
+print "Latest: " + str(max(stats['procedure_date']))
+pre_2009 = Counter(map(lambda x: x < date(2009, 1, 1), stats['procedure_date']))
+print "Pre-2009: " + str(pre_2009[True]/float(sum(pre_2009.values())))
+p_2009_2012 = Counter(map(lambda x: x >= date(2009, 1, 1) and x < date(2013, 1, 1), stats['procedure_date']))
+print "2009-2012: " + str(p_2009_2012[True]/float(sum(p_2009_2012.values())))
+p_2012 = Counter(map(lambda x: x >= date(2013, 1, 1), stats['procedure_date']))
+print "post-2012: " + str(p_2012[True]/float(sum(p_2012.values())))
+print "\nTable 2"
+base_lvef_days = filter(lambda x: x is not None, stats['baseline_days'])
+print "Baseline Days: " + str(np.mean(base_lvef_days)) + " (" + str(np.std(base_lvef_days)) + ")"
+#print base_lvef_days
+lvef_days = filter(lambda x: x is not None, stats['followup_days'])
+#print lvef_days
+print "Followup Days: " + str(np.mean(lvef_days)) + " (" + str(np.std(lvef_days)) + ")"
+lvef_followup = filter(lambda x: x is not None, stats['lvef_followup'])
+print "Followup LVEF: " + str(np.mean(lvef_followup)) + " (" + str(np.std(lvef_followup)) + ")"
+lvef_change = filter(lambda x: x is not None, stats['lvef_change'])
+print "LVEF Change: " + str(np.mean(lvef_change)) + " (" + str(np.std(lvef_change)) + ")"
+def change_to_response(x):
+    if x < 5:
+        return "Non-Responder"
+    elif x < 15:
+        return "Responder"
+    else:
+        return "Super-Responder"
+lvef_response = Counter(map(change_to_response, lvef_change))
+print "Non-Responder: " + str(lvef_response['Non-Responder']/float(sum(lvef_response.values())))
+print "Responder: " + str(lvef_response['Responder']/float(sum(lvef_response.values())))
+print "Super-Responder: " + str(lvef_response['Super-Responder']/float(sum(lvef_response.values())))
+died_in_year = Counter(stats['died_in_year'])
+print "Died within 1 year: " + str(died_in_year[True]/float(sum(died_in_year.values())))