a b/baseline_transformer.py
1
import datetime
2
3
import numpy as np
4
from sklearn.base import TransformerMixin
5
from sklearn.feature_extraction.text import TfidfVectorizer 
6
7
import structured_data_extractor
8
import build_graphs
9
import loader
10
import extract_data
11
import language_processing
12
13
class SexTransformer(TransformerMixin):
    """Encode each patient's recorded gender as a binary "female" indicator.

    Input is an iterable of EMPI identifiers. Output is a single-column
    ``np.matrix`` holding 1 for patients whose gender entry equals
    ``'Female\\r\\n'`` and 0 for everyone else, including patients that
    have no gender entry at all.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return an (n_samples, 1) matrix of female indicators for the EMPIs in X."""
        # Materialise a list: on Python 3 ``map`` is lazy and np.matrix
        # cannot build a numeric matrix from the iterator object.
        indicators = [self.get_sex(empi) for empi in X]
        return np.matrix(indicators).transpose()

    def get_sex(self, empi):
        """Return 1 if the patient with this EMPI is recorded as female, else 0."""
        person = loader.get_patient_by_EMPI(empi)
        # The presence check must use the same key as the lookup; checking a
        # different key either raises KeyError or silently reports 0.
        if 'Gender' not in person:
            return 0
        # The stored value is compared including its trailing CRLF
        # (presumably kept from the raw export -- confirm against loader).
        return int(person['Gender'] == 'Female\r\n')

    def get_feature_names(self):
        """Return the name of the single output column."""
        return ["sex_female"]
34
35
class GetConcatenatedNotesTransformer(TransformerMixin):
    """Concatenate each patient's pre-procedure notes of one note type.

    Parameters
    ----------
    note_type : the note category key to read from the patient record
        (e.g. 'Car' or 'Lno').
    look_back_months : optional number of months; when set (and non-zero),
        notes dated more than this long before the operation are dropped.
        A month is taken to be 365/12 days.

    For each EMPI in the input, ``transform`` yields one string: the
    qualifying notes' ``free_text`` joined by blank lines, in the order they
    appear in the patient record.
    """

    def __init__(self, note_type, look_back_months=None):
        self.type = note_type
        self.look_back_months = look_back_months

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one concatenated-notes string per EMPI in X."""
        # A list (not a lazy ``map`` object) so downstream steps can take
        # len() of the result or iterate it more than once on Python 3.
        return [self.get_concatenated_notes(empi) for empi in X]

    def get_concatenated_notes(self, empi):
        """Return the joined text of this patient's notes dated before the operation.

        Returns the empty string when the patient has no notes of the
        configured type or no date key is known for that type.
        """
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        date_key = extract_data.get_date_key(self.type)
        if self.type not in person or date_key is None:
            return ''

        # Look-back window in seconds, computed once; None means "no limit".
        max_age_seconds = None
        if self.look_back_months:
            sec_per_month = 24 * 60 * 60 * (365.0 / 12)
            max_age_seconds = self.look_back_months * sec_per_month

        notes = []
        for doc in person[self.type]:
            date = extract_data.parse_date(doc[date_key])
            # Only notes with a parseable date strictly before the operation.
            if date is None or not date < operation_date:
                continue
            if (max_age_seconds is not None
                    and (operation_date - date).total_seconds() > max_age_seconds):
                continue
            notes.append(doc['free_text'])
        return '\n\n'.join(notes)
65
66
class GetLatestNotesTransformer(TransformerMixin):
    """Return, per patient, the ``max_notes`` notes closest before the procedure.

    Like the concatenating transformer, but the notes are returned as an
    array instead of one joined string, ordered from the note nearest the
    procedure date going back in time. For example, with ``max_notes=1`` each
    patient maps to a single-element array holding the text of the note
    closest to, but not on or after, the procedure date. Patients with fewer
    qualifying notes are padded with empty strings so every array has
    exactly ``max_notes`` entries.
    """

    def __init__(self, note_type, max_notes):
        self.type = note_type
        self.max_notes = max_notes

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one array of note texts per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_latest_concatenated_notes(empi) for empi in X]

    def get_feature_names(self):
        """Return one name per note slot: latest_note_0 .. latest_note_{max_notes-1}."""
        return np.array(['latest_note_' + str(i) for i in range(self.max_notes)])

    def get_latest_concatenated_notes(self, empi):
        """Return an array of the patient's latest pre-operation note texts.

        The array has ``max_notes`` entries, nearest note first, padded with
        '' when fewer notes qualify (or none exist for the configured type).
        """
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        date_key = extract_data.get_date_key(self.type)
        notes = []
        if self.type in person and date_key is not None:
            docs = person[self.type]
            # (time before operation, position) pairs; sorting puts the most
            # recent note first and breaks ties by original position.
            age_index_pairs = []
            for index, doc in enumerate(docs):
                date = extract_data.parse_date(doc[date_key])
                if date is not None and date < operation_date:
                    age_index_pairs.append((operation_date - date, index))
            age_index_pairs.sort()
            notes = [docs[index]['free_text']
                     for _, index in age_index_pairs[:self.max_notes]]
        # Pad so the vector length always equals max_notes (a non-positive
        # repeat count adds nothing).
        notes.extend([''] * (self.max_notes - len(notes)))
        return np.array(notes)
110
111
class DocumentConcatenatorTransformer(TransformerMixin):
    """Join each sample's sequence of documents into one blank-line-separated string."""

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one joined string per element of X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.concatenate_notes(docs) for docs in X]

    def concatenate_notes(self, docs):
        """Join an iterable of strings with a blank line between each."""
        return '\n\n'.join(docs)
121
122
class MultiDocTfidfTransformer(TransformerMixin):
    """TF-IDF encode samples that each consist of several documents.

    Every sample is a sequence of strings. One shared ``TfidfVectorizer`` is
    fitted on all samples (each sample's documents joined into one text), so
    the TF-IDF weighting is global. At transform time each document of a
    sample is vectorised separately and the per-document vectors are
    flattened into a single row.
    """

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        # Number of documents per sample, learned from the first sample in fit().
        self.vec_size = 0

    def fit(self, X, y=None, **fit_params):
        """Fit the shared vectorizer on X and remember the documents-per-sample count."""
        self.vec_size = len(X[0])
        self.tfidf.fit(['\n\n'.join(docs) for docs in X])
        return self

    def transform(self, X, **transform_params):
        """Return a list with one flattened TF-IDF vector per sample in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.tfidf.transform(docs).toarray().flatten() for docs in X]

    def get_feature_names(self):
        """Return the vocabulary repeated once per document slot, as a flat array."""
        if self.vec_size == 0:
            # Nothing fitted yet (or empty samples): no columns, and the
            # vectorizer is not queried.
            return np.array([])
        # Newer scikit-learn releases expose get_feature_names_out() in place
        # of get_feature_names(); support whichever is available.
        if hasattr(self.tfidf, 'get_feature_names_out'):
            vocabulary = self.tfidf.get_feature_names_out()
        else:
            vocabulary = self.tfidf.get_feature_names()
        # The vocabulary is identical for every slot, so fetch it once.
        return np.array([vocabulary] * self.vec_size).flatten()
143
144
145
146
class GetEncountersFeaturesTransformer(TransformerMixin):
    """Build a feature vector per patient from their pre-operation encounters.

    The vector has two parts:

    (a) three features for each of the ``max_encounters`` encounters nearest
        before the operation (inpatient flag, length of stay, number of extra
        diagnoses), nearest encounter first, zero-padded when the patient
        has fewer encounters;
    (b) three aggregate features over those same tracked encounters:
        inpatient ratio, average length of stay and average number of extra
        diagnoses.

    With ``only_general=True`` only part (b) is returned.
    """

    def __init__(self, max_encounters, only_general=False):
        self.max_encounters = max_encounters
        self.only_general = only_general

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one feature array per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_encounters_features(empi) for empi in X]

    def get_feature_names(self):
        """Return the column names matching the vectors produced by transform."""
        general_names = [
            'Enc_Inpatient_Ratio',
            'Enc_Average_LOS',
            'Enc_Average_Extra_Diagnoses',
        ]
        # Keep names aligned with the columns actually emitted.
        if self.only_general:
            return np.array(general_names)
        names = []
        for i in range(self.max_encounters):
            suffix = str(i)
            names.append('Inpatient_Outpatient_Enc_' + suffix)
            names.append('LOS_Enc_' + suffix)
            names.append('Num_Extra_Diagnoses_Enc_' + suffix)
        return np.array(names + general_names)

    def get_encounters_features(self, empi):
        """Return the encounter feature array for one patient.

        Each encounter record is read positionally: index 0 is compared with
        the operation date, index 1 with 'Inpatient', index 3 is used as
        length of stay and index 4 as the extra-diagnoses count.
        """
        encounters = structured_data_extractor.get_encounters(empi)
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)

        # Keep only the leading run of encounters dated before the operation.
        # NOTE(review): this relies on encounters arriving in chronological
        # order -- confirm against structured_data_extractor.
        operation_index = 0
        for enc in encounters:
            if enc[0] < operation_date:
                operation_index += 1
            else:
                break
        encounters = encounters[:operation_index]

        # INDIVIDUAL ENCOUNTER FEATURES (3 x max_encounters)
        num_tracked = min(self.max_encounters, len(encounters))
        # Reversed so the encounter closest to the operation comes first.
        tracked_encounters = encounters[::-1][:num_tracked]

        features = []
        inpatients = 0
        total_LOS = 0
        total_extra_diagnoses = 0
        for enc in tracked_encounters:
            # Feature 1 - inpatient (1) vs. anything else (0)
            is_inpatient = enc[1] == 'Inpatient'
            features.append(1 if is_inpatient else 0)
            if is_inpatient:
                inpatients += 1
            # Feature 2 - length of stay; values of 1 or less count as 0
            if enc[3] > 1:
                features.append(enc[3])
                total_LOS += enc[3]
            else:
                features.append(0)
            # Feature 3 - number of extra diagnoses
            features.append(enc[4])
            total_extra_diagnoses += enc[4]

        # Zero-pad so the per-encounter part is always 3 x max_encounters long.
        features.extend([0] * (3 * (self.max_encounters - num_tracked)))

        # OVERALL ENCOUNTER FEATURES (3)
        # float() forces true division so ratios are not truncated to whole
        # numbers under Python 2 integer division.
        tracked_count = len(tracked_encounters)
        # Overall feature 1 - inpatient ratio
        features.append(float(inpatients) / tracked_count if tracked_count > 0 else 0)
        # Overall feature 2 - average LOS, taken over inpatient encounters
        features.append(float(total_LOS) / inpatients if inpatients > 0 else 0)
        # Overall feature 3 - average number of extra diagnoses
        features.append(
            float(total_extra_diagnoses) / tracked_count if tracked_count > 0 else 0)

        if self.only_general:
            features = features[-3:]
        return np.array(features)
237
238
class GetLabsCountsDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> number of times the
    patient received that test before the operation.

    The output is meant to be fed into a DictVectorizer.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one lab-count dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_labs_counts(empi) for empi in X]

    def get_labs_counts(self, empi):
        """Return the first element of get_labs_before_date() for this patient."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        labs_summary = structured_data_extractor.get_labs_before_date(empi, operation_date)
        return labs_summary[0]
253
254
class GetLabsLowCountsDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> number of times the
    patient received that test before the operation with a low result.

    The output is meant to be fed into a DictVectorizer.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one low-result-count dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_low_counts(empi) for empi in X]

    def get_low_counts(self, empi):
        """Return the second element of get_labs_before_date() for this patient."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        labs_summary = structured_data_extractor.get_labs_before_date(empi, operation_date)
        return labs_summary[1]
269
270
class GetLabsHighCountsDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> number of times the
    patient received that test before the operation with a high result.

    The output is meant to be fed into a DictVectorizer.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one high-result-count dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_high_counts(empi) for empi in X]

    def get_high_counts(self, empi):
        """Return the third element of get_labs_before_date() for this patient."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        labs_summary = structured_data_extractor.get_labs_before_date(empi, operation_date)
        return labs_summary[2]
285
286
class GetLabsLatestHighDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> 1/0 flag telling
    whether the result was high the last time the patient received that test
    before the procedure.

    The output is meant to be fed into a DictVectorizer.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one latest-high dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_labs_latest_high(empi) for empi in X]

    def get_labs_latest_high(self, empi):
        """Return {lab: 1 if its latest pre-operation result is flagged 'H' else 0}."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        # Fourth element of the summary holds the latest entry per lab; the
        # entry's second field is the flag compared against 'H'.
        labs_latest = structured_data_extractor.get_labs_before_date(empi, operation_date)[3]
        return {
            lab: 1 if labs_latest[lab][1] == 'H' else 0
            for lab in labs_latest
        }
308
309
class GetLabsLatestLowDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> 1/0 flag telling
    whether the result was low the last time the patient received that test
    before the procedure.

    The output is meant to be fed into a DictVectorizer.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one latest-low dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_labs_latest_low(empi) for empi in X]

    def get_labs_latest_low(self, empi):
        """Return {lab: 1 if its latest pre-operation result is flagged 'L' else 0}."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        # Fourth element of the summary holds the latest entry per lab; the
        # entry's second field is the flag compared against 'L'.
        labs_latest = structured_data_extractor.get_labs_before_date(empi, operation_date)[3]
        return {
            lab: 1 if labs_latest[lab][1] == 'L' else 0
            for lab in labs_latest
        }
331
332
class GetLabsHistoryDictTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of high/low lab flags at several look-back times.

    Keys are the lab test name, 'H' (high) or 'L' (low), and the look-back
    threshold in months joined by underscores; e.g. "NA_H_6" tells whether
    the NA test result around 6 months before the procedure was high. Values
    are 1 when the result matches the key's flag and 0 otherwise.

    The output is meant to be fed into a DictVectorizer.

    Parameters
    ----------
    time_thresholds_months : sequence of look-back thresholds (in months),
        passed through to ``get_lab_history_before_date``.
    """

    def __init__(self, time_thresholds_months):
        self.time_thresholds_months = time_thresholds_months

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one lab-history dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_labs_history(empi) for empi in X]

    def get_labs_history(self, empi):
        """Return the '<lab>_<H|L>_<months>' -> 0/1 dictionary for one patient."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        lab_history = structured_data_extractor.get_lab_history_before_date(
            empi, operation_date, self.time_thresholds_months)
        flags = {}
        for lab in lab_history:
            # lab_history[lab][i] is the result recorded for the i-th threshold.
            for position, months in enumerate(self.time_thresholds_months):
                result = lab_history[lab][position]
                suffix = str(months)
                flags[lab + '_H_' + suffix] = 1 if result == 'H' else 0
                flags[lab + '_L_' + suffix] = 1 if result == 'L' else 0
        return flags
358
359
class GetLatestLabValuesTransformer(TransformerMixin):
    """Map each EMPI to a dictionary of lab test name -> most recent value
    before the operation.

    Values that parse as numbers are stored as floats; anything else is kept
    unchanged. Labs whose value is empty/falsy are omitted.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one latest-values dictionary per EMPI in X."""
        # A list (not a lazy ``map`` object) so the result can be measured
        # and re-iterated on Python 3.
        return [self.get_latest_lab_values(empi) for empi in X]

    def get_latest_lab_values(self, empi):
        """Return {lab: latest value} for one patient, numeric where possible."""
        person = loader.get_patient_by_EMPI(empi)
        operation_date = build_graphs.get_operation_date(person)
        latest_labs = structured_data_extractor.get_recent_lab_values(empi, operation_date)
        latest_lab_values = {}
        for lab in latest_labs:
            # The value is the second field of each lab entry.
            raw_value = latest_labs[lab][1]
            if not raw_value:
                continue
            try:
                latest_lab_values[lab] = float(raw_value)
            except (TypeError, ValueError):
                # Not a number (e.g. a textual result): keep it as recorded.
                # Only conversion failures are caught so unrelated errors
                # and interrupts still propagate.
                latest_lab_values[lab] = raw_value
        return latest_lab_values
379
 
380
if __name__ == '__main__':
    # Manual smoke check: print the latest lab values of one sample patient.
    transformer = GetLatestLabValuesTransformer()
    latest_values = transformer.get_latest_lab_values("FAKE_EMPI_648")
    for lab_name, lab_value in latest_values.items():
        print(lab_name + ": " + str(lab_value))