Switch to unified view

a b/tests/featurizers/test_featurizers.py
1
import datetime
2
from typing import Any, List, Mapping, cast
3
4
import femr_test_tools
5
import meds
6
import scipy.sparse
7
8
import femr
9
import femr.index
10
from femr.featurizers import FeaturizerList
11
from femr.featurizers.featurizers import AgeFeaturizer, CountFeaturizer
12
from femr.labelers import TimeHorizon
13
from femr.labelers.omop import CodeLabeler
14
15
16
def _assert_featurized_patients_structure(labels: List[meds.Label], features: Mapping[str, Any]):
    """Assert that a featurization result is well-formed and lines up with ``labels``.

    Args:
        labels: The labels that were featurized. Each label is read through its
            ``"patient_id"`` and ``"prediction_time"`` keys.
        features: Mapping with the keys ``"features"``, ``"patient_ids"`` and
            ``"feature_times"``. Each value must expose ``dtype`` and ``shape``;
            the latter two must also be iterable.

    Raises:
        AssertionError: If a dtype is unexpected, if any entry does not have one
            row per label, or if the patient ids / times do not match the labels
            as multisets (order is ignored).
    """
    num_labels = len(labels)

    # Fixed dtypes expected from the featurization output.
    assert features["features"].dtype == "float32"
    assert features["patient_ids"].dtype == "int64"
    assert features["feature_times"].dtype == "datetime64[us]"

    # Exactly one row per label in every output entry.
    for key in ("feature_times", "patient_ids", "features"):
        num_rows = features[key].shape[0]
        assert num_rows == num_labels, f"features[{key!r}] has {num_rows} rows, expected {num_labels}"

    # Row order is not part of the contract, so compare sorted sequences.
    # sorted() accepts any iterable directly; no intermediate list is needed.
    assert sorted(features["patient_ids"]) == sorted(label["patient_id"] for label in labels)
    assert sorted(features["feature_times"]) == sorted(label["prediction_time"] for label in labels)
27
28
29
def test_age_featurizer() -> None:
    """Exercise ``AgeFeaturizer`` on one patient and then on the whole dataset.

    The first half featurizes ``dataset[0]`` without normalization and pins the
    first, second and last per-label outputs. The second half runs a normalized
    ``AgeFeaturizer`` through ``FeaturizerList`` and checks only the structure of
    the result.
    """
    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    # --- single patient, raw (un-normalized) values ---
    first_patient: meds.Patient = dataset[0]
    first_patient_labels = labeler.label(first_patient)
    raw_age_featurizer = AgeFeaturizer(is_normalize=False)
    per_label_features = raw_age_featurizer.featurize(first_patient, first_patient_labels)

    # Position in the per-label output -> expected value stored in column 0.
    expected_values = {
        0: 15.43013698630137,
        1: 17.767123287671232,
        -1: 20.46027397260274,
    }
    for position, expected in expected_values.items():
        assert per_label_features[position] == [(0, expected)]

    # --- whole dataset, normalized, via FeaturizerList ---
    dataset_labels = labeler.apply(dataset)

    pipeline = FeaturizerList([AgeFeaturizer(is_normalize=True)])
    pipeline.preprocess_featurizers(dataset, index, dataset_labels)
    dataset_features = pipeline.featurize(dataset, index, dataset_labels)

    _assert_featurized_patients_structure(dataset_labels, dataset_features)
54
55
56
def test_count_featurizer() -> None:
    """Exercise a default ``CountFeaturizer`` on one patient and on the whole dataset.

    For ``dataset[0]`` the featurizer is preprocessed by hand, the column count
    is pinned to 4 and the first three per-label outputs are compared, as sets
    of ``(column name, value)`` pairs, with fixed expectations. The dataset-wide
    run goes through ``FeaturizerList`` and is only checked structurally.
    """
    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    # --- single patient, manual preprocessing ---
    first_patient: meds.Patient = dataset[0]
    first_patient_labels = labeler.label(first_patient)
    featurizer = CountFeaturizer()
    preprocessed = featurizer.generate_preprocess_data(
        [first_patient], {first_patient["patient_id"]: first_patient_labels}
    )
    featurizer.encorperate_prepreprocessed_data([preprocessed])

    per_label_features = featurizer.featurize(first_patient, first_patient_labels)

    assert featurizer.get_num_columns() == 4, f"featurizer.get_num_columns() = {featurizer.get_num_columns()}"

    # Translate column indices to names so expectations are readable and
    # independent of column ordering.
    named_features = [
        {(featurizer.get_column_name(entry.column), entry.value) for entry in label_features}
        for label_features in per_label_features
    ]

    expected_by_label = [
        {("SNOMED/184099003", 1), ("3", 1)},
        {("SNOMED/184099003", 1), ("3", 2), ("2", 2)},
        {("SNOMED/184099003", 1), ("3", 3), ("2", 4)},
    ]
    for position, expected in enumerate(expected_by_label):
        assert named_features[position] == expected

    # --- whole dataset via FeaturizerList ---
    dataset_labels = labeler.apply(dataset)

    pipeline = FeaturizerList([CountFeaturizer()])
    pipeline.preprocess_featurizers(dataset, index, dataset_labels)
    dataset_features = pipeline.featurize(dataset, index, dataset_labels)

    _assert_featurized_patients_structure(dataset_labels, dataset_features)
99
100
101
def test_count_featurizer_with_ontology() -> None:
    """Exercise ``CountFeaturizer`` with ontology expansion enabled.

    A stub ontology maps the codes ``"2"`` and ``"SNOMED/184099003"`` to an
    extra ``"parent"`` code. For ``dataset[0]`` the column count is pinned to 5
    and the first three per-label outputs are compared, as sets of
    ``(column name, value)`` pairs, with fixed expectations. The dataset-wide
    run goes through ``FeaturizerList`` and is only checked structurally.
    """
    # ``femr.ontology`` is referenced below for the cast. Importing a package
    # does not import its submodules, so import it explicitly instead of
    # relying on some other import having loaded it as a side effect.
    import femr.ontology

    time_horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], time_horizon, ["3"])

    patient: meds.Patient = dataset[0]
    labels = labeler.label(patient)

    class DummyOntology:
        """Stub exposing only ``get_all_parents``, the one method defined for this test."""

        def get_all_parents(self, code):
            # Two codes share the synthetic "parent"; every other code maps
            # only to itself.
            if code in ("2", "SNOMED/184099003"):
                return {"parent", code}
            else:
                return {code}

    # The stub is not a real Ontology, so cast it to satisfy the annotation.
    featurizer = CountFeaturizer(is_ontology_expansion=True, ontology=cast(femr.ontology.Ontology, DummyOntology()))
    data = featurizer.generate_preprocess_data([patient], {patient["patient_id"]: labels})
    featurizer.encorperate_prepreprocessed_data([data])

    patient_features = featurizer.featurize(patient, labels)

    assert featurizer.get_num_columns() == 5, f"featurizer.get_num_columns() = {featurizer.get_num_columns()}"

    # Translate column indices to names so expectations do not depend on
    # column ordering.
    simple_patient_features = [{(featurizer.get_column_name(v.column), v.value) for v in a} for a in patient_features]

    assert simple_patient_features[0] == {
        ("SNOMED/184099003", 1),
        ("3", 1),
        ("parent", 1),
    }
    assert simple_patient_features[1] == {
        ("SNOMED/184099003", 1),
        ("3", 2),
        ("2", 2),
        ("parent", 3),
    }
    assert simple_patient_features[2] == {
        ("SNOMED/184099003", 1),
        ("parent", 5),
        ("3", 3),
        ("2", 4),
    }

    all_labels = labeler.apply(dataset)

    featurizer = CountFeaturizer(is_ontology_expansion=True, ontology=cast(femr.ontology.Ontology, DummyOntology()))
    featurizer_list = FeaturizerList([featurizer])
    featurizer_list.preprocess_featurizers(dataset, index, all_labels)
    featurized_patients = featurizer_list.featurize(dataset, index, all_labels)

    _assert_featurized_patients_structure(all_labels, featurized_patients)
155
156
157
def test_count_featurizer_with_values() -> None:
    """Exercise ``CountFeaturizer`` with ``numeric_value_decile`` and ``string_value_combination``.

    For ``dataset[0]`` the column count is pinned to 7 and the first three
    per-label outputs are compared, as sets of ``(column name, value)`` pairs,
    with fixed expectations. The dataset-wide run goes through
    ``FeaturizerList`` and is only checked structurally.
    """
    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    # --- single patient, manual preprocessing ---
    first_patient: meds.Patient = dataset[0]
    first_patient_labels = labeler.label(first_patient)
    featurizer = CountFeaturizer(numeric_value_decile=True, string_value_combination=True)
    preprocessed = featurizer.generate_preprocess_data(
        [first_patient], {first_patient["patient_id"]: first_patient_labels}
    )
    featurizer.encorperate_prepreprocessed_data([preprocessed])

    per_label_features = featurizer.featurize(first_patient, first_patient_labels)

    assert featurizer.get_num_columns() == 7

    named_features = [
        {(featurizer.get_column_name(entry.column), entry.value) for entry in label_features}
        for label_features in per_label_features
    ]

    # Pairs expected at every one of the three checked labels.
    shared = {
        ("SNOMED/184099003", 1),
        ("2 [1.0, inf)", 1),
        ("1 test_value", 2),
    }
    # Pairs that differ from label to label.
    label_specific = [
        {("3", 1)},
        {("3", 2), ("2", 2)},
        {("3", 3), ("2", 4)},
    ]
    for position, extra in enumerate(label_specific):
        assert named_features[position] == shared | extra

    # --- whole dataset via FeaturizerList ---
    dataset_labels = labeler.apply(dataset)

    pipeline = FeaturizerList([CountFeaturizer(numeric_value_decile=True, string_value_combination=True)])
    pipeline.preprocess_featurizers(dataset, index, dataset_labels)
    dataset_features = pipeline.featurize(dataset, index, dataset_labels)

    _assert_featurized_patients_structure(dataset_labels, dataset_features)
207
208
209
def test_count_featurizer_exclude_filter() -> None:
    """Check the column count of ``CountFeaturizer`` under different ``excluded_event_filter`` values.

    Each scenario builds a fresh featurizer, preprocesses it on ``dataset[0]``
    and compares ``get_num_columns()`` with a fixed expectation.
    """
    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    first_patient: meds.Patient = dataset[0]
    first_patient_labels = labeler.label(first_patient)

    # (excluded_event_filter, expected number of columns)
    scenarios = [
        # Test filtering all codes
        (lambda _: True, 0),
        # Test filtering no codes
        (lambda _: False, 4),
        # Test filtering single code
        (lambda e: e["code"] == "3", 3),
    ]

    for event_filter, expected_columns in scenarios:
        featurizer = CountFeaturizer(excluded_event_filter=event_filter)
        preprocessed = featurizer.generate_preprocess_data(
            [first_patient], {first_patient["patient_id"]: first_patient_labels}
        )
        featurizer.encorperate_prepreprocessed_data([preprocessed])

        assert featurizer.get_num_columns() == expected_columns
239
240
241
def test_count_bins_featurizer() -> None:
    """Exercise ``CountFeaturizer`` configured with three ``time_bins``.

    For ``dataset[0]`` the column count is pinned to 12 and the first three
    per-label outputs are compared, as sets of ``(column name, value)`` pairs,
    with fixed expectations. The dataset-wide run goes through
    ``FeaturizerList`` and is only checked structurally.
    """

    def make_time_bins():
        # A fresh list per featurizer, so the two featurizers never share one.
        return [
            datetime.timedelta(days=90),
            datetime.timedelta(days=180),
            datetime.timedelta(weeks=1e4),
        ]

    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    # --- single patient, manual preprocessing ---
    first_patient: meds.Patient = dataset[0]
    first_patient_labels = labeler.label(first_patient)
    featurizer = CountFeaturizer(time_bins=make_time_bins())
    preprocessed = featurizer.generate_preprocess_data(
        [first_patient], {first_patient["patient_id"]: first_patient_labels}
    )
    featurizer.encorperate_prepreprocessed_data([preprocessed])

    per_label_features = featurizer.featurize(first_patient, first_patient_labels)

    assert featurizer.get_num_columns() == 12

    named_features = [
        {(featurizer.get_column_name(entry.column), entry.value) for entry in label_features}
        for label_features in per_label_features
    ]

    expected_by_label = [
        {
            ("SNOMED/184099003_70000 days, 0:00:00", 1),
            ("3_90 days, 0:00:00", 1),
        },
        {
            ("3_90 days, 0:00:00", 1),
            ("SNOMED/184099003_70000 days, 0:00:00", 1),
            ("3_70000 days, 0:00:00", 1),
            ("2_70000 days, 0:00:00", 2),
        },
        {
            ("2_70000 days, 0:00:00", 2),
            ("2_90 days, 0:00:00", 2),
            ("SNOMED/184099003_70000 days, 0:00:00", 1),
            ("3_90 days, 0:00:00", 1),
            ("3_70000 days, 0:00:00", 2),
        },
    ]
    for position, expected in enumerate(expected_by_label):
        assert named_features[position] == expected

    # --- whole dataset via FeaturizerList ---
    dataset_labels = labeler.apply(dataset)

    pipeline = FeaturizerList([CountFeaturizer(time_bins=make_time_bins())])
    pipeline.preprocess_featurizers(dataset, index, dataset_labels)
    dataset_features = pipeline.featurize(dataset, index, dataset_labels)

    _assert_featurized_patients_structure(dataset_labels, dataset_features)
301
302
303
def test_complete_featurization() -> None:
    """Check that featurizing with age and count together matches the two done separately.

    An ``AgeFeaturizer`` and a time-binned ``CountFeaturizer`` are each run on
    their own, then fresh instances are run together in one ``FeaturizerList``.
    The combined feature matrix must equal the horizontal stack of the two
    separate matrices (age columns first), element for element.
    """
    horizon = TimeHorizon(datetime.timedelta(days=0), datetime.timedelta(days=180))

    dataset = femr_test_tools.create_patients_dataset(100)
    index = femr.index.PatientIndex(dataset)

    labeler = CodeLabeler(["2"], horizon, ["3"])

    dataset_labels = labeler.apply(dataset)

    def new_count_featurizer():
        # Fresh bins and a fresh featurizer each time, so the separate and
        # combined runs share no objects.
        bins = [
            datetime.timedelta(days=90),
            datetime.timedelta(days=180),
            datetime.timedelta(weeks=1e5),
        ]
        return CountFeaturizer(time_bins=bins)

    def run_pipeline(featurizers):
        # Preprocess, then featurize, the whole dataset with these featurizers.
        pipeline = FeaturizerList(featurizers)
        pipeline.preprocess_featurizers(dataset, index, dataset_labels)
        return pipeline.featurize(dataset, index, dataset_labels)

    age_only = run_pipeline([AgeFeaturizer(is_normalize=True)])
    count_only = run_pipeline([new_count_featurizer()])

    # Same construction order as the separate runs: age first, then count.
    combined_age = AgeFeaturizer(is_normalize=True)
    combined_count = new_count_featurizer()
    combined = run_pipeline([combined_age, combined_count])

    assert combined["patient_ids"].shape == count_only["patient_ids"].shape

    combined_dense = combined["features"].toarray()
    stacked_dense = scipy.sparse.hstack((age_only["features"], count_only["features"])).toarray()

    assert (combined_dense == stacked_dense).all()