"""Some functions for creating lookup structures from raw items."""

import docdeid as dd
from docdeid import Tokenizer

from deduce.str import FilterBasedOnLookupSet, TitleCase, UpperCase, UpperCaseFirstChar
from deduce.utils import lookup_set_to_trie

def load_common_word_lookup(raw_itemsets: dict[str, set[str]]) -> dd.ds.LookupSet:
    """Build the common_word LookupSet, excluding lowercased surnames."""

    words = dd.ds.LookupSet()
    words.add_items_from_iterable(raw_itemsets["common_word"])

    lowercased_surnames = dd.ds.LookupSet()
    lowercased_surnames.add_items_from_iterable(
        raw_itemsets["surname"],
        cleaning_pipeline=[
            dd.str.LowercaseString(),
            dd.str.FilterByLength(min_len=2),
        ],
    )

    # Drop common words that are also (lowercased) surnames, so surname
    # detection is not suppressed by the common-word list.
    words -= lowercased_surnames

    return words
def load_whitelist_lookup(raw_itemsets: dict[str, set[str]]) -> dd.ds.LookupSet:
    """
    Build the whitelist LookupSet.

    Composed of medical terms, top 1000 frequent words (except surnames), and
    stopwords; entries shorter than two characters are dropped.
    """
    medical_terms = dd.ds.LookupSet()
    medical_terms.add_items_from_iterable(raw_itemsets["medical_term"])

    common_words = load_common_word_lookup(raw_itemsets)

    stop_words = dd.ds.LookupSet()
    stop_words.add_items_from_iterable(raw_itemsets["stop_word"])

    # Lowercasing in the matching pipeline makes whitelist lookups
    # case-insensitive.
    whitelist = dd.ds.LookupSet(matching_pipeline=[dd.str.LowercaseString()])
    whitelist.add_items_from_iterable(
        medical_terms + common_words + stop_words,
        cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
    )

    return whitelist
def load_eponymous_disease_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the eponymous-disease LookupTrie (e.g. Henoch-Schonlein)."""
    diseases = dd.ds.LookupSet()
    diseases.add_items_from_iterable(raw_itemsets["eponymous_disease"])
    # Also index ASCII-folded variants (e.g. "Schönlein" -> "Schonlein").
    diseases.add_items_from_self(
        cleaning_pipeline=[dd.str.ReplaceNonAsciiCharacters()]
    )

    return lookup_set_to_trie(diseases, tokenizer)
def load_prefix_lookup(raw_itemsets: dict[str, set[str]]) -> dd.ds.LookupSet:
    """Build the prefix LookupSet (e.g. 'dr', 'mw')."""

    prefixes = dd.ds.LookupSet()
    prefixes.add_items_from_iterable(raw_itemsets["prefix"])
    # Also match capitalized variants ('Dr', 'Mw').
    prefixes.add_items_from_self(cleaning_pipeline=[UpperCaseFirstChar()])

    return prefixes
def load_first_name_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the first_name LookupTrie."""

    names = dd.ds.LookupSet()
    names.add_items_from_iterable(
        raw_itemsets["first_name"],
        cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
    )

    # Keep only first names that are not whitelisted words
    # (case-insensitive), to avoid flagging ordinary vocabulary.
    whitelist_filter = FilterBasedOnLookupSet(
        filter_set=load_whitelist_lookup(raw_itemsets), case_sensitive=False
    )
    names.add_items_from_self(cleaning_pipeline=[whitelist_filter], replace=True)

    return lookup_set_to_trie(names, tokenizer)
def load_interfix_lookup(raw_itemsets: dict[str, set[str]]) -> dd.ds.LookupSet:
    """Build the interfix LookupSet ('van der', etc.)."""

    interfixes = dd.ds.LookupSet()
    interfixes.add_items_from_iterable(raw_itemsets["interfix"])
    # Add capitalized ('Van der') and title-cased ('Van Der') variants.
    interfixes.add_items_from_self(cleaning_pipeline=[UpperCaseFirstChar()])
    interfixes.add_items_from_self(cleaning_pipeline=[TitleCase()])
    # 'V.' is explicitly excluded (too ambiguous with initials).
    interfixes.remove_items_from_iterable(["V."])

    return interfixes
def load_surname_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the surname LookupTrie."""

    surnames = dd.ds.LookupSet()
    surnames.add_items_from_iterable(
        raw_itemsets["surname"],
        cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
    )

    # Keep only surnames that are not whitelisted words (case-insensitive),
    # to avoid flagging ordinary vocabulary.
    whitelist_filter = FilterBasedOnLookupSet(
        filter_set=load_whitelist_lookup(raw_itemsets), case_sensitive=False
    )
    surnames.add_items_from_self(cleaning_pipeline=[whitelist_filter], replace=True)

    return lookup_set_to_trie(surnames, tokenizer)
def load_street_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the street LookupTrie."""

    streets = dd.ds.LookupSet()
    streets.add_items_from_iterable(
        raw_itemsets["street"],
        cleaning_pipeline=[
            dd.str.StripString(),
            dd.str.FilterByLength(min_len=4),
        ],
    )
    # Also index ASCII-folded variants.
    streets.add_items_from_self(
        cleaning_pipeline=[dd.str.ReplaceNonAsciiCharacters()]
    )

    return lookup_set_to_trie(streets, tokenizer)
def load_placename_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the placename LookupTrie."""

    placenames = dd.ds.LookupSet()
    placenames.add_items_from_iterable(
        raw_itemsets["placename"],
        cleaning_pipeline=[dd.str.StripString()],
    )

    # Also index ASCII-folded variants.
    placenames.add_items_from_self(
        cleaning_pipeline=[dd.str.ReplaceNonAsciiCharacters()]
    )

    # Variants without parentheses and with double spaces collapsed.
    placenames.add_items_from_self(
        cleaning_pipeline=[
            dd.str.ReplaceValue("(", ""),
            dd.str.ReplaceValue(")", ""),
            dd.str.ReplaceValue("  ", " "),
        ]
    )

    # Fully upper-cased variants.
    placenames.add_items_from_self(cleaning_pipeline=[UpperCase()])

    # Keep only placenames that are not whitelisted words (case-insensitive).
    whitelist_filter = FilterBasedOnLookupSet(
        filter_set=load_whitelist_lookup(raw_itemsets), case_sensitive=False
    )
    placenames.add_items_from_self(
        cleaning_pipeline=[whitelist_filter],
        replace=True,
    )

    return lookup_set_to_trie(placenames, tokenizer)
def load_hospital_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """
    Load hospital LookupTrie.

    Combines full hospital names and their abbreviations; matching is
    case-insensitive via a lowercasing matching pipeline.
    """
    # Fixed docstring typo: "hopsital" -> "hospital".
    hospital = dd.ds.LookupSet(matching_pipeline=[dd.str.LowercaseString()])

    hospital.add_items_from_iterable(raw_itemsets["hospital"])
    hospital.add_items_from_iterable(raw_itemsets["hospital_abbr"])

    # Also index ASCII-folded variants (names containing diacritics).
    hospital.add_items_from_self(
        cleaning_pipeline=[dd.str.ReplaceNonAsciiCharacters()],
    )

    return lookup_set_to_trie(hospital, tokenizer)
def load_institution_lookup(
    raw_itemsets: dict[str, set[str]], tokenizer: Tokenizer
) -> dd.ds.LookupTrie:
    """Build the healthcare-institution LookupTrie."""

    institutions = dd.ds.LookupSet()
    institutions.add_items_from_iterable(
        raw_itemsets["healthcare_institution"],
        cleaning_pipeline=[
            dd.str.StripString(),
            dd.str.FilterByLength(min_len=4),
        ],
    )

    # Fully upper-cased variants.
    institutions.add_items_from_self(cleaning_pipeline=[UpperCase()])
    # ASCII-folded variants.
    institutions.add_items_from_self(
        cleaning_pipeline=[dd.str.ReplaceNonAsciiCharacters()]
    )
    # Drop institutions that coincide with whitelisted words.
    institutions = institutions - load_whitelist_lookup(raw_itemsets)

    return lookup_set_to_trie(institutions, tokenizer)