Diff of /tests/unit/test_utils.py [000000] .. [79668b]

Switch to unified view

a b/tests/unit/test_utils.py
1
from pathlib import Path
2
3
import docdeid as dd
4
import pytest
5
6
from deduce import utils
7
from deduce.annotator import TokenPatternAnnotator
8
9
10
class TestStrMatch:
    """Tests for ``utils.str_match``, with and without ``max_edit_distance``."""

    def test_str_match(self):
        """Equal strings match; the sampled differing strings do not."""
        same = [("a", "a"), ("willem", "willem")]
        differing = [("a", "b"), ("willem", "klaas")]

        for left, right in same:
            assert utils.str_match(left, right)

        for left, right in differing:
            assert not utils.str_match(left, right)

    def test_str_match_fuzzy(self):
        """With ``max_edit_distance=1``, near-identical strings are accepted."""
        accepted = [
            ("a", "a"),
            ("willem", "willem"),
            ("willem", "illem"),  # first letter missing
            ("willem", "qwillem"),  # one extra leading letter
            ("willem", "willme"),  # last two letters swapped
            ("willem", "Willem"),  # differs only in case of first letter
        ]
        rejected = [
            ("a", "abc"),
            ("willem", "wilhelm"),
            ("willem", "klaas"),
        ]

        for left, right in accepted:
            assert utils.str_match(left, right, max_edit_distance=1)

        for left, right in rejected:
            assert not utils.str_match(left, right, max_edit_distance=1)
28
29
30
class TestClassForName:
    """Tests for ``utils.class_for_name``."""

    def test_class_for_name(self):
        """Resolving by module and class name gives the imported class object."""
        resolved = utils.class_for_name(
            module_name="deduce.annotator", class_name="TokenPatternAnnotator"
        )

        assert resolved == TokenPatternAnnotator
38
39
40
class TestInitializeClass:
    """Tests for ``utils.initialize_class`` using ``TokenPatternAnnotator``."""

    def test_initialize_class(self):
        """Values given in ``args`` end up as attributes on the created instance."""
        init_args = {"tag": "_", "pattern": [{"key": "value"}]}

        annotator = utils.initialize_class(
            TokenPatternAnnotator, args=init_args, extras={}
        )

        assert annotator.tag == init_args["tag"]
        assert annotator.pattern == init_args["pattern"]

    def test_initialize_class_with_extras(self):
        """A ``ds`` extra is handed to the instance; an unrelated extra is tolerated.

        NOTE(review): presumably ``unused_argument`` is not a constructor parameter
        of ``TokenPatternAnnotator`` and is dropped by ``initialize_class`` -- this
        test only shows that construction succeeds with it present.
        """
        init_args = {"tag": "_", "pattern": [{"key": "value"}]}
        collection = dd.ds.DsCollection()

        annotator = utils.initialize_class(
            TokenPatternAnnotator,
            args=init_args,
            extras={"ds": collection, "unused_argument": "_"},
        )

        assert annotator.tag == init_args["tag"]
        assert annotator.pattern == init_args["pattern"]
        # Identity, not equality: the very same collection object must be attached.
        assert annotator.ds is collection
72
73
74
class TestOverwriteDict:
    """Tests for ``utils.overwrite_dict``."""

    def test_empty(self):
        """Overwriting an empty dict yields something equal to the added dict."""
        additions = [{}, {"a": 1}, {"a": 1, "b": {}}, {"a": 1, "b": {"c": 2}}]

        for addition in additions:
            merged = utils.overwrite_dict({}, addition)
            assert merged == addition

    def test_nonempty_no_nesting(self):
        """Flat dicts: same value kept, differing value replaced, new key added."""
        unchanged = utils.overwrite_dict({"a": 1}, {"a": 1})
        assert unchanged == {"a": 1}

        replaced = utils.overwrite_dict({"a": 1}, {"a": 2})
        assert replaced == {"a": 2}

        extended = utils.overwrite_dict({"a": 1}, {"b": 2})
        assert extended == {"a": 1, "b": 2}

    def test_nonempty_with_nesting(self):
        """Nested dicts are merged key by key rather than replaced wholesale."""
        # A fresh base literal is used for each call, in case the base is mutated.
        nested_replaced = utils.overwrite_dict(
            {"a": 1, "b": {"c": 2}}, {"b": {"c": 4}}
        )
        assert nested_replaced == {"a": 1, "b": {"c": 4}}

        nested_extended = utils.overwrite_dict(
            {"a": 1, "b": {"c": 2}}, {"b": {"d": 4}}
        )
        assert nested_extended == {"a": 1, "b": {"c": 2, "d": 4}}
93
94
95
class TestHasOverlap:
    """Tests for ``utils.has_overlap`` on lists of ``(start, end)`` pairs."""

    def test_has_overlap(self):
        """Only genuinely intersecting spans count; touching endpoints do not."""
        cases = [
            ([], False),
            ([(0, 10)], False),
            ([(0, 10), (5, 15)], True),
            ([(0, 10), (10, 15)], False),  # end of one equals start of the next
            ([(0, 10), (15, 25)], False),
            ([(15, 25), (0, 10)], False),  # same spans, given out of order
        ]

        for spans, expected in cases:
            # Compare on truthiness, as a bare ``assert`` / ``assert not`` would.
            assert bool(utils.has_overlap(spans)) is expected
104
105
106
class TestStrVariations:
    """Tests for ``utils.str_variations`` and ``utils.apply_transform``.

    Also exercises ``utils.has_overlap`` and ``utils.repl_segments``.
    NOTE(review): these are presumably helpers of ``str_variations`` -- confirm
    against ``deduce.utils``.
    """

    def test_has_overlap(self):
        """Overlap detection, including tuples that carry a third element."""
        overlapping = [
            [(0, 10), (5, 14)],
            [(0, 10), (9, 15)],
            [(9, 15), (5, 10)],
            [(9, 15, True), (5, 10, False)],
        ]
        non_overlapping = [
            [(0, 10), (10, 13)],
            [(0, 10), (10, 10)],
            [(0, 10, True), (10, 10, False)],
        ]

        for spans in overlapping:
            assert utils.has_overlap(spans)

        for spans in non_overlapping:
            assert not utils.has_overlap(spans)

    def test_repl_none(self):
        """With no matches, the whole string comes back as a single segment."""
        text = "Prof. Lieflantlaan"

        assert utils.repl_segments(text, []) == [["Prof. Lieflantlaan"]]

    def test_repl_segments_single_to_single(self):
        """One match with one replacement splits the string into two segments."""
        text = "Prof. Lieflantlaan"
        found = [(0, 5, ["Prof."])]

        result = utils.repl_segments(text, found)

        assert result == [["Prof."], [" Lieflantlaan"]]

    def test_repl_segments_single_to_multiple(self):
        """One match with two replacements keeps both options in its segment."""
        text = "Prof. Lieflantlaan"
        found = [(0, 5, ["Prof.", "Professor"])]

        result = utils.repl_segments(text, found)

        assert result == [["Prof.", "Professor"], [" Lieflantlaan"]]

    def test_repl_segments_multiple_to_multiple(self):
        """Two matches give three segments, the unmatched middle in between."""
        text = "Prof. Lieflantlaan"
        found = [(0, 5, ["Prof.", "Professor"]), (14, 18, ["laan", "ln"])]

        result = utils.repl_segments(text, found)

        assert result == [["Prof.", "Professor"], [" Lieflant"], ["laan", "ln"]]

    def test_str_variations_no_matches(self):
        """An empty replacement mapping yields only the input string."""
        text = "Prof. Lieflantlaan"

        result = utils.str_variations(text, {})

        assert result == [text]

    def test_str_variations_overlap(self):
        """Replacement keys that match overlapping spans raise ``RuntimeError``."""
        text = "Prof. Lieflantlaan"
        # "laan" lies inside "lantlaan", so both keys hit overlapping spans.
        replacements = {"laan": ["laan", "ln"], "lantlaan": ["lantlaan", "lantln"]}

        with pytest.raises(RuntimeError):
            utils.str_variations(text, replacements)

    def test_str_variations_one_match(self):
        """A single matching key yields one variation per replacement option."""
        text = "Prof. Lieflantlaan"
        replacements = {"Prof.": ["Prof.", "Professor"]}

        result = utils.str_variations(text, replacements)

        assert result == ["Prof. Lieflantlaan", "Professor Lieflantlaan"]

    def test_str_variations_multiple_matches(self):
        """Two matching keys yield every combination of their options."""
        text = "Prof. Lieflantlaan"
        replacements = {"Prof.": ["Prof.", "Professor"], "laan": ["laan", "ln"]}

        result = utils.str_variations(text, replacements)

        expected = [
            "Prof. Lieflantlaan",
            "Professor Lieflantlaan",
            "Prof. Lieflantln",
            "Professor Lieflantln",
        ]
        assert result == expected

    def test_str_variations_regexp(self):
        """An anchored key only varies the leading "van", not the one inside."""
        text = "van Bevanstraat"
        replacements = {"^van": ["Van", "van"]}

        result = utils.str_variations(text, replacements)

        assert result == ["Van Bevanstraat", "van Bevanstraat"]

    def test_apply_transform(self):
        """A transform config expands a set of items into all their variations."""
        items = {"Prof. Lieflantlaan"}
        config = {"transforms": {"prefix": {"Prof.": ["Prof.", "Professor"]}}}

        result = utils.apply_transform(items, config)

        assert result == {"Prof. Lieflantlaan", "Professor Lieflantlaan"}

    def test_apply_transform2(self):
        """Replacing "den" by an empty string gives "Burg" without a leading space."""
        items = {"den Burg", "Rotterdam"}
        config = {"transforms": {"name": {"den": ["den", ""]}}}

        result = utils.apply_transform(items, config)

        assert result == {"den Burg", "Burg", "Rotterdam"}

    def test_apply_transform_no_strip_lines(self):
        """With ``strip_lines`` set to False, the leading space in " Burg" is kept."""
        items = {"den Burg", "Rotterdam"}
        config = {"transforms": {"name": {"den": ["den", ""]}}, "strip_lines": False}

        result = utils.apply_transform(items, config)

        assert result == {"den Burg", " Burg", "Rotterdam"}
229
230
231
class TestOptionalLoad:
    """Tests for ``utils.optional_load_items`` and ``utils.optional_load_json``.

    The fixture paths are relative, so they resolve against the current working
    directory of the test run.
    """

    def test_optional_load_items(self):
        """An existing items file is loaded as a set of its entries."""
        items_file = Path("tests/data/lookup/src/lst_test_nested/items.txt")

        loaded = utils.optional_load_items(items_file)

        assert loaded == {"a", "b"}

    def test_optional_load_items_nonexisting(self):
        """A missing items file gives ``None``."""
        missing_file = Path("tests/data/non/existing/file.txt")

        loaded = utils.optional_load_items(missing_file)

        assert loaded is None

    def test_optional_load_json(self):
        """An existing JSON file is loaded into the corresponding dict."""
        json_file = Path("tests/data/small.json")

        loaded = utils.optional_load_json(json_file)

        assert loaded == {"test": True}

    def test_optional_load_json_nonexisting(self):
        """A missing JSON file gives ``None``."""
        missing_file = Path("tests/data/non/existing/file.json")

        loaded = utils.optional_load_json(missing_file)

        assert loaded is None