# tests/matchers/test_regex.py
1
import re
2
3
import pytest
4
from helpers import make_nlp
5
from pytest import mark
6
7
from edsnlp.matchers.regex import RegexMatcher, create_span
8
from edsnlp.matchers.utils import get_text
9
10
11
def test_regex(doc):
    """Basic ``RegexMatcher`` workflow: add/remove a key, then match on a Doc
    and on a Span slice of it."""
    matcher = RegexMatcher()

    # Adding then removing a key should leave the matcher usable.
    matcher.add("test", [r"test"])
    matcher.remove("test")

    matcher.add("patient", [r"patient"])

    # Fix: the first ``matcher(doc, as_spans=False)`` result was discarded and
    # the matcher was run a second time in the loop — iterate it directly.
    matches = matcher(doc, as_spans=False)

    for _, start, end in matches:
        # Each (start, end) token offset pair must denote a non-empty slice.
        assert len(doc[start:end])

    # Matching should also work on a Span, not only a full Doc.
    matches = matcher(doc[:10])

    assert list(matches)
@mark.parametrize(
    "pattern, txt, span_from_group, result",
    [
        (
            r"match1 (?:group1|(group2))",  # pattern
            "It is a match1 group1",  # txt
            True,  # span_from_group
            "match1 group1",  # result
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group1",
            False,
            "match1 group1",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            True,
            "group2",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            False,
            "match1 group2",
        ),
    ],
)
def test_regex_with_groups(blank_nlp, pattern, txt, span_from_group, result):
    """When ``span_from_group`` is set, a matched capturing group (if any)
    defines the returned span instead of the whole match."""
    matcher = RegexMatcher(span_from_group=span_from_group)
    matcher.add("test", [pattern])

    spans = list(matcher(blank_nlp(txt), as_spans=True))
    assert spans[0].text == result
64
65
66
def test_regex_with_norm(blank_nlp):
    """With ``ignore_excluded=True`` the matcher bridges pollution tokens, and
    the span keeps a normalized variant without them."""
    blank_nlp.add_pipe("pollution")

    text = "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"

    matcher = RegexMatcher(ignore_excluded=True)
    matcher.add("test", ["pneumopathie à coronavirus"])

    spans = list(matcher(blank_nlp(text), as_spans=True))
    first = spans[0]

    # The span covers the raw text, pollution included...
    assert first.text == text
    # ...while its normalized variant drops the excluded tokens.
    assert first._.normalized_variant == "pneumopathie à coronavirus"
79
80
81
def test_regex_with_norm_on_span(blank_nlp):
    """Same as ``test_regex_with_norm`` but matching on Span slices, with and
    without a leading offset."""
    blank_nlp.add_pipe("pollution")

    text = (
        "le patient a une pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB"
        " coronavirus"
    )

    for offset in (0, 2):
        span = blank_nlp(text)[offset:]

        matcher = RegexMatcher(ignore_excluded=True)
        matcher.add("test", ["pneumopathie à coronavirus"])

        found = next(iter(matcher(span, as_spans=True)))
        assert (
            found.text
            == "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"
        )
        assert found._.normalized_variant == "pneumopathie à coronavirus"
101
102
103
def test_offset(blank_nlp):
    """Token offsets and spans returned by the matcher must map back to the
    matched text, both on the full Doc and on a Span with a shifted start."""
    text = "Ceci est un test de matching"

    doc = blank_nlp(text)
    pattern = "matching"

    matcher = RegexMatcher(attr="TEXT")

    matcher.add("test", [pattern])

    for _, start, end in matcher(doc):
        assert doc[start:end].text == pattern

    # Fix: these two comparisons were missing ``assert`` and silently did
    # nothing — the test passed regardless of the span text.
    for span in matcher(doc, as_spans=True):
        assert span.text == pattern

    # Offsets returned for a sliced doc are relative to the slice.
    for _, start, end in matcher(doc[2:]):
        assert doc[2:][start:end].text == pattern

    for span in matcher(doc[2:], as_spans=True):
        assert span.text == pattern
124
125
126
def test_remove():
    """Removing a key drops every pattern registered under it; removing an
    unknown key raises ``ValueError``."""
    matcher = RegexMatcher(attr="TEXT")

    # Two adds under the same key still count as a single key.
    matcher.add("test", ["pattern"])
    matcher.add("test", ["pattern2"], attr="LOWER")

    assert len(matcher) == 1

    with pytest.raises(ValueError):
        matcher.remove("wrong_key")

    matcher.remove("test")

    assert not len(matcher)
140
141
142
def test_norm_alignment(blank_nlp):
    """Matches found on the NORM attribute must align back onto the original
    text, even when normalization changes character offsets (ellipses)."""
    blank_nlp.add_pipe(
        "matcher", config=dict(regex=dict(test=r"\btest\b"), attr="NORM")
    )

    text = "test " + "bla… " * 4 + "test " + "bla" * 10
    doc = blank_nlp(text)

    assert all(ent.text == "test" for ent in doc.ents)
153
154
155
@mark.parametrize(
    "leading_text",
    [
        "",
        "\n",
        "Test de non-pollution",
    ],
)
@mark.parametrize("leading_pollution", [True, False])
@mark.parametrize("pollution_within", [True, False])
@mark.parametrize("trailing_pollution", [True, False])
@mark.parametrize(
    "pollution",
    ["==================", "======= ======= =======", "Nnnnnnnnnnnnn nnnnnn nnnnnnnn"],
)
def test_get_text(  # Fix: was named ``text_get_text`` — pytest never collected it.
    blank_nlp,
    leading_text: str,
    leading_pollution: bool,
    pollution_within: bool,
    trailing_pollution: bool,
    pollution: str,
):
    """``get_text`` with ``ignore_excluded=True`` must strip pollution wherever
    it appears (leading, trailing, or inside the matched expression), and the
    matcher must still find the entity across it."""
    if pollution_within:
        example = f"transplantation {pollution} cardiaque en 2000."
    else:
        example = "transplantation cardiaque en 2000."

    # Assemble the document from optional leading text / pollution chunks.
    chunks = []

    if leading_text:
        chunks.append(leading_text)
    if leading_pollution:
        chunks.append(pollution)

    chunks.append(example)

    if trailing_pollution:
        chunks.append(pollution)

    text = " ".join(chunks)

    blank_nlp.add_pipe("eds.normalizer", config=dict(pollution=True))
    blank_nlp.add_pipe(
        "eds.matcher",
        config=dict(
            regex=dict(test="transplantation cardiaque"),
            attr="NORM",
            ignore_excluded=True,
        ),
    )
    doc = blank_nlp(text)

    # The cleaned text keeps only the (lower-cased) leading text and example.
    clean = get_text(doc, attr="NORM", ignore_excluded=True)
    if leading_text:
        assert clean == f"{leading_text.lower()} transplantation cardiaque en 2000."
    else:
        assert clean == "transplantation cardiaque en 2000."

    assert doc.ents
    assert doc.ents[0][0].text == "transplantation"

    # Cleaning the entity span itself also skips any pollution inside it.
    clean = get_text(doc.ents[0], attr="NORM", ignore_excluded=True)
    assert clean == "transplantation cardiaque"
219
220
221
def test_groupdict_as_spans(doc):
    """``match_with_groupdict_as_spans`` yields (span, groupdict) pairs where
    named groups are themselves returned as spans."""
    matcher = RegexMatcher()

    matcher.add("test", [r"patient(?i:(?=.*(?P<cause>douleurs))?)"])

    results = list(matcher.match_with_groupdict_as_spans(doc))
    assert len(results) == 2

    (first_span, first_groups), (second_span, second_groups) = results
    assert first_span.text == "patient"
    assert second_span.text == "patient"
    # Only the first occurrence has a "douleurs" lookahead match.
    assert len(first_groups) == 1
    assert first_groups["cause"].text == "douleurs"
    assert not second_groups
231
232
233
def test_regex_with_space(blank_nlp):
    """Runs of space tokens are only bridged when ``ignore_space_tokens`` is
    enabled."""
    blank_nlp.add_pipe("eds.spaces")

    text = "pneumopathie à      coronavirus"

    doc = blank_nlp(text)

    # Without the option, the extra spaces break the match.
    strict = RegexMatcher(ignore_space_tokens=False)
    strict.add("test", ["pneumopathie à coronavirus"])

    assert not list(strict(doc, as_spans=True))

    # With it, the match spans the raw text and normalizes the spaces away.
    lenient = RegexMatcher(ignore_space_tokens=True)
    lenient.add("test", ["pneumopathie à coronavirus"])

    found = next(iter(lenient(doc, as_spans=True)))
    assert found.text == text
    assert found._.normalized_variant == "pneumopathie à coronavirus"
251
252
253
@pytest.fixture(scope="session")
def doc2(lang):
    """Session-scoped document mixing pollution, ellipses, spaces and a value
    split across a line break, for ``create_span`` alignment tests."""
    nlp = make_nlp(lang)
    for pipe in ("eds.pollution", "eds.spaces"):
        nlp.add_pipe(pipe)

    return nlp(
        "-----------------------------------------------------------------------\n"
        "La ………… valeur est NBNbWbWbNbWbNBNb de 24 / 30 milli\n"
        "grammes."
    )
268
269
270
@mark.parametrize("ignore_excluded", [True, False])
@mark.parametrize("ignore_space_tokens", [True, False])
@mark.parametrize("attr", ["TEXT", "NORM"])
@mark.parametrize("full_doc", [True, False])
def test_create_span(
    doc2,
    ignore_excluded: bool,
    ignore_space_tokens: bool,
    attr: str,
    full_doc: bool,
):
    """Table-driven check of ``create_span``: character offsets computed on the
    cleaned (``get_text``) text must map back to the expected original-text
    span under each ``alignment_mode`` ("expand" / "strict" / "contract"),
    for every combination of attr / exclusion / space handling, and for both
    a full Doc and a single sentence.
    """
    # Second sentence holds the "24 / 30 milli\ngrammes" value.
    sent = list(doc2.sents)[1]
    doclike = doc2 if full_doc else sent

    # Cleaned text in which the regex offsets below are computed.
    matched_text = get_text(
        doclike,
        attr=attr,
        ignore_excluded=ignore_excluded,
        ignore_space_tokens=ignore_space_tokens,
    )
    # Tokens that survive the same exclusion/space filtering — used to build
    # the expected original-text span for the whole-text pattern below.
    # NOTE(review): assumes excluded tokens are tagged "EXCLUDED" and space
    # tokens "SPACE" by the pollution/spaces pipes — confirm against pipeline.
    clean_tokens = [
        t
        for t in doclike
        if not (
            (ignore_excluded and t.tag_ == "EXCLUDED")
            or (ignore_space_tokens and t.tag_ == "SPACE")
        )
    ]
    filtered_original = doc2[clean_tokens[0].i : clean_tokens[-1].i + 1].text
    # (pattern searched in matched_text, expected span text or None, mode)
    for pattern, result, alignment_mode in [
        (r"4 / 3", "24 / 30", "expand"),
        (r"4 / 3", None, "strict"),
        (r"4 / 3", "/", "contract"),
        (r"24 / 30", "24 / 30", "expand"),
        (r"24 / 30", "24 / 30", "strict"),
        (r"24 / 30", "24 / 30", "contract"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\ngrammes", "expand"),
        (r"24 / 30 milli\s?gra", None, "strict"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\n", "contract"),
        (r" 24 / 30 ", "24 / 30", "expand"),
        (r" 24 / 30 ", None, "strict"),
        (r" 24 / 30 ", "24 / 30", "contract"),
        (matched_text, filtered_original, "expand"),
        (matched_text, filtered_original, "contract"),
        (matched_text, filtered_original, "strict"),
        # Zero-width (lookahead-only) matches:
        ("(?=4 / 3)", "24", "expand"),
        ("(?=4 / 3)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=4 / 3)", None, "strict"),
        ("(?=24)", "", "expand"),
        ("(?=24)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=24)", None, "strict"),
    ]:
        match = re.search(pattern, matched_text)
        span = create_span(
            doclike,
            start_char=match.start(),
            end_char=match.end(),
            key="value",
            attr=attr,
            alignment_mode=alignment_mode,
            ignore_excluded=ignore_excluded,
            ignore_space_tokens=ignore_space_tokens,
        )
        # Include the case parameters in the failure message for debugging.
        assert (None if span is None else span.text) == result, (
            pattern,
            result,
            alignment_mode,
        )
338
339
340
def test_create_empty_span(blank_nlp):
    """A zero-width character range in "expand" mode yields an empty span
    anchored at the correct token offset."""
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")

    doc = blank_nlp("plan des addictions:\ntabac :0")

    span = create_span(
        doc[5:],
        start_char=0,
        end_char=0,
        key="empty",
        attr="NORM",
        alignment_mode="expand",
        ignore_excluded=True,
        ignore_space_tokens=True,
    )

    assert (span.start, span.end) == (5, 5)
356
357
358
def test_empty_get_text(blank_nlp):
    """A document made entirely of pollution normalizes to an empty string."""
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")

    doc = blank_nlp("==================================")

    cleaned = get_text(
        doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True
    )
    assert cleaned == ""