# tests/matchers/test_regex.py
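# Tests for edsnlp.matchers.regex: RegexMatcher (custom attributes, excluded
# and space tokens, capture groups) and create_span (char-to-token alignment).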

import re

import pytest
from helpers import make_nlp
from pytest import mark

from edsnlp.matchers.regex import RegexMatcher, create_span
from edsnlp.matchers.utils import get_text
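

# Basic lifecycle: add and remove patterns, match a Doc and a Span, and check
# that the returned (start, end) token offsets index back into the Doc.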
def test_regex(doc):
    matcher = RegexMatcher()

    matcher.add("test", [r"test"])
    matcher.remove("test")

    matcher.add("patient", [r"patient"])

    matches = matcher(doc, as_spans=False)

    for _, start, end in matcher(doc, as_spans=False):
        assert len(doc[start:end])

    matches = matcher(doc[:10])

    assert list(matches)
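

# With span_from_group=True, the returned span covers the first capturing group
# when it matched (here, "group2") and falls back to the full match otherwise;
# with False, the span always covers the full match.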
@mark.parametrize(
    "pattern, txt, span_from_group, result",
    [
        (
            r"match1 (?:group1|(group2))",  # pattern
            "It is a match1 group1",  # txt
            True,  # span_from_group
            "match1 group1",  # result
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group1",
            False,
            "match1 group1",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            True,
            "group2",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            False,
            "match1 group2",
        ),
    ],
)
def test_regex_with_groups(blank_nlp, pattern, txt, span_from_group, result):
    doc = blank_nlp(txt)
    matcher = RegexMatcher(span_from_group=span_from_group)
    matcher.add("test", [pattern])
    match = list(matcher(doc, as_spans=True))[0].text
    assert match == result
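

# With ignore_excluded=True, tokens excluded as pollution are invisible to the
# pattern, but the returned span still covers them in the original text, and
# span._.normalized_variant exposes the cleaned string that actually matched.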
def test_regex_with_norm(blank_nlp):
    blank_nlp.add_pipe("pollution")

    text = "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"

    doc = blank_nlp(text)

    matcher = RegexMatcher(ignore_excluded=True)
    matcher.add("test", ["pneumopathie à coronavirus"])

    match = list(matcher(doc, as_spans=True))[0]
    assert match.text == text
    assert match._.normalized_variant == "pneumopathie à coronavirus"
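

# Same check as above, but run on Spans starting at different offsets instead
# of the full Doc.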
def test_regex_with_norm_on_span(blank_nlp):
    blank_nlp.add_pipe("pollution")

    text = (
        "le patient a une pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB"
        " coronavirus"
    )

    for offset in (0, 2):
        doc = blank_nlp(text)[offset:]

        matcher = RegexMatcher(ignore_excluded=True)
        matcher.add("test", ["pneumopathie à coronavirus"])

        match = list(matcher(doc, as_spans=True))[0]
        assert (
            match.text
            == "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"
        )
        assert match._.normalized_variant == "pneumopathie à coronavirus"
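

# Token offsets returned when matching a Span must be relative to that Span,
# not to the underlying Doc.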
def test_offset(blank_nlp):
    text = "Ceci est un test de matching"

    doc = blank_nlp(text)
    pattern = "matching"

    matcher = RegexMatcher(attr="TEXT")

    matcher.add("test", [pattern])

    for _, start, end in matcher(doc):
        assert doc[start:end].text == pattern

    for span in matcher(doc, as_spans=True):
        assert span.text == pattern

    for _, start, end in matcher(doc[2:]):
        assert doc[2:][start:end].text == pattern

    for span in matcher(doc[2:], as_spans=True):
        assert span.text == pattern
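

# All patterns added under one key count as a single entry; removing the key
# deletes them all, and removing an unknown key raises ValueError.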
def test_remove():
    matcher = RegexMatcher(attr="TEXT")

    matcher.add("test", ["pattern"])
    matcher.add("test", ["pattern2"], attr="LOWER")

    assert len(matcher) == 1

    with pytest.raises(ValueError):
        matcher.remove("wrong_key")

    matcher.remove("test")

    assert len(matcher) == 0
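

# When matching on NORM, offsets must be realigned to the original text: each
# "test" entity found on the normalized text must span exactly "test" in TEXT,
# even where normalization (e.g. of "…") changes character lengths.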
def test_norm_alignment(blank_nlp):
    text = "test " + "bla… " * 4 + "test " + "bla" * 10

    blank_nlp.add_pipe(
        "matcher", config=dict(regex=dict(test=r"\btest\b"), attr="NORM")
    )

    doc = blank_nlp(text)

    for ent in doc.ents:
        assert ent.text == "test"
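

# get_text with ignore_excluded=True should drop pollution wherever it occurs
# (leading, trailing, or inside the sentence) and, with attr="NORM", lowercase
# the remaining text; entities must still align with the original tokens.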
@mark.parametrize(
    "leading_text",
    [
        "",
        "\n",
        "Test de non-pollution",
    ],
)
@mark.parametrize("leading_pollution", [True, False])
@mark.parametrize("pollution_within", [True, False])
@mark.parametrize("trailing_pollution", [True, False])
@mark.parametrize(
    "pollution",
    ["==================", "======= ======= =======", "Nnnnnnnnnnnnn nnnnnn nnnnnnnn"],
)
def test_get_text(
    blank_nlp,
    leading_text: str,
    leading_pollution: bool,
    pollution_within: bool,
    trailing_pollution: bool,
    pollution: str,
):
    if pollution_within:
        example = f"transplantation {pollution} cardiaque en 2000."
    else:
        example = "transplantation cardiaque en 2000."

    chunks = []

    if leading_text:
        chunks.append(leading_text)
    if leading_pollution:
        chunks.append(pollution)

    chunks.append(example)

    if trailing_pollution:
        chunks.append(pollution)

    text = " ".join(chunks)

    blank_nlp.add_pipe("eds.normalizer", config=dict(pollution=True))
    blank_nlp.add_pipe(
        "eds.matcher",
        config=dict(
            regex=dict(test="transplantation cardiaque"),
            attr="NORM",
            ignore_excluded=True,
        ),
    )
    doc = blank_nlp(text)

    clean = get_text(doc, attr="NORM", ignore_excluded=True)
    if leading_text:
        assert clean == f"{leading_text.lower()} transplantation cardiaque en 2000."
    else:
        assert clean == "transplantation cardiaque en 2000."

    assert doc.ents
    assert doc.ents[0][0].text == "transplantation"

    clean = get_text(doc.ents[0], attr="NORM", ignore_excluded=True)
    assert clean == "transplantation cardiaque"
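

# match_with_groupdict_as_spans yields (span, groupdict) pairs, with each named
# group that matched converted to a Span as well.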
def test_groupdict_as_spans(doc):
    matcher = RegexMatcher()

    matcher.add("test", [r"patient(?i:(?=.*(?P<cause>douleurs))?)"])

    [(span0, gd0), (span1, gd1)] = list(matcher.match_with_groupdict_as_spans(doc))
    assert span0.text == "patient"
    assert span1.text == "patient"
    assert len(gd0) == 1 and gd0["cause"].text == "douleurs"
    assert len(gd1) == 0
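

# ignore_space_tokens controls whether whitespace tokens (tagged by eds.spaces)
# are skipped when matching: with False the extra spaces break the match, with
# True the pattern matches and the span still covers the original text.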
def test_regex_with_space(blank_nlp):
    blank_nlp.add_pipe("eds.spaces")

    # The run of spaces produces whitespace tokens for eds.spaces to tag
    text = "pneumopathie à    coronavirus"

    doc = blank_nlp(text)

    matcher = RegexMatcher(ignore_space_tokens=False)
    matcher.add("test", ["pneumopathie à coronavirus"])

    assert len(list(matcher(doc, as_spans=True))) == 0

    matcher = RegexMatcher(ignore_space_tokens=True)
    matcher.add("test", ["pneumopathie à coronavirus"])

    match = list(matcher(doc, as_spans=True))[0]
    assert match.text == text
    assert match._.normalized_variant == "pneumopathie à coronavirus"
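

# Session-scoped document mixing pollution (dashes, ellipses, NBNb… patterns)
# and a word split across a newline ("milli\ngrammes"), shared by the
# create_span tests below.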
@pytest.fixture(scope="session")
def doc2(lang):
    blank_nlp = make_nlp(lang)
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")

    text = (
        "-----------------------------------------------------------------------\n"
        "La ………… valeur est NBNbWbWbNbWbNBNb de 24 / 30 milli\n"
        "grammes."
    )

    doc = blank_nlp(text)

    return doc
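

# create_span converts character offsets computed on the cleaned text back into
# a token span, following spaCy-style alignment modes ("strict", "contract",
# "expand"), for every attr / excluded-token / space-token combination.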
@mark.parametrize("ignore_excluded", [True, False])
@mark.parametrize("ignore_space_tokens", [True, False])
@mark.parametrize("attr", ["TEXT", "NORM"])
@mark.parametrize("full_doc", [True, False])
def test_create_span(
    doc2,
    ignore_excluded: bool,
    ignore_space_tokens: bool,
    attr: str,
    full_doc: bool,
):
    sent = list(doc2.sents)[1]
    doclike = doc2 if full_doc else sent

    matched_text = get_text(
        doclike,
        attr=attr,
        ignore_excluded=ignore_excluded,
        ignore_space_tokens=ignore_space_tokens,
    )
    clean_tokens = [
        t
        for t in doclike
        if not (
            (ignore_excluded and t.tag_ == "EXCLUDED")
            or (ignore_space_tokens and t.tag_ == "SPACE")
        )
    ]
    filtered_original = doc2[clean_tokens[0].i : clean_tokens[-1].i + 1].text
    for pattern, result, alignment_mode in [
        (r"4 / 3", "24 / 30", "expand"),
        (r"4 / 3", None, "strict"),
        (r"4 / 3", "/", "contract"),
        (r"24 / 30", "24 / 30", "expand"),
        (r"24 / 30", "24 / 30", "strict"),
        (r"24 / 30", "24 / 30", "contract"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\ngrammes", "expand"),
        (r"24 / 30 milli\s?gra", None, "strict"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\n", "contract"),
        (r" 24 / 30 ", "24 / 30", "expand"),
        (r" 24 / 30 ", None, "strict"),
        (r" 24 / 30 ", "24 / 30", "contract"),
        (matched_text, filtered_original, "expand"),
        (matched_text, filtered_original, "contract"),
        (matched_text, filtered_original, "strict"),
        ("(?=4 / 3)", "24", "expand"),
        ("(?=4 / 3)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=4 / 3)", None, "strict"),
        ("(?=24)", "", "expand"),
        ("(?=24)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=24)", None, "strict"),
    ]:
        match = re.search(pattern, matched_text)
        span = create_span(
            doclike,
            start_char=match.start(),
            end_char=match.end(),
            key="value",
            attr=attr,
            alignment_mode=alignment_mode,
            ignore_excluded=ignore_excluded,
            ignore_space_tokens=ignore_space_tokens,
        )
        assert (None if span is None else span.text) == result, (
            pattern,
            result,
            alignment_mode,
        )
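

# A zero-length character range on a Span should resolve to an empty token span
# anchored at the correct position in the underlying Doc.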
def test_create_empty_span(blank_nlp):
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")
    doc = blank_nlp("plan des addictions:\ntabac :0")

    span = create_span(
        doc[5:],
        0,
        0,
        "empty",
        attr="NORM",
        alignment_mode="expand",
        ignore_excluded=True,
        ignore_space_tokens=True,
    )
    assert span.start == 5 and span.end == 5
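

# A document containing only pollution should produce an empty cleaned text.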
def test_empty_get_text(blank_nlp):
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")
    doc = blank_nlp("==================================")
    clean = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True)
    assert clean == ""