import re
import pytest
from helpers import make_nlp
from pytest import mark
from edsnlp.matchers.regex import RegexMatcher, create_span
from edsnlp.matchers.utils import get_text
def test_regex(doc):
    """Basic add/remove and matching, on both a Doc and a Span.

    BUG FIX: a ``matches = matcher(doc, as_spans=False)`` result was computed
    and never used (the loop below re-ran the matcher); the dead statement is
    removed.
    """
    matcher = RegexMatcher()

    # Patterns can be removed after having been added.
    matcher.add("test", [r"test"])
    matcher.remove("test")

    matcher.add("patient", [r"patient"])

    # Tuple output: (key, start, end) token offsets must yield non-empty spans.
    for _, start, end in matcher(doc, as_spans=False):
        assert len(doc[start:end])

    # Matching also works on a Span (here, the first 10 tokens).
    matches = matcher(doc[:10])
    assert list(matches)
@mark.parametrize(
    "pattern, txt, span_from_group, result",
    [
        (
            r"match1 (?:group1|(group2))",  # pattern
            "It is a match1 group1",  # txt
            True,  # span_from_group
            "match1 group1",  # result
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group1",
            False,
            "match1 group1",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            True,
            "group2",
        ),
        (
            r"match1 (?:group1|(group2))",
            "It is a match1 group2",
            False,
            "match1 group2",
        ),
    ],
)
def test_regex_with_groups(blank_nlp, pattern, txt, span_from_group, result):
    """The returned span covers either the whole match or the first group.

    With ``span_from_group=True`` and a matched capture group, the span is
    restricted to that group; otherwise it covers the full match.
    """
    matcher = RegexMatcher(span_from_group=span_from_group)
    matcher.add("test", [pattern])

    doc = blank_nlp(txt)
    spans = list(matcher(doc, as_spans=True))
    assert spans[0].text == result
def test_regex_with_norm(blank_nlp):
    """With ``ignore_excluded=True``, a pattern matches across pollution tokens."""
    blank_nlp.add_pipe("pollution")

    text = "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"
    doc = blank_nlp(text)

    matcher = RegexMatcher(ignore_excluded=True)
    matcher.add("test", ["pneumopathie à coronavirus"])

    first = list(matcher(doc, as_spans=True))[0]

    # The span still covers the polluted original text...
    assert first.text == text
    # ...while the normalized variant is the clean pattern.
    assert first._.normalized_variant == "pneumopathie à coronavirus"
def test_regex_with_norm_on_span(blank_nlp):
    """Matching across pollution also works when starting from a Span."""
    blank_nlp.add_pipe("pollution")

    text = (
        "le patient a une pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB"
        " coronavirus"
    )

    # offset 0 exercises the full doc as a span; offset 2 a shifted span.
    for offset in (0, 2):
        span = blank_nlp(text)[offset:]

        matcher = RegexMatcher(ignore_excluded=True)
        matcher.add("test", ["pneumopathie à coronavirus"])

        first = list(matcher(span, as_spans=True))[0]
        expected = (
            "pneumopathie à NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNB coronavirus"
        )
        assert first.text == expected
        assert first._.normalized_variant == "pneumopathie à coronavirus"
def test_offset(blank_nlp):
    """Token offsets and span texts are correct, both on a Doc and a Span.

    BUG FIX: the two ``span.text == pattern`` lines were bare comparisons
    missing their ``assert``, so the as-spans checks silently did nothing.
    """
    text = "Ceci est un test de matching"

    doc = blank_nlp(text)
    pattern = "matching"

    matcher = RegexMatcher(attr="TEXT")
    matcher.add("test", [pattern])

    # Tuple output on the full doc.
    for _, start, end in matcher(doc):
        assert doc[start:end].text == pattern

    # Span output on the full doc.
    for span in matcher(doc, as_spans=True):
        assert span.text == pattern

    # Tuple output on a shifted span: offsets are relative to the span.
    for _, start, end in matcher(doc[2:]):
        assert doc[2:][start:end].text == pattern

    # Span output on a shifted span.
    for span in matcher(doc[2:], as_spans=True):
        assert span.text == pattern
def test_remove():
    """Patterns added under one key share it; removing needs an existing key."""
    matcher = RegexMatcher(attr="TEXT")

    # Two add() calls with the same key still count as a single entry.
    matcher.add("test", ["pattern"])
    matcher.add("test", ["pattern2"], attr="LOWER")
    assert len(matcher) == 1

    # Removing an unknown key raises.
    with pytest.raises(ValueError):
        matcher.remove("wrong_key")

    # Removing the key drops every pattern registered under it.
    matcher.remove("test")
    assert len(matcher) == 0
def test_norm_alignment(blank_nlp):
    """Entities found on the NORM attribute still align with the raw text."""
    blank_nlp.add_pipe(
        "matcher", config=dict(regex=dict(test=r"\btest\b"), attr="NORM")
    )

    raw = "test " + "bla… " * 4 + "test " + "bla" * 10
    annotated = blank_nlp(raw)

    # Every matched entity must map back to the literal "test" token.
    for entity in annotated.ents:
        assert entity.text == "test"
@mark.parametrize(
    "leading_text",
    [
        "",
        "\n",
        "Test de non-pollution",
    ],
)
@mark.parametrize("leading_pollution", [True, False])
@mark.parametrize("pollution_within", [True, False])
@mark.parametrize("trailing_pollution", [True, False])
@mark.parametrize(
    "pollution",
    ["==================", "======= ======= =======", "Nnnnnnnnnnnnn nnnnnn nnnnnnnn"],
)
def test_get_text(
    blank_nlp,
    leading_text: str,
    leading_pollution: bool,
    pollution_within: bool,
    trailing_pollution: bool,
    pollution: str,
):
    """get_text strips pollution wherever it appears; the matcher still fires.

    BUG FIX: the function was named ``text_get_text``, so pytest never
    collected it and this whole parametrized grid silently never ran.
    """
    # Build the example with pollution optionally before, inside and after.
    if pollution_within:
        example = f"transplantation {pollution} cardiaque en 2000."
    else:
        example = "transplantation cardiaque en 2000."
    chunks = []
    if leading_text:
        chunks.append(leading_text)
    if leading_pollution:
        chunks.append(pollution)
    chunks.append(example)
    if trailing_pollution:
        chunks.append(pollution)
    text = " ".join(chunks)

    blank_nlp.add_pipe("eds.normalizer", config=dict(pollution=True))
    blank_nlp.add_pipe(
        "eds.matcher",
        config=dict(
            regex=dict(test="transplantation cardiaque"),
            attr="NORM",
            ignore_excluded=True,
        ),
    )
    doc = blank_nlp(text)

    # Pollution tokens are excluded from the cleaned text.
    clean = get_text(doc, attr="NORM", ignore_excluded=True)
    if leading_text:
        assert clean == f"{leading_text.lower()} transplantation cardiaque en 2000."
    else:
        assert clean == "transplantation cardiaque en 2000."

    # The matcher sees through the pollution as well.
    assert doc.ents
    assert doc.ents[0][0].text == "transplantation"

    # get_text on the matched entity span is also cleaned.
    clean = get_text(doc.ents[0], attr="NORM", ignore_excluded=True)
    assert clean == "transplantation cardiaque"
def test_groupdict_as_spans(doc):
    """match_with_groupdict_as_spans yields (span, {group name -> span}) pairs."""
    matcher = RegexMatcher()
    matcher.add("test", [r"patient(?i:(?=.*(?P<cause>douleurs))?)"])

    results = list(matcher.match_with_groupdict_as_spans(doc))
    (first_span, first_groups), (second_span, second_groups) = results

    assert first_span.text == "patient"
    assert second_span.text == "patient"

    # Only the first match is followed by "douleurs", so only it captures
    # the "cause" group; the second match has an empty groupdict.
    assert len(first_groups) == 1 and first_groups["cause"].text == "douleurs"
    assert len(second_groups) == 0
def test_regex_with_space(blank_nlp):
    """ignore_space_tokens controls whether matches can cross SPACE tokens."""
    blank_nlp.add_pipe("eds.spaces")

    text = "pneumopathie à coronavirus"
    doc = blank_nlp(text)

    # Without ignoring space tokens, the pattern cannot match.
    strict = RegexMatcher(ignore_space_tokens=False)
    strict.add("test", ["pneumopathie à coronavirus"])
    assert len(list(strict(doc, as_spans=True))) == 0

    # Ignoring space tokens, the pattern matches and normalizes cleanly.
    lenient = RegexMatcher(ignore_space_tokens=True)
    lenient.add("test", ["pneumopathie à coronavirus"])
    found = list(lenient(doc, as_spans=True))[0]
    assert found.text == text
    assert found._.normalized_variant == "pneumopathie à coronavirus"
@pytest.fixture(scope="session")
def doc2(lang):
    """Session-scoped document containing pollution, ellipses and newlines."""
    nlp = make_nlp(lang)
    nlp.add_pipe("eds.pollution")
    nlp.add_pipe("eds.spaces")

    text = (
        "-----------------------------------------------------------------------\n"
        "La ………… valeur est NBNbWbWbNbWbNBNb de 24 / 30 milli\n"
        "grammes."
    )
    return nlp(text)
@mark.parametrize("ignore_excluded", [True, False])
@mark.parametrize("ignore_space_tokens", [True, False])
@mark.parametrize("attr", ["TEXT", "NORM"])
@mark.parametrize("full_doc", [True, False])
def test_create_span(
    doc2,
    ignore_excluded: bool,
    ignore_space_tokens: bool,
    attr: str,
    full_doc: bool,
):
    """create_span maps char offsets in the cleaned text back to token spans."""
    second_sentence = list(doc2.sents)[1]
    target = doc2 if full_doc else second_sentence

    # Text as seen by the matcher, with exclusions/spaces possibly removed.
    snippet = get_text(
        target,
        attr=attr,
        ignore_excluded=ignore_excluded,
        ignore_space_tokens=ignore_space_tokens,
    )

    # Original text spanned by the first and last non-filtered tokens.
    kept = [
        tok
        for tok in target
        if not (ignore_excluded and tok.tag_ == "EXCLUDED")
        and not (ignore_space_tokens and tok.tag_ == "SPACE")
    ]
    filtered_original = doc2[kept[0].i : kept[-1].i + 1].text

    cases = [
        (r"4 / 3", "24 / 30", "expand"),
        (r"4 / 3", None, "strict"),
        (r"4 / 3", "/", "contract"),
        (r"24 / 30", "24 / 30", "expand"),
        (r"24 / 30", "24 / 30", "strict"),
        (r"24 / 30", "24 / 30", "contract"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\ngrammes", "expand"),
        (r"24 / 30 milli\s?gra", None, "strict"),
        (r"24 / 30 milli\s?gra", "24 / 30 milli\n", "contract"),
        (r" 24 / 30 ", "24 / 30", "expand"),
        (r" 24 / 30 ", None, "strict"),
        (r" 24 / 30 ", "24 / 30", "contract"),
        (matched_text_case := matched_text, filtered_original, "expand")
        if False
        else (matched_text, filtered_original, "expand"),
        (matched_text, filtered_original, "contract"),
        (matched_text, filtered_original, "strict"),
        ("(?=4 / 3)", "24", "expand"),
        ("(?=4 / 3)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=4 / 3)", None, "strict"),
        ("(?=24)", "", "expand"),
        ("(?=24)", None, "contract"),  # spacy behavior, but it's not ideal
        ("(?=24)", None, "strict"),
    ]
    for pattern, expected, alignment_mode in cases:
        found = re.search(pattern, snippet)
        span = create_span(
            target,
            start_char=found.start(),
            end_char=found.end(),
            key="value",
            attr=attr,
            alignment_mode=alignment_mode,
            ignore_excluded=ignore_excluded,
            ignore_space_tokens=ignore_space_tokens,
        )
        observed = span.text if span is not None else None
        assert observed == expected, (pattern, expected, alignment_mode)
def test_create_empty_span(blank_nlp):
    """A zero-length char range must map to an empty span at the right token."""
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")

    doc = blank_nlp("plan des addictions:\ntabac :0")

    # Offsets (0, 0) relative to doc[5:] should stay anchored at token 5.
    empty = create_span(
        doc[5:],
        0,
        0,
        "empty",
        attr="NORM",
        alignment_mode="expand",
        ignore_excluded=True,
        ignore_space_tokens=True,
    )
    assert (empty.start, empty.end) == (5, 5)
def test_empty_get_text(blank_nlp):
    """get_text returns an empty string when every token is filtered out."""
    blank_nlp.add_pipe("eds.pollution")
    blank_nlp.add_pipe("eds.spaces")

    doc = blank_nlp("==================================")
    cleaned = get_text(
        doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True
    )
    assert cleaned == ""