edsnlp / Git / Diff of /tests/matchers/test

Models:
philipB/
edsnlp
Downloads: 1
Diff of /tests/matchers/test_simstring.py [000000] .. [cad161]
Switch to side-by-side view

--- a
+++ b/tests/matchers/test_simstring.py
@@ -0,0 +1,100 @@
+from pytest import mark
+
+from edsnlp.matchers.simstring import SimstringMatcher
+
+
+def test_simstring_matcher(doc, nlp):
+    matcher = SimstringMatcher(nlp.vocab, attr="TEXT")
+
+    matcher.build_patterns(
+        nlp,
+        {
+            "patient": ["patient"],
+            "locomotion": ["locomotions"],
+        },
+    )
+
+    matches = [m.text for m in matcher(doc, as_spans=True)]
+
+    assert matches == ["patient", "locomotion", "patient"]
+
+
+def test_with_normalizer(blank_nlp):
+    blank_nlp.add_pipe("eds.normalizer")
+    pattern = "matching"
+    matcher = SimstringMatcher(
+        blank_nlp.vocab,
+        attr="NORM",
+        threshold=0.75,
+        measure="dice",
+        ignore_space_tokens=True,
+        ignore_excluded=True,
+    )
+
+    matcher.build_patterns(
+        blank_nlp,
+        {
+            "test": [pattern],
+            "C220": ["carcinome hépatocellulaire", "carc. hépatocellulaire"],
+            "N02BE01": ["paracetamol"],
+        },
+    )
+
+    texts = (
+        ("Ceci est un test de matching", ["matching"]),
+        ("Ceci est un test de matchings", ["matchings"]),
+        ("On prescrit du paracétomol      , un medicament.", ["paracétomol"]),
+        (
+            "Le patient a un carcinome\nhépatacellulaire !",
+            ["carcinome\nhépatacellulaire"],
+        ),
+    )
+
+    for text, ents in texts:
+        doc = blank_nlp(text)
+        matches = list(matcher(doc))
+        assert len(matches) > 0
+
+        assert sorted([m.text for m in matcher(doc[2:], as_spans=True)]) == ents
+        assert sorted([doc[s:e].text for _, s, e in matcher(doc[2:])]) == ents
+
+        assert sorted([m.text for m in matcher(doc, as_spans=True)]) == ents
+        assert sorted([doc[s:e].text for _, s, e in matcher(doc)]) == ents
+
+
+@mark.parametrize("measure", ["dice", "cosine", "jaccard", "overlap"])
+def test_without_normalizer(blank_nlp, measure):
+    pattern = "matching"
+    matcher = SimstringMatcher(
+        blank_nlp.vocab, attr="NORM", threshold=0.6, measure=measure
+    )
+
+    matcher.build_patterns(
+        blank_nlp,
+        {
+            "test": [pattern],
+            "C220": ["carcinome hépatocellulaire", "carc. hépatocellulaire"],
+            "N02BE01": ["paracétamol"],
+        },
+    )
+
+    texts = (
+        ("Ceci est un test de matching", ["matching"]),
+        ("Ceci est un test de matchings", ["matchings"]),
+        ("On prescrit du paracétomol, un médicament.", ["paracétomol"]),
+        (
+            "Le patient a un carcinome hépatacellulaire !",
+            ["carcinome hépatacellulaire"],
+        ),
+    )
+
+    for text, ents in texts:
+        doc = blank_nlp(text)
+        matches = list(matcher(doc))
+        assert len(matches) > 0
+
+        assert sorted([m.text for m in matcher(doc[2:], as_spans=True)]) == ents
+        assert sorted([doc[s:e].text for _, s, e in matcher(doc[2:])]) == ents
+
+        assert sorted([m.text for m in matcher(doc, as_spans=True)]) == ents
+        assert sorted([doc[s:e].text for _, s, e in matcher(doc)]) == ents