[cad161]: / tests / matchers / test_simstring.py

Download this file

101 lines (80 with data), 3.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
from pytest import mark
from edsnlp.matchers.simstring import SimstringMatcher
def test_simstring_matcher(doc, nlp):
    """Match two term lists against the ``doc`` fixture on the TEXT attribute.

    The ``locomotions`` pattern is expected to hit the singular
    ``locomotion`` in the document, and ``patient`` is expected twice.
    """
    terms = {
        "patient": ["patient"],
        "locomotion": ["locomotions"],
    }

    matcher = SimstringMatcher(nlp.vocab, attr="TEXT")
    matcher.build_patterns(nlp, terms)

    # Collect the surface text of every returned span, in matcher order.
    found = []
    for span in matcher(doc, as_spans=True):
        found.append(span.text)

    assert found == ["patient", "locomotion", "patient"]
def test_with_normalizer(blank_nlp):
    """Match on NORM after adding the ``eds.normalizer`` pipe.

    The matcher uses the dice measure with a 0.75 threshold and is
    configured to ignore space tokens and excluded tokens. Each sample
    text holds an exact or slightly altered variant of one pattern, and
    exactly that variant must be returned.
    """
    blank_nlp.add_pipe("eds.normalizer")

    term = "matching"
    patterns = {
        "test": [term],
        "C220": ["carcinome hépatocellulaire", "carc. hépatocellulaire"],
        "N02BE01": ["paracetamol"],
    }

    matcher = SimstringMatcher(
        blank_nlp.vocab,
        attr="NORM",
        threshold=0.75,
        measure="dice",
        ignore_space_tokens=True,
        ignore_excluded=True,
    )
    matcher.build_patterns(blank_nlp, patterns)

    cases = (
        ("Ceci est un test de matching", ["matching"]),
        ("Ceci est un test de matchings", ["matchings"]),
        ("On prescrit du paracétomol , un medicament.", ["paracétomol"]),
        (
            "Le patient a un carcinome\nhépatacellulaire !",
            ["carcinome\nhépatacellulaire"],
        ),
    )

    for text, expected in cases:
        doc = blank_nlp(text)

        # Sanity check: the matcher finds something on the full document.
        hits = list(matcher(doc))
        assert hits

        # Run on the slice first, then on the whole document; in each case
        # check the span output and then the (label, start, end) tuples.
        for target in (doc[2:], doc):
            span_texts = sorted(m.text for m in matcher(target, as_spans=True))
            assert span_texts == expected

            # Offsets are applied to the full doc, including for the slice.
            tuple_texts = sorted(doc[s:e].text for _, s, e in matcher(target))
            assert tuple_texts == expected
@mark.parametrize("measure", ["dice", "cosine", "jaccard", "overlap"])
def test_without_normalizer(blank_nlp, measure):
    """Match on NORM without adding a normalizer pipe, once per measure.

    The matcher is built with a 0.6 threshold and the parametrized
    similarity ``measure``. Each sample text holds an exact or slightly
    altered variant of one pattern, and exactly that variant must be
    returned.
    """
    term = "matching"
    patterns = {
        "test": [term],
        "C220": ["carcinome hépatocellulaire", "carc. hépatocellulaire"],
        "N02BE01": ["paracétamol"],
    }

    matcher = SimstringMatcher(
        blank_nlp.vocab,
        attr="NORM",
        threshold=0.6,
        measure=measure,
    )
    matcher.build_patterns(blank_nlp, patterns)

    cases = (
        ("Ceci est un test de matching", ["matching"]),
        ("Ceci est un test de matchings", ["matchings"]),
        ("On prescrit du paracétomol, un médicament.", ["paracétomol"]),
        (
            "Le patient a un carcinome hépatacellulaire !",
            ["carcinome hépatacellulaire"],
        ),
    )

    for text, expected in cases:
        doc = blank_nlp(text)

        # Sanity check: the matcher finds something on the full document.
        hits = list(matcher(doc))
        assert hits

        # Run on the slice first, then on the whole document; in each case
        # check the span output and then the (label, start, end) tuples.
        for target in (doc[2:], doc):
            span_texts = sorted(m.text for m in matcher(target, as_spans=True))
            assert span_texts == expected

            # Offsets are applied to the full doc, including for the slice.
            tuple_texts = sorted(doc[s:e].text for _, s, e in matcher(target))
            assert tuple_texts == expected