Switch to unified view

a b/tests/pipelines/misc/test_split.py
1
from spacy.tokens import Doc
2
from spacy.tokens.span import Span
3
4
import edsnlp
5
import edsnlp.pipes as eds
6
7
8
def test_split_line_jump():
    """Split a document on blank lines and check what each sub-document keeps.

    The source document carries a doc-level extension, per-entity extensions
    (including dicts that reference other entities) and a "section" span
    group covering the whole text. After splitting on ``"\\n\\n"`` the test
    expects two sub-documents, each retaining the doc-level attribute, its
    own entities with their attributes, and a "section" span clipped to the
    sub-document's text.
    """
    text = "This is a test. Another test.\n\nA third test!"

    pipeline = edsnlp.blank("eds")
    pipeline.add_pipe("eds.matcher", config={"terms": {"test": "test"}})
    doc = pipeline(text)

    # Register the custom attributes; force=True keeps re-runs from failing
    # when the extensions already exist.
    Span.set_extension("test_dict", default={}, force=True)
    Doc.set_extension("global_attr", default=None, force=True)
    Span.set_extension("ent_attr", default=None, force=True)

    doc._.global_attr = "global"

    # Entities 0 and 1 sit before the blank line, entity 2 after it.
    ents = doc.ents
    ents[0]._.test_dict = {"key": ents[1]}
    ents[0]._.ent_attr = "ent-0"
    ents[1]._.test_dict = {"key": ents[0]}
    ents[2]._.test_dict = {"key": ents[0]}
    ents[2]._.ent_attr = "ent-2"

    doc.spans["section"] = [doc[:]]
    doc.spans["section"][0]._.ent_attr = "section"

    splitter = eds.split(regex="\n\n")
    subdocs = list(splitter(doc))

    assert len(subdocs) == 2
    first_part = subdocs[0]
    second_part = subdocs[1]

    # First sub-document: both entities and their cross-references survive.
    assert first_part.text == "This is a test. Another test.\n\n"
    assert first_part._.global_attr == "global"
    assert first_part.ents[0]._.test_dict["key"] == first_part.ents[1]
    assert first_part.ents[0]._.ent_attr == "ent-0"
    assert first_part.ents[1]._.test_dict["key"] == first_part.ents[0]
    first_section = first_part.spans["section"][0]
    assert first_section.text == "This is a test. Another test.\n\n"
    assert first_section._.ent_attr == "section"

    # Second sub-document: its only entity referenced an entity located in
    # the first part, and the dict is expected to come back empty.
    assert second_part.text == "A third test!"
    assert second_part._.global_attr == "global"
    assert second_part.ents[0]._.test_dict == {}
    assert second_part.ents[0]._.ent_attr == "ent-2"
    second_section = second_part.spans["section"][0]
    assert second_section.text == "A third test!"
    assert second_section._.ent_attr == "section"
43
44
45
def test_filter():
    """Check that ``filter_expr`` keeps only the matching sub-documents.

    The text is split at sentence boundaries and only the piece whose text
    contains "Another" is expected to be returned.
    """
    nlp = edsnlp.blank("eds")
    doc = nlp("This is a test. Another test.")

    # NOTE(review): the empty group in the pattern presumably marks where
    # the cut happens (right before the capital letter) - confirm against
    # the eds.split documentation.
    splitter = eds.split(
        regex=r"[.!?]\s+()[A-Z]",
        filter_expr='"Another" in doc.text',
    )
    kept_texts = [piece.text for piece in splitter(doc)]

    # Exactly one sub-document must remain, and it must be the second one.
    assert kept_texts == ["Another test."]
56
57
58
def test_max_length():
    """Check that ``max_length`` cuts a long sentence into bounded chunks.

    With ``max_length=12`` the sentence is expected to come back as three
    consecutive pieces whose concatenation is the original text.
    """
    # NOTE(review): judging by the expected chunks, the limit appears to be
    # counted in tokens rather than characters - confirm against eds.split.
    expected_chunks = [
        "Le patient mange des pates depuis le début du confinement, il ",
        "est donc un peu ballonné, mais pense revenir à un régime ",
        "plus équilibré en mangeant des légumes et des fruits.",
    ]

    sentence = (
        "Le patient mange des pates depuis le début du confinement, "
        "il est donc un peu ballonné, mais pense revenir à un régime plus "
        "équilibré en mangeant des légumes et des fruits."
    )
    nlp = edsnlp.blank("eds")
    doc = nlp(sentence)

    splitter = eds.split(max_length=12)
    chunks = []
    for piece in splitter(doc):
        chunks.append(piece.text)

    assert chunks == expected_chunks