|
a |
|
b/tests/pipelines/misc/test_split.py |
|
|
1 |
from spacy.tokens import Doc |
|
|
2 |
from spacy.tokens.span import Span |
|
|
3 |
|
|
|
4 |
import edsnlp |
|
|
5 |
import edsnlp.pipes as eds |
|
|
6 |
|
|
|
7 |
|
|
|
8 |
def test_split_line_jump():
    """Split a document on a blank line and check what each part carries over.

    The source document holds three "test" entities, a doc-level attribute,
    per-entity attributes, entity-to-entity references stored in a dict
    attribute, and one "section" span covering the whole text. The test
    checks the text, attributes, entities and section span of the two
    resulting sub-documents.
    """
    text = "This is a test. Another test.\n\nA third test!"

    pipeline = edsnlp.blank("eds")
    pipeline.add_pipe("eds.matcher", config={"terms": {"test": "test"}})
    doc = pipeline(text)

    # Custom attributes used below (force=True tolerates re-registration).
    Span.set_extension("test_dict", default={}, force=True)
    Doc.set_extension("global_attr", default=None, force=True)
    Span.set_extension("ent_attr", default=None, force=True)

    doc._.global_attr = "global"

    # Cross-reference the entities through the dict attribute; the third
    # entity points at one located before the blank line.
    entities = doc.ents
    first_ent, second_ent, third_ent = entities[0], entities[1], entities[2]
    first_ent._.test_dict = {"key": second_ent}
    first_ent._.ent_attr = "ent-0"
    second_ent._.test_dict = {"key": first_ent}
    third_ent._.test_dict = {"key": first_ent}
    third_ent._.ent_attr = "ent-2"

    # A single section span covering the full document.
    doc.spans["section"] = [doc[:]]
    whole_section = doc.spans["section"][0]
    whole_section._.ent_attr = "section"

    splitter = eds.split(regex="\n\n")
    subdocs = list(splitter(doc))

    assert len(subdocs) == 2
    head_doc, tail_doc = subdocs

    # First part: both entities keep their mutual references.
    head_ents = head_doc.ents
    head_section = head_doc.spans["section"][0]
    assert head_doc.text == "This is a test. Another test.\n\n"
    assert head_doc._.global_attr == "global"
    assert head_ents[0]._.test_dict["key"] == head_ents[1]
    assert head_ents[0]._.ent_attr == "ent-0"
    assert head_ents[1]._.test_dict["key"] == head_ents[0]
    assert head_section.text == "This is a test. Another test.\n\n"
    assert head_section._.ent_attr == "section"

    # Second part: the lone entity is expected to have an empty dict, since
    # the entity it referenced belongs to the first part.
    tail_ent = tail_doc.ents[0]
    tail_section = tail_doc.spans["section"][0]
    assert tail_doc.text == "A third test!"
    assert tail_doc._.global_attr == "global"
    assert tail_ent._.test_dict == {}
    assert tail_ent._.ent_attr == "ent-2"
    assert tail_section.text == "A third test!"
    assert tail_section._.ent_attr == "section"
|
|
43 |
|
|
|
44 |
|
|
|
45 |
def test_filter():
    """Check that `filter_expr` keeps only the matching sub-document.

    The text is split with a sentence-boundary regex and filtered with an
    expression on `doc.text`; only the part containing "Another" should
    remain.
    """
    pipeline = edsnlp.blank("eds")
    doc = pipeline("""This is a test. Another test.""")

    splitter = eds.split(
        regex=r"[.!?]\s+()[A-Z]",
        filter_expr='"Another" in doc.text',
    )
    kept = list(splitter(doc))

    assert len(kept) == 1
    assert kept[0].text == "Another test."
|
|
56 |
|
|
|
57 |
|
|
|
58 |
def test_max_length():
    """Check the chunks produced by `eds.split(max_length=12)` on a sentence.

    The expected result is three consecutive chunks whose texts, trailing
    spaces included, concatenate back to the original sentence.
    """
    sentence = (
        "Le patient mange des pates depuis le début du confinement, "
        "il est donc un peu ballonné, mais pense revenir à un régime plus "
        "équilibré en mangeant des légumes et des fruits."
    )
    expected_chunks = [
        "Le patient mange des pates depuis le début du confinement, il ",
        "est donc un peu ballonné, mais pense revenir à un régime ",
        "plus équilibré en mangeant des légumes et des fruits.",
    ]

    pipeline = edsnlp.blank("eds")
    doc = pipeline(sentence)
    splitter = eds.split(max_length=12)

    chunk_texts = []
    for chunk in splitter(doc):
        chunk_texts.append(chunk.text)

    assert chunk_texts == expected_chunks