|
a |
|
b/tests/test_language.py |
|
|
1 |
import pytest |
|
|
2 |
import spacy |
|
|
3 |
from spacy.lang.fr.lex_attrs import like_num |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
def test_eds_tokenizer_handles_long_text():
    """Tokenize a multi-line French report excerpt with the blank "eds" pipeline.

    The concatenation of every token's ``text_with_ws`` is compared, piece by
    piece, against a hand-written segmentation of the same text.
    """
    report = """Témoin interne : + ; témoin externe : +
- Récepteurs aux œstrogènes : tous les élements sont marqués (3+).
- Récepteurs à la progestérone : 40% sont marqués (intensité 2+).
- Anti-Cerb B2: 0% des cellules carcinomateuses présentent
un marquage membranaire complet.
CONCLUSION`: ========
-Carcinome mammaire infiltrant du quadrant inféro-externe,
de 25 mm size de grade II de
malignité selon Elston et Ellis (3+2+1), sans composante
Score ACR5 de chaque coté`'.
On se donne rendez-vous pour le 23/11/1967.
On fait des mesures de E.coli dans la seine.
Le sang est de type O. Promis !
"""
    # "|" marks every expected token boundary; each newline sits between two
    # pipes and is therefore expected to come out as a token of its own.
    expected_pieces = """Témoin |interne |: |+ |; |témoin |externe |: |+|
|- |Récepteurs |aux |œstrogènes |: |tous |les |élements |sont |marqués |(|3|+|)|.|
|- |Récepteurs |à |la |progestérone |: |40|% |sont |marqués |(|intensité |2|+|)|.|
|- |Anti|-|Cerb |B|2|: |0|% |des |cellules |carcinomateuses |présentent|
|un |marquage |membranaire |complet|.|
|CONCLUSION|`|: |=|=|=|=|=|=|=|=|
|-|Carcinome |mammaire |infiltrant |du |quadrant |inféro|-|externe|,|
|de |25 |mm |size |de |grade |II |de|
|malignité |selon |Elston |et |Ellis |(|3|+|2|+|1|)|, |sans |composante|
|Score |ACR|5 |de |chaque |coté|`|'|.|
|On |se |donne |rendez|-|vous |pour |le |23|/|11|/|1967|.|
|On |fait |des |mesures |de |E.|coli |dans |la |seine|.|
|Le |sang |est |de |type |O|. |Promis |!|
""".split("|")

    doc = spacy.blank("eds")(report)
    observed_pieces = [token.text_with_ws for token in doc]

    assert observed_pieces == expected_pieces
|
|
40 |
|
|
|
41 |
|
|
|
42 |
@pytest.mark.parametrize("word", ["onze", "onzième"])
def test_eds_lex_attrs_capitals(word):
    """Check that ``like_num`` accepts *word* both as given and upper-cased."""
    for variant in (word, word.upper()):
        assert like_num(variant)
|
|
46 |
|
|
|
47 |
|
|
|
48 |
def test_eds_tokenizer_whitespace():
    """Check the (text, whitespace_) pairs produced for mixed whitespace.

    The input mixes a non-breaking space, a tab and regular spaces. The
    expectation is that the non-breaking space after "Lorem" is reported as a
    plain " " in ``whitespace_`` and that the tab comes out as its own token.
    """
    doc = spacy.blank("eds")("Lorem\xA0Ipsum\tDolor Sit Amet")

    expected_pairs = [
        ("Lorem", " "),
        ("Ipsum", ""),
        ("\t", ""),
        ("Dolor", " "),
        ("Sit", " "),
        ("Amet", ""),
    ]
    observed_pairs = [(token.text, token.whitespace_) for token in doc]

    assert observed_pairs == expected_pairs
|
|
59 |
|
|
|
60 |
|
|
|
61 |
def test_eds_tokenizer_numbers():
    """Check how decimal numbers, a slash and a glued unit are split.

    For "5.3/5.4mm" the expectation is that each decimal number stays a single
    token while the slash and the trailing "mm" become separate tokens.
    """
    doc = spacy.blank("eds")("Il fait 5.3/5.4mm")

    expected_pairs = [
        ("Il", " "),
        ("fait", " "),
        ("5.3", ""),
        ("/", ""),
        ("5.4", ""),
        ("mm", ""),
    ]
    observed_pairs = [(token.text, token.whitespace_) for token in doc]

    assert observed_pairs == expected_pairs
|
|
72 |
|
|
|
73 |
|
|
|
74 |
def test_eds_tokenizer_exceptions():
    """Check tokenization of an abbreviation and a dotted alphanumeric code.

    The expectation is that "M." stays one token, that "B.", "H." and "HP."
    each keep their trailing period, and that "A7A0" is split into alternating
    letter and digit tokens.
    """
    sentence = "M. Gentil a un rhume, code ADICAP: B.H.HP.A7A0"
    doc = spacy.blank("eds")(sentence)

    expected_pairs = [
        ("M.", " "),
        ("Gentil", " "),
        ("a", " "),
        ("un", " "),
        ("rhume", ""),
        (",", " "),
        ("code", " "),
        ("ADICAP", ""),
        (":", " "),
        ("B.", ""),
        ("H.", ""),
        ("HP.", ""),
        ("A", ""),
        ("7", ""),
        ("A", ""),
        ("0", ""),
    ]
    observed_pairs = [(token.text, token.whitespace_) for token in doc]

    assert observed_pairs == expected_pairs