|
a |
|
b/tests/data/test_standoff.py |
|
|
1 |
import filecmp |
|
|
2 |
import re |
|
|
3 |
from itertools import islice |
|
|
4 |
from os import listdir |
|
|
5 |
from os.path import join |
|
|
6 |
from pathlib import Path |
|
|
7 |
from random import choice, randint, random |
|
|
8 |
from string import ascii_letters, ascii_lowercase |
|
|
9 |
|
|
|
10 |
import pytest |
|
|
11 |
|
|
|
12 |
import edsnlp |
|
|
13 |
from edsnlp.connectors.brat import BratConnector |
|
|
14 |
from edsnlp.core import PipelineProtocol |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
def random_word():
    """Return a random ASCII word of 2 to 21 letters.

    The first character is drawn from upper- and lower-case letters; the
    remaining 1 to 20 characters are lower-case only.
    """
    tail_length = randint(1, 20)
    initial = choice(ascii_letters)
    tail = "".join(choice(ascii_lowercase) for _ in range(tail_length))
    return initial + tail
|
|
22 |
|
|
|
23 |
|
|
|
24 |
def random_text():
    """Return 30 to 60 random words (see ``random_word``) joined by single spaces."""
    word_count = randint(30, 60)
    return " ".join(random_word() for _ in range(word_count))
|
|
27 |
|
|
|
28 |
|
|
|
29 |
def random_brat_file(text):
    r"""Build random standoff-style annotation content for ``text``.

    Every ``\w+`` match in ``text`` is kept when a fresh ``random()`` draw is
    above 0.8. Each kept word becomes one line with a sequential ``T<n>``
    identifier, the fixed label ``TEST``, the character offsets of the match
    and the matched text. The returned string always ends with a newline,
    even when no word was kept.
    """
    entries = []

    for word in re.finditer(r"\w+", text):
        # Draw once per word; words whose draw is not above 0.8 are skipped.
        if random() <= 0.8:
            continue
        entries.append(
            f"T{len(entries) + 1}\tTEST {word.start()} {word.end()}\t{word.group()}"
        )

    return "\n".join(entries) + "\n"
|
|
40 |
|
|
|
41 |
|
|
|
42 |
@pytest.fixture
def brat_folder(tmpdir):
    """Fill ``tmpdir`` with 100 random ``<i>.txt`` / ``<i>.ann`` pairs and return it.

    The annotation file of document 0 contains only a newline; every other
    document gets the output of ``random_brat_file`` for its text.
    """
    for index in range(100):
        text = random_text()
        # Generated for every document, including index 0 where it is unused.
        annotations = random_brat_file(text)
        ann_content = "\n" if index == 0 else annotations

        with open(join(tmpdir, f"{index}.txt"), "w") as txt_file:
            txt_file.write(text)

        with open(join(tmpdir, f"{index}.ann"), "w") as ann_file:
            ann_file.write(ann_content)

    return tmpdir
|
|
58 |
|
|
|
59 |
|
|
|
60 |
# OLD BratConnector tests, deprecated |
|
|
61 |
|
|
|
62 |
|
|
|
63 |
@pytest.fixture
def brat1(brat_folder) -> BratConnector:
    """Connector pointing at the directory populated by the ``brat_folder`` fixture."""
    connector = BratConnector(brat_folder)
    return connector
|
|
66 |
|
|
|
67 |
|
|
|
68 |
@pytest.fixture
def brat2(tmpdir) -> BratConnector:
    """Connector pointing directly at the test's ``tmpdir``."""
    connector = BratConnector(tmpdir)
    return connector
|
|
71 |
|
|
|
72 |
|
|
|
73 |
@pytest.fixture
def brat_importer():
    """Connector over ``resources/brat_data``, two directory levels above this file.

    ``"bool flag 0"` is declared as a boolean attribute.
    """
    tests_root = Path(__file__).parent.parent.resolve()
    brat_dir = tests_root.joinpath("resources", "brat_data")
    return BratConnector(str(brat_dir), bool_attributes=["bool flag 0"])
|
|
77 |
|
|
|
78 |
|
|
|
79 |
@pytest.fixture
def brat_exporter(tmpdir):
    """Connector on ``tmpdir`` configured with the three attributes used by the tests."""
    exported_attributes = ["etat", "assertion", "bool flag 0"]
    return BratConnector(tmpdir, attributes=exported_attributes)
|
|
82 |
|
|
|
83 |
|
|
|
84 |
def test_empty_brat(brat2: BratConnector, blank_nlp: PipelineProtocol):
    """``brat2docs`` on the ``brat2`` connector must raise ``AssertionError``."""
    expect_failure = pytest.raises(AssertionError)
    with expect_failure:
        brat2.brat2docs(blank_nlp)
|
|
87 |
|
|
|
88 |
|
|
|
89 |
def test_brat2brat(
    brat1: BratConnector, brat2: BratConnector, blank_nlp: PipelineProtocol
):
    """Read docs with ``brat1``, write them with ``brat2`` and compare the files.

    Every file found in ``brat1.directory`` must also exist in
    ``brat2.directory`` with identical content.
    """
    docs = brat1.brat2docs(blank_nlp)
    brat2.docs2brat(docs)

    files = listdir(brat1.directory)

    # Guard against a vacuous pass when the source directory is empty.
    assert files

    # List the export directory once instead of once per compared file.
    exported = set(listdir(brat2.directory))

    for file in files:
        assert file in exported
        # shallow=False forces a content comparison; the default may accept
        # two files whose os.stat() signatures match without reading them.
        assert filecmp.cmp(
            join(brat1.directory, file),
            join(brat2.directory, file),
            shallow=False,
        )
|
|
102 |
|
|
|
103 |
|
|
|
104 |
def test_docs2brat(nlp, brat2):
    """Export two processed docs with ``brat2.docs2brat``.

    The same text is processed twice by ``nlp``; the first doc uses its
    ``"pollutions"`` span group as entities, the second its
    ``"section_titles"`` group. Note ids are 1 and 2. There is no assertion
    on the output: the test passes when the export does not raise.
    """
    text = (
        "Le patient est admis pour des douleurs dans le bras droit, "
        "mais n'a pas de problème de locomotion. "
        "Historique d'AVC dans la famille. pourrait être un cas de rhume.\n"
        "NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbN"
        "BWbNbWbNbWBNbNbWbNbNBNbWbWbNbWBNbNbWbNBNbWbWbNb\n"
        "Pourrait être un cas de rhume.\n"
        "Motif :\n"
        "Douleurs dans le bras droit."
    )

    docs = []
    for note_id, span_key in enumerate(("pollutions", "section_titles"), start=1):
        doc = nlp(text)
        doc.ents = doc.spans[span_key]
        doc._.note_id = note_id
        docs.append(doc)

    brat2.docs2brat(docs)
|
|
127 |
|
|
|
128 |
|
|
|
129 |
def assert_doc_read(doc):
    """Assert that ``doc`` matches the expected content of ``subfolder/doc-1``.

    Checks the note id, then compares the entities (under the key
    ``"__ents__"``) and every span group of ``doc.spans`` against hard-coded
    ``(start, end, text, (etat, assertion, bool flag 0))`` tuples.
    """
    assert doc._.note_id == "subfolder/doc-1"

    attrs = ("etat", "assertion", "bool flag 0")

    def describe(spans):
        # Sorted so the comparison does not depend on iteration order.
        return sorted(
            [
                (
                    span.start,
                    span.end,
                    span.text,
                    tuple(getattr(span._, key) for key in attrs),
                )
                for span in spans
            ]
        )

    spans_and_attributes = {"__ents__": describe(doc.ents)}
    for name in doc.spans:
        spans_and_attributes[name] = describe(doc.spans[name])

    expected = {
        "__ents__": [
            (6, 7, "douleurs", (None, None, False)),
            (7, 11, "dans le bras droit", (None, None, False)),
            (17, 21, "problème \nde locomotion", (None, "absent", True)),
            (25, 26, "AVC", ("passé", "non-associé", False)),
            (35, 36, "rhume", ("présent", "hypothétique", False)),
            (45, 46, "rhume", ("présent", "hypothétique", False)),
            (51, 52, "Douleurs", (None, None, False)),
            (52, 56, "dans le bras droit", (None, None, False)),
            (68, 69, "anomalie", (None, "absent", False)),
        ],
        "anatomie": [
            (9, 11, "bras droit", (None, None, False)),
            (54, 56, "bras droit", (None, None, False)),
        ],
        "localisation": [
            (7, 11, "dans le bras droit", (None, None, False)),
            (52, 56, "dans le bras droit", (None, None, False)),
        ],
        "pathologie": [
            (17, 21, "problème \nde locomotion", (None, "absent", True)),
            (25, 26, "AVC", ("passé", "non-associé", False)),
            (35, 36, "rhume", ("présent", "hypothétique", False)),
            (45, 46, "rhume", ("présent", "hypothétique", False)),
        ],
        "sosy": [
            (6, 7, "douleurs", (None, None, False)),
            (51, 52, "Douleurs", (None, None, False)),
            (68, 69, "anomalie", (None, "absent", False)),
        ],
        "test label 0": [(68, 69, "anomalie", (None, "absent", False))],
    }

    assert spans_and_attributes == expected
|
|
184 |
|
|
|
185 |
|
|
|
186 |
def assert_doc_write(exported_ann_text):
    """Assert that ``exported_ann_text`` equals the expected annotation export.

    The expected content is the concatenation of the lines below, each one
    terminated by a newline.
    """
    # NOTE(review): random_brat_file in this file separates fields with tabs,
    # while these expected lines use single spaces — confirm this matches the
    # exporter output.
    expected_lines = (
        "T1 sosy 30 38 douleurs\n",
        "A1 etat T1 test\n",
        "T2 localisation 39 57 dans le bras droit\n",
        "T3 anatomie 47 57 bras droit\n",
        "T4 pathologie 75 83;85 98 problème de locomotion\n",
        "A2 assertion T4 absent\n",
        "A3 bool flag 0 T4\n",
        "T5 pathologie 114 117 AVC\n",
        "A4 etat T5 passé\n",
        "A5 assertion T5 non-associé\n",
        "T6 pathologie 159 164 rhume\n",
        "A6 etat T6 présent\n",
        "A7 assertion T6 hypothétique\n",
        "T7 pathologie 291 296 rhume\n",
        "A8 etat T7 présent\n",
        "A9 assertion T7 hypothétique\n",
        "T8 sosy 306 314 Douleurs\n",
        "T9 localisation 315 333 dans le bras droit\n",
        "T10 anatomie 323 333 bras droit\n",
        "T11 sosy 378 386 anomalie\n",
        "A10 assertion T11 absent\n",
        "T12 test label 0 378 386 anomalie\n",
        "A11 assertion T12 absent\n",
    )
    assert exported_ann_text == "".join(expected_lines)
|
|
212 |
|
|
|
213 |
|
|
|
214 |
def test_brat(
    brat_importer: BratConnector,
    brat_exporter: BratConnector,
    blank_nlp: PipelineProtocol,
):
    """Import the first doc, check it, modify one attribute and check the export.

    The doc returned first by ``brat_importer.brat2docs`` is validated with
    ``assert_doc_read``. The ``etat`` attribute of its first entity is then
    set to ``"test"``, the doc is exported with ``brat_exporter`` and the
    written ``subfolder/doc-1.ann`` is validated with ``assert_doc_write``.
    """
    doc = brat_importer.brat2docs(blank_nlp)[0]
    assert_doc_read(doc)
    doc.ents[0]._.etat = "test"

    brat_exporter.docs2brat([doc])

    # The expected annotations contain non-ASCII characters, so decode
    # explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the exporter writes UTF-8 — confirm.
    ann_path = brat_exporter.directory / "subfolder" / "doc-1.ann"
    with open(ann_path, encoding="utf-8") as f:
        exported_ann_text = f.read()

    assert_doc_write(exported_ann_text)
|
|
228 |
|
|
|
229 |
|
|
|
230 |
# New `edsnlp.data.read_standoff` and `edsnlp.data.write_standoff` tests |
|
|
231 |
|
|
|
232 |
|
|
|
233 |
def test_read_to_standoff(blank_nlp, tmpdir):
    """Round-trip ``resources/brat_data`` through ``read_standoff`` / ``write_standoff``.

    The first doc read is validated with ``assert_doc_read``. The ``etat``
    and ``cui`` attributes of its first entity are then modified, the doc is
    written to ``tmpdir`` and the resulting ``subfolder/doc-1.ann`` is
    validated with ``assert_doc_write``.
    """
    input_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"
    output_dir = Path(tmpdir)
    doc = list(
        edsnlp.data.read_standoff(
            input_dir,
            bool_attributes=["bool flag 0"],
            notes_as_span_attribute="cui",
        )
    )[0]
    assert_doc_read(doc)
    doc.ents[0]._.etat = "test"
    doc.ents[0]._.cui = "C0030193"

    edsnlp.data.write_standoff(
        [doc],
        output_dir,
        span_attributes=["etat", "assertion", "bool flag 0"],
        span_getter=[
            "ents",
            "sosy",
            "localisation",
            "anatomie",
            "pathologie",
            "test label 0",
        ],
    )

    # The expected annotations contain non-ASCII characters, so decode
    # explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes write_standoff writes UTF-8 — confirm.
    ann_path = output_dir / "subfolder" / "doc-1.ann"
    with open(ann_path, encoding="utf-8") as f:
        exported_ann_text = f.read()

    assert_doc_write(exported_ann_text)
|
|
265 |
|
|
|
266 |
|
|
|
267 |
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
def test_read_shuffle_loop(num_cpu_workers: int):
    """Check the first six note ids of a looping, shuffled standoff reader.

    The reader is created with ``shuffle="dataset"``, ``seed=42`` and
    ``loop=True``; the expected order must hold for each parametrized
    ``num_cpu_workers`` value.
    """
    data_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"

    stream = edsnlp.data.read_standoff(
        data_dir,
        shuffle="dataset",
        keep_txt_only_docs=True,
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda x: x._.note_id)
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # loop=True is passed to the reader, so only a fixed-size prefix is taken.
    first_note_ids = list(islice(stream, 6))

    expected_note_ids = [
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
    ]
    assert first_note_ids == expected_note_ids