Switch to unified view

a b/tests/data/test_standoff.py
1
import filecmp
2
import re
3
from itertools import islice
4
from os import listdir
5
from os.path import join
6
from pathlib import Path
7
from random import choice, randint, random
8
from string import ascii_letters, ascii_lowercase
9
10
import pytest
11
12
import edsnlp
13
from edsnlp.connectors.brat import BratConnector
14
from edsnlp.core import PipelineProtocol
15
16
17
def random_word():
    """Build a random word of 2 to 21 ASCII letters.

    The first character is drawn from ``ascii_letters`` (either case) and
    the remaining 1 to 20 characters from ``ascii_lowercase``.
    """
    tail_length = randint(1, 20)
    # The leading letter is drawn before the tail, so the random generator is
    # consumed in the order: length, first letter, tail letters.
    first_letter = choice(ascii_letters)
    tail = "".join(choice(ascii_lowercase) for _ in range(tail_length))
    return first_letter + tail
22
23
24
def random_text():
    """Return 30 to 60 random words (see ``random_word``) joined by spaces."""
    word_count = randint(30, 60)
    words = []
    for _ in range(word_count):
        words.append(random_word())
    return " ".join(words)
27
28
29
def random_brat_file(text):
    """Generate random BRAT entity annotations for ``text``.

    Each ``\\w+`` match in ``text`` is kept when a fresh ``random()`` draw is
    above 0.8. A kept match becomes one line
    ``T<n>\\tTEST <start> <end>\\t<matched text>``, with ``<n>`` counting the
    kept matches from 1.

    Returns the lines joined by newlines and terminated by a newline; when no
    match is kept the result is a single newline.
    """
    annotations = []

    for word in re.finditer(r"\w+", text):
        # Exactly one random draw per word, whether or not it is kept.
        if random() <= 0.8:
            continue
        identifier = len(annotations) + 1
        annotations.append(
            f"T{identifier}\tTEST {word.start()} {word.end()}\t{word.group()}"
        )

    return "\n".join(annotations) + "\n"
40
41
42
@pytest.fixture
def brat_folder(tmpdir):
    """Fill ``tmpdir`` with 100 random ``<i>.txt`` / ``<i>.ann`` pairs.

    Document ``0`` gets an annotation file holding only a newline; every other
    document gets the output of ``random_brat_file`` for its text. The same
    ``tmpdir`` is returned.
    """
    for index in range(100):
        text = random_text()
        annotations = random_brat_file(text)
        # The first document is written without any annotation line.
        ann_content = "\n" if index == 0 else annotations

        with open(join(tmpdir, f"{index}.txt"), "w") as txt_file:
            txt_file.write(text)

        with open(join(tmpdir, f"{index}.ann"), "w") as ann_file:
            ann_file.write(ann_content)

    return tmpdir
58
59
60
# OLD BratConnector tests, deprecated
61
62
63
@pytest.fixture
def brat1(brat_folder) -> BratConnector:
    """Connector bound to the directory produced by ``brat_folder``."""
    connector = BratConnector(brat_folder)
    return connector
66
67
68
@pytest.fixture
def brat2(tmpdir) -> BratConnector:
    """Connector bound directly to the test's ``tmpdir``."""
    connector = BratConnector(tmpdir)
    return connector
71
72
73
@pytest.fixture
def brat_importer():
    """Connector over ``resources/brat_data``, two levels above this file.

    ``"bool flag 0"`` is passed as a boolean attribute.
    """
    tests_root = Path(__file__).parent.parent.resolve()
    brat_dir = tests_root / "resources" / "brat_data"
    return BratConnector(str(brat_dir), bool_attributes=["bool flag 0"])
77
78
79
@pytest.fixture
def brat_exporter(tmpdir):
    """Connector bound to ``tmpdir`` with three attributes configured."""
    exported_attributes = ["etat", "assertion", "bool flag 0"]
    return BratConnector(tmpdir, attributes=exported_attributes)
82
83
84
def test_empty_brat(brat2: BratConnector, blank_nlp: PipelineProtocol):
    """``brat2docs`` must raise ``AssertionError`` for the ``brat2`` connector.

    ``brat2`` is bound to the test's ``tmpdir``, which this test never
    populates.
    """
    expect_assertion_error = pytest.raises(AssertionError)
    with expect_assertion_error:
        brat2.brat2docs(blank_nlp)
87
88
89
def test_brat2brat(
    brat1: BratConnector, brat2: BratConnector, blank_nlp: PipelineProtocol
):
    """Round-trip BRAT files through docs and check they come back unchanged.

    The files under ``brat1.directory`` are read into docs, written back
    through ``brat2``, and every input file must exist under
    ``brat2.directory`` with identical content.
    """
    docs = brat1.brat2docs(blank_nlp)
    brat2.docs2brat(docs)

    files = listdir(brat1.directory)

    assert files

    # List the output directory once rather than once per input file.
    exported = set(listdir(brat2.directory))

    for file in files:
        assert file in exported
        # shallow=False forces a content comparison. The default shallow mode
        # accepts two files as equal as soon as their type, size and mtime
        # match, without reading them.
        assert filecmp.cmp(
            join(brat1.directory, file),
            join(brat2.directory, file),
            shallow=False,
        )
102
103
104
def test_docs2brat(nlp, brat2):
    """Smoke test: export two processed docs with ``docs2brat``.

    The same text is processed twice; the first doc takes its entities from
    the ``"pollutions"`` span group and the second from ``"section_titles"``.
    The docs get note ids 1 and 2. Nothing is asserted on the written files.
    """
    text = (
        "Le patient est admis pour des douleurs dans le bras droit, "
        "mais n'a pas de problème de locomotion. "
        "Historique d'AVC dans la famille. pourrait être un cas de rhume.\n"
        "NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbN"
        "BWbNbWbNbWBNbNbWbNbNBNbWbWbNbWBNbNbWbNBNbWbWbNb\n"
        "Pourrait être un cas de rhume.\n"
        "Motif :\n"
        "Douleurs dans le bras droit."
    )

    docs = []
    span_groups = ("pollutions", "section_titles")
    for note_id, group_name in enumerate(span_groups, start=1):
        doc = nlp(text)
        doc.ents = doc.spans[group_name]
        doc._.note_id = note_id
        docs.append(doc)

    brat2.docs2brat(docs)
127
128
129
def assert_doc_read(doc):
    """Check the note id, entities and span groups read into ``doc``.

    Each span is summarised as
    ``(start, end, text, (etat, assertion, bool flag 0))``. The sorted
    summaries of ``doc.ents`` (stored under ``"__ents__"``) and of every
    group in ``doc.spans`` are compared with the expected values.

    Raises
    ------
    AssertionError
        If the note id or any summary differs from what is expected.
    """
    assert doc._.note_id == "subfolder/doc-1"

    attrs = ("etat", "assertion", "bool flag 0")

    def summarize(spans):
        # Sorted so the comparison does not depend on iteration order.
        return sorted(
            (
                span.start,
                span.end,
                span.text,
                tuple(getattr(span._, key) for key in attrs),
            )
            for span in spans
        )

    observed = {"__ents__": summarize(doc.ents)}
    for name in doc.spans:
        observed[name] = summarize(doc.spans[name])

    no_attrs = (None, None, False)
    rhume_attrs = ("présent", "hypothétique", False)
    expected = {
        "__ents__": [
            (6, 7, "douleurs", no_attrs),
            (7, 11, "dans le bras droit", no_attrs),
            (17, 21, "problème \nde locomotion", (None, "absent", True)),
            (25, 26, "AVC", ("passé", "non-associé", False)),
            (35, 36, "rhume", rhume_attrs),
            (45, 46, "rhume", rhume_attrs),
            (51, 52, "Douleurs", no_attrs),
            (52, 56, "dans le bras droit", no_attrs),
            (68, 69, "anomalie", (None, "absent", False)),
        ],
        "anatomie": [
            (9, 11, "bras droit", no_attrs),
            (54, 56, "bras droit", no_attrs),
        ],
        "localisation": [
            (7, 11, "dans le bras droit", no_attrs),
            (52, 56, "dans le bras droit", no_attrs),
        ],
        "pathologie": [
            (17, 21, "problème \nde locomotion", (None, "absent", True)),
            (25, 26, "AVC", ("passé", "non-associé", False)),
            (35, 36, "rhume", rhume_attrs),
            (45, 46, "rhume", rhume_attrs),
        ],
        "sosy": [
            (6, 7, "douleurs", no_attrs),
            (51, 52, "Douleurs", no_attrs),
            (68, 69, "anomalie", (None, "absent", False)),
        ],
        "test label 0": [(68, 69, "anomalie", (None, "absent", False))],
    }

    assert observed == expected
184
185
186
def assert_doc_write(exported_ann_text):
    """Check that an exported ``.ann`` file has the expected BRAT content.

    Parameters
    ----------
    exported_ann_text : str
        Full text of the exported annotation file.

    Raises
    ------
    AssertionError
        If the text differs from the expected annotations.
    """
    # BRAT standoff separates the annotation id, the "type + offsets" (or
    # "attribute + target") field and the covered text with TAB characters.
    # They are spelled as explicit "\t" escapes so they cannot be silently
    # turned into spaces by an editor or a copy/paste.
    assert exported_ann_text == (
        "T1\tsosy 30 38\tdouleurs\n"
        "A1\tetat T1 test\n"
        "T2\tlocalisation 39 57\tdans le bras droit\n"
        "T3\tanatomie 47 57\tbras droit\n"
        "T4\tpathologie 75 83;85 98\tproblème de locomotion\n"
        "A2\tassertion T4 absent\n"
        "A3\tbool flag 0 T4\n"
        "T5\tpathologie 114 117\tAVC\n"
        "A4\tetat T5 passé\n"
        "A5\tassertion T5 non-associé\n"
        "T6\tpathologie 159 164\trhume\n"
        "A6\tetat T6 présent\n"
        "A7\tassertion T6 hypothétique\n"
        "T7\tpathologie 291 296\trhume\n"
        "A8\tetat T7 présent\n"
        "A9\tassertion T7 hypothétique\n"
        "T8\tsosy 306 314\tDouleurs\n"
        "T9\tlocalisation 315 333\tdans le bras droit\n"
        "T10\tanatomie 323 333\tbras droit\n"
        "T11\tsosy 378 386\tanomalie\n"
        "A10\tassertion T11 absent\n"
        "T12\ttest label 0 378 386\tanomalie\n"
        "A11\tassertion T12 absent\n"
    )
212
213
214
def test_brat(
    brat_importer: BratConnector,
    brat_exporter: BratConnector,
    blank_nlp: PipelineProtocol,
):
    """Import the reference BRAT data, edit one attribute and re-export it.

    The first imported doc is validated with ``assert_doc_read``. Its first
    entity's ``etat`` is then set to ``"test"``, the doc is exported, and the
    written ``subfolder/doc-1.ann`` is validated with ``assert_doc_write``.
    """
    imported_docs = brat_importer.brat2docs(blank_nlp)
    doc = imported_docs[0]
    assert_doc_read(doc)

    first_entity = doc.ents[0]
    first_entity._.etat = "test"

    brat_exporter.docs2brat([doc])

    ann_path = brat_exporter.directory / "subfolder" / "doc-1.ann"
    with open(ann_path) as ann_file:
        exported_ann_text = ann_file.read()

    assert_doc_write(exported_ann_text)
228
229
230
# New `edsnlp.data.read_standoff` and `edsnlp.data.write_standoff` tests
231
232
233
def test_read_to_standoff(blank_nlp, tmpdir):
    """Read the reference data with ``read_standoff`` and write it back.

    The first doc read is validated with ``assert_doc_read``. Its first
    entity then gets ``etat="test"`` and a ``cui`` value, the doc is written
    with ``write_standoff``, and the resulting ``subfolder/doc-1.ann`` is
    validated with ``assert_doc_write``.
    """
    tests_root = Path(__file__).parent.parent.resolve()
    input_dir = tests_root / "resources" / "brat_data"
    output_dir = Path(tmpdir)

    stream = edsnlp.data.read_standoff(
        input_dir,
        bool_attributes=["bool flag 0"],
        notes_as_span_attribute="cui",
    )
    # The whole stream is materialised before the first doc is picked.
    all_docs = list(stream)
    doc = all_docs[0]
    assert_doc_read(doc)
    doc.ents[0]._.etat = "test"
    doc.ents[0]._.cui = "C0030193"

    exported_attributes = ["etat", "assertion", "bool flag 0"]
    exported_groups = [
        "ents",
        "sosy",
        "localisation",
        "anatomie",
        "pathologie",
        "test label 0",
    ]
    edsnlp.data.write_standoff(
        [doc],
        output_dir,
        span_attributes=exported_attributes,
        span_getter=exported_groups,
    )

    ann_path = output_dir / "subfolder" / "doc-1.ann"
    exported_ann_text = ann_path.read_text()

    assert_doc_write(exported_ann_text)
265
266
267
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
def test_read_shuffle_loop(num_cpu_workers: int):
    """Check the note-id order of a looping, shuffled standoff stream.

    The reference data is read with ``shuffle="dataset"``, ``seed=42`` and
    ``loop=True``, mapped to note ids, and the first six ids are compared
    with the expected order, once with 0 and once with 2 CPU workers.
    """
    data_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"

    stream = edsnlp.data.read_standoff(
        data_dir,
        shuffle="dataset",
        keep_txt_only_docs=True,
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda x: x._.note_id)
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # The stream loops, so only a bounded prefix is taken.
    first_note_ids = list(islice(stream, 6))

    expected_note_ids = [
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
    ]
    assert first_note_ids == expected_note_ids