# tests/data/test_json.py
1
import json
2
from itertools import islice
3
from pathlib import Path
4
5
import pytest
6
from typing_extensions import Literal
7
8
import edsnlp
9
10
11
def assert_doc_read(doc):
    """Assert that ``doc`` carries the expected metadata and annotated spans.

    Checks the ``note_id`` and ``context_var`` extensions of the document,
    then compares every entity in ``doc.ents`` and every span group in
    ``doc.spans`` against hard-coded expectations.

    Parameters
    ----------
    doc
        Object exposing ``_.note_id``, ``_.context_var``, ``ents`` and
        ``spans``; each span must expose ``start``, ``end``, ``text`` and
        the ``_.etat`` / ``_.assertion`` extensions.

    Raises
    ------
    AssertionError
        If any of the compared values differs from the expectation.
    """
    assert doc._.note_id == "subfolder/doc-1"
    assert doc._.context_var == "test"

    attrs = ("etat", "assertion")

    def describe(spans):
        # Sorted so that the comparison does not depend on span ordering.
        return sorted(
            (s.start, s.end, s.text, tuple(getattr(s._, key) for key in attrs))
            for s in spans
        )

    spans_and_attributes = {
        # Entities are stored under a dedicated key next to the span groups.
        "__ents__": describe(doc.ents),
        **{name: describe(doc.spans[name]) for name in doc.spans},
    }

    assert spans_and_attributes == {
        "__ents__": [
            (6, 7, "douleurs", (None, None)),
            (7, 11, "dans le bras droit", (None, None)),
            (17, 21, "problème \nde locomotion", (None, "absent")),
            (25, 26, "AVC", ("passé", "non-associé")),
            (35, 36, "rhume", ("présent", "hypothétique")),
            (45, 46, "rhume", ("présent", "hypothétique")),
            (51, 52, "Douleurs", (None, None)),
            (52, 56, "dans le bras droit", (None, None)),
            (68, 69, "anomalie", (None, "absent")),
        ],
        "anatomie": [
            (9, 11, "bras droit", (None, None)),
            (54, 56, "bras droit", (None, None)),
        ],
        "localisation": [
            (7, 11, "dans le bras droit", (None, None)),
            (52, 56, "dans le bras droit", (None, None)),
        ],
        "pathologie": [
            (17, 21, "problème \nde locomotion", (None, "absent")),
            (25, 26, "AVC", ("passé", "non-associé")),
            (35, 36, "rhume", ("présent", "hypothétique")),
            (45, 46, "rhume", ("présent", "hypothétique")),
        ],
        "sosy": [
            (6, 7, "douleurs", (None, None)),
            (51, 52, "Douleurs", (None, None)),
            (68, 69, "anomalie", (None, "absent")),
        ],
    }
66
67
68
def assert_doc_write(exported_obj):
    """Assert that ``exported_obj`` equals the expected exported record.

    Parameters
    ----------
    exported_obj
        Decoded JSON object to compare. It must contain the ``entities``
        list (one dict per span, with character offsets and the ``etat`` /
        ``assertion`` attributes), plus ``note_id``, ``context_var`` and
        ``note_text``.

    Raises
    ------
    AssertionError
        If ``exported_obj`` differs from the expected record.
    """
    assert exported_obj == {
        "entities": [
            {
                "assertion": None,
                "end_char": 38,
                # The only entity whose ``etat`` is expected to be "test".
                "etat": "test",
                "lexical_variant": "douleurs",
                "note_nlp_id": 0,
                "note_nlp_source_value": "sosy",
                "start_char": 30,
            },
            {
                "assertion": None,
                "end_char": 57,
                "etat": None,
                "lexical_variant": "dans le bras droit",
                "note_nlp_id": 1,
                "note_nlp_source_value": "localisation",
                "start_char": 39,
            },
            {
                "assertion": None,
                "end_char": 57,
                "etat": None,
                "lexical_variant": "bras droit",
                "note_nlp_id": 2,
                "note_nlp_source_value": "anatomie",
                "start_char": 47,
            },
            {
                "assertion": "absent",
                "end_char": 98,
                "etat": None,
                "lexical_variant": "problème \nde locomotion",
                "note_nlp_id": 3,
                "note_nlp_source_value": "pathologie",
                "start_char": 75,
            },
            {
                "assertion": "non-associé",
                "end_char": 117,
                "etat": "passé",
                "lexical_variant": "AVC",
                "note_nlp_id": 4,
                "note_nlp_source_value": "pathologie",
                "start_char": 114,
            },
            {
                "assertion": "hypothétique",
                "end_char": 164,
                "etat": "présent",
                "lexical_variant": "rhume",
                "note_nlp_id": 5,
                "note_nlp_source_value": "pathologie",
                "start_char": 159,
            },
            {
                "assertion": "hypothétique",
                "end_char": 296,
                "etat": "présent",
                "lexical_variant": "rhume",
                "note_nlp_id": 6,
                "note_nlp_source_value": "pathologie",
                "start_char": 291,
            },
            {
                "assertion": None,
                "end_char": 314,
                "etat": None,
                "lexical_variant": "Douleurs",
                "note_nlp_id": 7,
                "note_nlp_source_value": "sosy",
                "start_char": 306,
            },
            {
                "assertion": None,
                "end_char": 333,
                "etat": None,
                "lexical_variant": "dans le bras droit",
                "note_nlp_id": 8,
                "note_nlp_source_value": "localisation",
                "start_char": 315,
            },
            {
                "assertion": None,
                "end_char": 333,
                "etat": None,
                "lexical_variant": "bras droit",
                "note_nlp_id": 9,
                "note_nlp_source_value": "anatomie",
                "start_char": 323,
            },
            {
                "assertion": "absent",
                "end_char": 386,
                "etat": None,
                "lexical_variant": "anomalie",
                "note_nlp_id": 10,
                "note_nlp_source_value": "sosy",
                "start_char": 378,
            },
        ],
        "note_id": "subfolder/doc-1",
        "context_var": "test",
        "note_text": "Le patient est admis pour des douleurs dans le bras droit, mais "
        "n'a pas de problème \n"
        "de locomotion. \n"
        "Historique d'AVC dans la famille. pourrait être un cas de "
        "rhume.\n"
        "NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbNBWbNbWbNbWBNbNbWbNbNBNbWb"
        "WbNbWBNbNbWbNBNbWbWbNb\n"
        "Pourrait être un cas de rhume.\n"
        "Motif :\n"
        "Douleurs dans le bras droit.\n"
        "ANTÉCÉDENTS\n"
        "Le patient est déjà venu\n"
        "Pas d'anomalie détectée.\n",
    }
187
188
189
def test_read_in_worker(blank_nlp, tmpdir):
    """Smoke test: read ``docs.jsonl`` with ``read_in_worker=True``.

    No assertion is made on the content; the test only fails if reading
    raises or if the stream yields no item (``[0]`` would raise
    ``IndexError``).
    """
    input_dir = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"
    list(
        edsnlp.data.read_json(
            input_dir,
            converter="omop",
            span_attributes=["etat", "assertion"],
            read_in_worker=True,
        )
    )[0]
199
200
201
def test_read_to_json(blank_nlp, tmpdir):
    """Round-trip ``docs.jsonl`` through ``read_json`` and ``write_json``.

    Reads the first document, checks it with ``assert_doc_read``, then
    writes it back both as a single JSON-lines file and as one JSON file
    per document, checking each export with ``assert_doc_write``. Also
    checks that writing onto an existing target raises ``FileExistsError``
    unless ``overwrite=True`` is passed.
    """
    input_dir = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"
    output_dir = Path(tmpdir)
    doc = list(
        edsnlp.data.read_json(
            input_dir,
            converter="omop",
            span_attributes=["etat", "assertion"],
            doc_attributes=["context_var"],
        )
    )[0]
    assert_doc_read(doc)
    # Mutate one attribute so the export is checked against the in-memory
    # document rather than a verbatim copy of the input file.
    doc.ents[0]._.etat = "test"

    # Keyword arguments shared by every ``write_json`` call below.
    write_kwargs = {
        "converter": "omop",
        "doc_attributes": ["context_var"],
        "span_attributes": ["etat", "assertion"],
        "span_getter": ["ents", "sosy", "localisation", "anatomie", "pathologie"],
    }

    edsnlp.data.write_json(
        [doc],
        output_dir / "docs.jsonl",
        lines=True,
        **write_kwargs,
    )

    # The file now exists: writing to it again without ``overwrite`` must fail.
    with pytest.raises(FileExistsError):
        edsnlp.data.write_json(
            [doc],
            output_dir / "docs.jsonl",
            **write_kwargs,
        )

    with open(output_dir / "docs.jsonl") as f:
        # Only the first line (the first exported document) is checked.
        exported_obj = json.loads(f.readline())
    assert_doc_write(exported_obj)

    # The output directory already contains ``docs.jsonl``.
    with pytest.raises(FileExistsError):
        edsnlp.data.write_json(
            [doc],
            output_dir,
            lines=False,
            **write_kwargs,
        )

    edsnlp.data.write_json(
        [doc],
        output_dir,
        lines=False,
        overwrite=True,
        **write_kwargs,
    )

    with open(output_dir / "subfolder" / "doc-1.json") as f:
        exported_obj = json.load(f)
    assert_doc_write(exported_obj)
264
265
266
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
@pytest.mark.parametrize("shuffle", ["dataset"])
def test_read_shuffle_loop(
    num_cpu_workers: int,
    shuffle: Literal["dataset", "fragment"],
):
    """Check the note order of a shuffled, looping read of ``docs.jsonl``.

    With ``seed=42`` and ``loop=True``, the first six note ids must match
    the hard-coded sequence for every parametrized ``num_cpu_workers``.
    """
    input_dir = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"
    notes = (
        edsnlp.data.read_json(
            input_dir,
            shuffle=shuffle,
            seed=42,
            loop=True,
        )
        .map(lambda x: x["note_id"])
        .set_processing(num_cpu_workers=num_cpu_workers)
    )
    # ``islice`` bounds the consumption: only six items are taken from the
    # ``loop=True`` stream.
    notes = list(islice(notes, 6))
    assert notes == [
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
    ]