|
a |
|
b/tests/data/test_json.py |
|
|
1 |
import json |
|
|
2 |
from itertools import islice |
|
|
3 |
from pathlib import Path |
|
|
4 |
|
|
|
5 |
import pytest |
|
|
6 |
from typing_extensions import Literal |
|
|
7 |
|
|
|
8 |
import edsnlp |
|
|
9 |
|
|
|
10 |
|
|
|
11 |
def assert_doc_read(doc):
    """Assert that ``doc`` matches the reference document "subfolder/doc-1".

    Checks the ``note_id`` and ``context_var`` extensions, then compares every
    entity and every span group (start, end, text and the ``etat`` /
    ``assertion`` extensions) against hard-coded expected values.

    Raises:
        AssertionError: if any identifier, span or qualifier differs.
    """
    assert doc._.note_id == "subfolder/doc-1"
    assert doc._.context_var == "test"

    qualifier_names = ("etat", "assertion")

    def summarize(spans):
        # One (start, end, text, qualifiers) row per span, sorted so the
        # comparison does not depend on the iteration order of the container.
        rows = []
        for span in spans:
            qualifiers = tuple(getattr(span._, name) for name in qualifier_names)
            rows.append((span.start, span.end, span.text, qualifiers))
        rows.sort()
        return rows

    observed = {"__ents__": summarize(doc.ents)}
    for group_name in doc.spans:
        observed[group_name] = summarize(doc.spans[group_name])

    # Qualifier pairs, in (etat, assertion) order.
    unqualified = (None, None)
    negated = (None, "absent")
    family_history = ("passé", "non-associé")
    hypothesis = ("présent", "hypothétique")

    expected = {
        "__ents__": [
            (6, 7, "douleurs", unqualified),
            (7, 11, "dans le bras droit", unqualified),
            (17, 21, "problème \nde locomotion", negated),
            (25, 26, "AVC", family_history),
            (35, 36, "rhume", hypothesis),
            (45, 46, "rhume", hypothesis),
            (51, 52, "Douleurs", unqualified),
            (52, 56, "dans le bras droit", unqualified),
            (68, 69, "anomalie", negated),
        ],
        "anatomie": [
            (9, 11, "bras droit", unqualified),
            (54, 56, "bras droit", unqualified),
        ],
        "localisation": [
            (7, 11, "dans le bras droit", unqualified),
            (52, 56, "dans le bras droit", unqualified),
        ],
        "pathologie": [
            (17, 21, "problème \nde locomotion", negated),
            (25, 26, "AVC", family_history),
            (35, 36, "rhume", hypothesis),
            (45, 46, "rhume", hypothesis),
        ],
        "sosy": [
            (6, 7, "douleurs", unqualified),
            (51, 52, "Douleurs", unqualified),
            (68, 69, "anomalie", negated),
        ],
    }

    assert observed == expected
|
|
66 |
|
|
|
67 |
|
|
|
68 |
def assert_doc_write(exported_obj):
    """Assert that ``exported_obj`` equals the expected export of "subfolder/doc-1".

    The expected object has an ``entities`` list (one dict per exported span,
    numbered consecutively through ``note_nlp_id``), the ``note_id``, the
    ``context_var`` attribute and the full ``note_text``.

    Raises:
        AssertionError: if ``exported_obj`` differs from the expected dict.
    """
    # (start_char, end_char, lexical_variant, note_nlp_source_value, etat, assertion)
    entity_rows = [
        # The first entity carries etat == "test".
        (30, 38, "douleurs", "sosy", "test", None),
        (39, 57, "dans le bras droit", "localisation", None, None),
        (47, 57, "bras droit", "anatomie", None, None),
        (75, 98, "problème \nde locomotion", "pathologie", None, "absent"),
        (114, 117, "AVC", "pathologie", "passé", "non-associé"),
        (159, 164, "rhume", "pathologie", "présent", "hypothétique"),
        (291, 296, "rhume", "pathologie", "présent", "hypothétique"),
        (306, 314, "Douleurs", "sosy", None, None),
        (315, 333, "dans le bras droit", "localisation", None, None),
        (323, 333, "bras droit", "anatomie", None, None),
        (378, 386, "anomalie", "sosy", None, "absent"),
    ]

    expected_entities = []
    for position, row in enumerate(entity_rows):
        start_char, end_char, variant, source_value, etat, assertion = row
        expected_entities.append(
            {
                "assertion": assertion,
                "end_char": end_char,
                "etat": etat,
                "lexical_variant": variant,
                # Ids follow the position of the entity in the list.
                "note_nlp_id": position,
                "note_nlp_source_value": source_value,
                "start_char": start_char,
            }
        )

    text_pieces = (
        "Le patient est admis pour des douleurs dans le bras droit, mais ",
        "n'a pas de problème \n",
        "de locomotion. \n",
        "Historique d'AVC dans la famille. pourrait être un cas de ",
        "rhume.\n",
        "NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbNBWbNbWbNbWBNbNbWbNbNBNbWb",
        "WbNbWBNbNbWbNBNbWbWbNb\n",
        "Pourrait être un cas de rhume.\n",
        "Motif :\n",
        "Douleurs dans le bras droit.\n",
        "ANTÉCÉDENTS\n",
        "Le patient est déjà venu\n",
        "Pas d'anomalie détectée.\n",
    )

    expected = {
        "entities": expected_entities,
        "note_id": "subfolder/doc-1",
        "context_var": "test",
        "note_text": "".join(text_pieces),
    }

    assert exported_obj == expected
|
|
187 |
|
|
|
188 |
|
|
|
189 |
def test_read_in_worker(blank_nlp, tmpdir):
    """Read the reference JSONL file with ``read_in_worker=True``.

    The test only requires that reading succeeds and yields at least one
    document; the content of the document is not inspected here.
    """
    jsonl_path = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"
    stream = edsnlp.data.read_json(
        jsonl_path,
        converter="omop",
        span_attributes=["etat", "assertion"],
        read_in_worker=True,
    )
    docs = list(stream)
    # Indexing fails with IndexError if nothing was read.
    docs[0]
|
|
199 |
|
|
|
200 |
|
|
|
201 |
def test_read_to_json(blank_nlp, tmpdir):
    """Round-trip the reference document through ``read_json`` / ``write_json``.

    Steps:
      1. read the first document of ``resources/docs.jsonl`` and validate it;
      2. export it to ``docs.jsonl`` in the temporary directory;
      3. check that exporting again to existing targets raises
         ``FileExistsError`` unless ``overwrite=True`` is passed;
      4. validate both the JSONL export and the ``lines=False`` export read
         back from ``subfolder/doc-1.json``.
    """
    source_path = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"
    target_root = Path(tmpdir)
    jsonl_target = target_root / "docs.jsonl"

    reader = edsnlp.data.read_json(
        source_path,
        converter="omop",
        span_attributes=["etat", "assertion"],
        doc_attributes=["context_var"],
    )
    doc = list(reader)[0]
    assert_doc_read(doc)
    # Modify one qualifier so the export check can tell it was written out.
    doc.ents[0]._.etat = "test"

    def export(destination, **options):
        # Every export shares the same converter settings; the argument lists
        # are rebuilt on each call, as in fully spelled-out calls.
        edsnlp.data.write_json(
            [doc],
            destination,
            converter="omop",
            doc_attributes=["context_var"],
            span_attributes=["etat", "assertion"],
            span_getter=["ents", "sosy", "localisation", "anatomie", "pathologie"],
            **options,
        )

    # First export: a single JSONL file.
    export(jsonl_target, lines=True)

    # Writing to the same file again, without `overwrite`, must be refused.
    with pytest.raises(FileExistsError):
        export(jsonl_target)

    with open(jsonl_target) as f:
        first_line = f.readlines()[0]
        exported_obj = json.loads(first_line)
    assert_doc_write(exported_obj)

    # Exporting with lines=False into the already-populated directory is
    # refused as well...
    with pytest.raises(FileExistsError):
        export(target_root, lines=False)

    # ...unless overwriting is explicitly requested.
    export(target_root, lines=False, overwrite=True)

    with open(target_root / "subfolder" / "doc-1.json") as f:
        raw_json = f.read()
        exported_obj = json.loads(raw_json)
    assert_doc_write(exported_obj)
|
|
264 |
|
|
|
265 |
|
|
|
266 |
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
@pytest.mark.parametrize("shuffle", ["dataset"])
def test_read_shuffle_loop(
    num_cpu_workers: int,
    shuffle: Literal["dataset", "fragment"],
):
    """Check the order of note ids read with shuffling, a fixed seed and looping.

    The reference JSONL file is read with ``seed=42`` and ``loop=True``, mapped
    to its ``note_id`` field, and the first six ids are compared with a fixed
    expected sequence, for each parametrized ``num_cpu_workers`` value.
    """
    source_path = Path(__file__).parent.parent.resolve() / "resources" / "docs.jsonl"

    stream = edsnlp.data.read_json(
        source_path,
        shuffle=shuffle,
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda record: record["note_id"])
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # Only the first six items are taken from the stream.
    first_ids = list(islice(stream, 6))

    expected_ids = [
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
    ]
    assert first_ids == expected_ids