|
a |
|
b/tests/data/test_converters.py |
|
|
1 |
import pytest |
|
|
2 |
from spacy.tokens import Span |
|
|
3 |
|
|
|
4 |
import edsnlp.data |
|
|
5 |
from edsnlp.data.converters import ( |
|
|
6 |
FILENAME, |
|
|
7 |
get_dict2doc_converter, |
|
|
8 |
get_doc2dict_converter, |
|
|
9 |
) |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
@pytest.fixture(autouse=True, scope="module")
def set_extensions():
    """Register the ``negation`` Span extension once for every test in this module."""
    if Span.has_extension("negation"):
        return
    Span.set_extension("negation", default=None)
|
|
16 |
|
|
|
17 |
|
|
|
18 |
def test_read_omop_dict(blank_nlp):
    """The "omop" dict2doc converter builds a doc (text, note_id, ents, attrs)."""
    omop_record = {
        "note_id": 1234,
        "note_text": "This is a test.",
        "entities": [
            {
                "note_nlp_id": 0,
                "start_char": 0,
                "end_char": 4,
                "lexical_variant": "This",
                "note_nlp_source_value": "test",
                "negation": True,
            },
            {
                "note_nlp_id": 1,
                "start_char": 5,
                "end_char": 7,
                "lexical_variant": "is",
                "note_nlp_source_value": "test",
            },
        ],
    }
    reader = get_dict2doc_converter(
        "omop",
        dict(
            nlp=blank_nlp,
            span_attributes="negation",
            bool_attributes="negation",
        ),
    )[0]
    doc = reader(omop_record)
    assert doc.text == "This is a test."
    assert doc._.note_id == 1234
    assert len(doc.ents) == 2
    assert doc.ents[0].text == "This"
    assert doc.ents[0]._.negation is True
    # "negation" is declared as a bool attribute, so a missing value becomes False.
    assert doc.ents[1]._.negation is False
|
|
54 |
|
|
|
55 |
|
|
|
56 |
def test_read_standoff_dict(blank_nlp):
    """The "standoff" dict2doc converter builds a doc from a BRAT-like dict."""
    standoff_record = {
        "doc_id": 1234,
        "text": "This is a test.",
        "entities": [
            {
                "entity_id": 0,
                "fragments": [{"begin": 0, "end": 4}],
                "attributes": {"negation": True},
                "label": "test",
            },
            {
                "entity_id": 1,
                "fragments": [{"begin": 5, "end": 7}],
                "attributes": {},
                "label": "test",
            },
        ],
    }
    reader = get_dict2doc_converter(
        "standoff",
        dict(
            nlp=blank_nlp,
            span_attributes={"negation": "negation"},
            bool_attributes="negation",
        ),
    )[0]
    doc = reader(standoff_record)
    assert doc.text == "This is a test."
    assert doc._.note_id == 1234
    assert len(doc.ents) == 2
    assert doc.ents[0].text == "This"
    assert doc.ents[0]._.negation is True
    # "negation" is declared as a bool attribute, so a missing value becomes False.
    assert doc.ents[1]._.negation is False
|
|
101 |
|
|
|
102 |
|
|
|
103 |
def test_write_omop_dict(blank_nlp):
    """The "omop" doc2dict converter serializes a doc and its entity attributes."""
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    expected = {
        FILENAME: 1234,
        "note_id": 1234,
        "note_text": "This is a test.",
        "entities": [
            {
                "note_nlp_id": 0,
                "start_char": 0,
                "end_char": 4,
                "lexical_variant": "This",
                "note_nlp_source_value": "test",
                "sent.text": "This is a test.",
                "negation": True,
            },
            {
                "note_nlp_id": 1,
                "start_char": 5,
                "end_char": 7,
                "lexical_variant": "is",
                "note_nlp_source_value": "test",
                "sent.text": "This is a test.",
                "negation": False,
            },
        ],
    }
    writer = get_doc2dict_converter(
        "omop",
        dict(
            span_getter={"ents": True},
            # dotted attribute paths ("sent.text") are resolved on the span
            span_attributes=["negation", "sent.text"],
        ),
    )[0]
    assert writer(doc) == expected
|
|
144 |
|
|
|
145 |
|
|
|
146 |
def test_write_standoff_dict(blank_nlp):
    """The "standoff" doc2dict converter emits a BRAT-like dict with attributes.

    The ``negation`` extension is guaranteed by the module-level autouse
    ``set_extensions`` fixture, so it is not re-registered here (the original
    duplicate registration was redundant and inconsistent with the other tests).
    """
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    expected = {
        FILENAME: 1234,
        "doc_id": 1234,
        "text": "This is a test.",
        "entities": [
            {
                "entity_id": 0,
                "fragments": [{"begin": 0, "end": 4}],
                "attributes": {"negation": True},
                "label": "test",
            },
            {
                "entity_id": 1,
                "fragments": [{"begin": 5, "end": 7}],
                "attributes": {"negation": False},
                "label": "test",
            },
        ],
    }
    writer = get_doc2dict_converter(
        "standoff",
        dict(
            span_getter={"ents": True},
            span_attributes={"negation": "negation"},
        ),
    )[0]
    assert writer(doc) == expected
|
|
197 |
|
|
|
198 |
|
|
|
199 |
def test_write_ents_dict(blank_nlp):
    """The "ents" doc2dict converter emits one flat dict per entity."""
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    expected_rows = [
        {
            "note_id": 1234,
            "start": 0,
            "end": 4,
            "lexical_variant": "This",
            "label": "test",
            "span_type": "ents",
            "sent.text": "This is a test.",
            "negation": True,
        },
        {
            "note_id": 1234,
            "start": 5,
            "end": 7,
            "lexical_variant": "is",
            "label": "test",
            "span_type": "ents",
            "sent.text": "This is a test.",
            "negation": False,
        },
    ]
    writer = get_doc2dict_converter(
        "ents",
        dict(
            span_getter={"ents": True},
            span_attributes=["negation", "sent.text"],
        ),
    )[0]
    assert writer(doc) == expected_rows
|
|
237 |
|
|
|
238 |
|
|
|
239 |
def test_unknown_converter():
    """An unregistered converter name raises ValueError in both directions."""
    for getter in (get_dict2doc_converter, get_doc2dict_converter):
        with pytest.raises(ValueError):
            getter("test", {})
|
|
245 |
|
|
|
246 |
|
|
|
247 |
def test_callable_converter():
    """A plain callable is passed through untouched, alongside its kwargs."""

    def identity(x):
        return x

    assert get_dict2doc_converter(identity, {}) == (identity, {})
    assert get_doc2dict_converter(identity, {}) == (identity, {})
|
|
251 |
|
|
|
252 |
|
|
|
253 |
def test_method_converter(blank_nlp):
    """A bound method (``nlp.make_doc``) is accepted as a converter."""
    data = ["Ceci", "est", "un", "test"]
    stream = edsnlp.data.from_iterable(data, converter=blank_nlp.make_doc)
    texts = list(stream.map(lambda doc: doc.text))
    assert texts == data
|
|
261 |
|
|
|
262 |
|
|
|
263 |
def test_converter_types(blank_nlp):
    """Accepted converter kinds: a bound method, a class, and a lambda with defaults.

    Bug fix: the original loop never used its ``converter`` variable and always
    passed ``blank_nlp.make_doc``, so the ``Text`` class and the lambda were
    never actually exercised.
    """

    class Text:
        def __init__(self, text):
            self.text = text

    data = ["Ceci", "est", "un", "test"]  # loop-invariant, hoisted out of the loop
    for converter in (blank_nlp.make_doc, Text, lambda x, k=2: Text(x)):
        texts = list(
            edsnlp.data.from_iterable(data, converter=converter).map(
                lambda x: x.text
            )
        )
        assert texts == data