tests/data/test_converters.py
import pytest
from spacy.tokens import Span

import edsnlp.data
from edsnlp.data.converters import (
    FILENAME,
    get_dict2doc_converter,
    get_doc2dict_converter,
)
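

# Module-scoped, autouse fixture: make sure the Span "negation" extension exists
# before any converter test reads or writes it.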
@pytest.fixture(autouse=True, scope="module")
def set_extensions():
    if not Span.has_extension("negation"):
        Span.set_extension("negation", default=None)
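

# Reading an OMOP-style dict: "note_text" becomes the Doc text, "note_id" lands on
# doc._.note_id, and each entity becomes a Span carrying the requested attributes.
# The second entity has no "negation" key, so bool_attributes should default it to False.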
def test_read_omop_dict(blank_nlp):
    json = {
        "note_id": 1234,
        "note_text": "This is a test.",
        "entities": [
            {
                "note_nlp_id": 0,
                "start_char": 0,
                "end_char": 4,
                "lexical_variant": "This",
                "note_nlp_source_value": "test",
                "negation": True,
            },
            {
                "note_nlp_id": 1,
                "start_char": 5,
                "end_char": 7,
                "lexical_variant": "is",
                "note_nlp_source_value": "test",
            },
        ],
    }
    doc = get_dict2doc_converter(
        "omop",
        dict(
            nlp=blank_nlp,
            span_attributes="negation",
            bool_attributes="negation",
        ),
    )[0](json)
    assert doc.text == "This is a test."
    assert doc._.note_id == 1234
    assert len(doc.ents) == 2
    assert doc.ents[0].text == "This"
    assert doc.ents[0]._.negation is True
    assert doc.ents[1]._.negation is False
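

# Same checks for the standoff (BRAT-like) format: "fragments" carry the character
# offsets and "attributes" carries the span attributes to map onto the Doc.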
def test_read_standoff_dict(blank_nlp):
    json = {
        "doc_id": 1234,
        "text": "This is a test.",
        "entities": [
            {
                "entity_id": 0,
                "fragments": [
                    {
                        "begin": 0,
                        "end": 4,
                    }
                ],
                "attributes": {
                    "negation": True,
                },
                "label": "test",
            },
            {
                "entity_id": 1,
                "fragments": [
                    {
                        "begin": 5,
                        "end": 7,
                    }
                ],
                "attributes": {},
                "label": "test",
            },
        ],
    }
    doc = get_dict2doc_converter(
        "standoff",
        dict(
            nlp=blank_nlp,
            span_attributes={"negation": "negation"},
            bool_attributes="negation",
        ),
    )[0](json)
    assert doc.text == "This is a test."
    assert doc._.note_id == 1234
    assert len(doc.ents) == 2
    assert doc.ents[0].text == "This"
    assert doc.ents[0]._.negation is True
    assert doc.ents[1]._.negation is False
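

# Writing back to an OMOP-style dict: the converter is expected to emit one record
# per entity, flatten dotted attribute paths such as "sent.text", and include the
# FILENAME key alongside "note_id".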
def test_write_omop_dict(blank_nlp):
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    json = {
        FILENAME: 1234,
        "note_id": 1234,
        "note_text": "This is a test.",
        "entities": [
            {
                "note_nlp_id": 0,
                "start_char": 0,
                "end_char": 4,
                "lexical_variant": "This",
                "note_nlp_source_value": "test",
                "sent.text": "This is a test.",
                "negation": True,
            },
            {
                "note_nlp_id": 1,
                "start_char": 5,
                "end_char": 7,
                "lexical_variant": "is",
                "note_nlp_source_value": "test",
                "sent.text": "This is a test.",
                "negation": False,
            },
        ],
    }
    assert (
        get_doc2dict_converter(
            "omop",
            dict(
                span_getter={"ents": True},
                span_attributes=["negation", "sent.text"],
            ),
        )[0](doc)
        == json
    )
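

# Writing back to a standoff dict: entities are serialized with fragments
# (begin/end offsets), a label, and the selected attributes.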
def test_write_standoff_dict(blank_nlp):
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    if not Span.has_extension("negation"):
        Span.set_extension("negation", default=None)
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    json = {
        FILENAME: 1234,
        "doc_id": 1234,
        "text": "This is a test.",
        "entities": [
            {
                "entity_id": 0,
                "fragments": [
                    {
                        "begin": 0,
                        "end": 4,
                    }
                ],
                "attributes": {
                    "negation": True,
                },
                "label": "test",
            },
            {
                "entity_id": 1,
                "fragments": [
                    {
                        "begin": 5,
                        "end": 7,
                    }
                ],
                "attributes": {
                    "negation": False,
                },
                "label": "test",
            },
        ],
    }
    assert (
        get_doc2dict_converter(
            "standoff",
            dict(
                span_getter={"ents": True},
                span_attributes={"negation": "negation"},
            ),
        )[0](doc)
        == json
    )
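

# The "ents" converter flattens each span into its own record (one dict per entity),
# rather than nesting all entities under a single document dict.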
def test_write_ents_dict(blank_nlp):
    doc = blank_nlp("This is a test.")
    doc._.note_id = 1234
    doc.ents = [Span(doc, 0, 1, label="test"), Span(doc, 1, 2, label="test")]
    doc.ents[0]._.negation = True
    doc.ents[1]._.negation = False
    jsons = [
        {
            "note_id": 1234,
            "start": 0,
            "end": 4,
            "lexical_variant": "This",
            "label": "test",
            "span_type": "ents",
            "sent.text": "This is a test.",
            "negation": True,
        },
        {
            "note_id": 1234,
            "start": 5,
            "end": 7,
            "lexical_variant": "is",
            "label": "test",
            "span_type": "ents",
            "sent.text": "This is a test.",
            "negation": False,
        },
    ]
    assert (
        get_doc2dict_converter(
            "ents",
            dict(
                span_getter={"ents": True},
                span_attributes=["negation", "sent.text"],
            ),
        )[0](doc)
        == jsons
    )
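

# Requesting a converter name that is not registered should raise a ValueError.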
def test_unknown_converter():
    with pytest.raises(ValueError):
        get_dict2doc_converter("test", {})

    with pytest.raises(ValueError):
        get_doc2dict_converter("test", {})
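

# Passing a callable instead of a registered name should return it unchanged,
# together with the (here empty) keyword arguments.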
def test_callable_converter():
    raw = lambda x: x  # noqa: E731
    assert get_dict2doc_converter(raw, {}) == (raw, {})
    assert get_doc2dict_converter(raw, {}) == (raw, {})
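

# A bound method such as nlp.make_doc is also accepted as a converter.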
def test_method_converter(blank_nlp):
    data = ["Ceci", "est", "un", "test"]
    texts = list(
        edsnlp.data.from_iterable(data, converter=blank_nlp.make_doc).map(
            lambda x: x.text
        )
    )
    assert texts == data
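

# Converters may be plain functions, classes, bound methods, or lambdas with extra
# default arguments; each should yield objects exposing the expected ".text".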
def test_converter_types(blank_nlp):
    class Text:
        def __init__(self, text):
            self.text = text

    for converter in (blank_nlp.make_doc, Text, lambda x, k=2: Text(x)):
        data = ["Ceci", "est", "un", "test"]
        texts = list(
            edsnlp.data.from_iterable(data, converter=converter).map(
                lambda x: x.text
            )
        )
        assert texts == data