Diff of /edsnlp/data/converters.py [000000] .. [cad161]

Switch to unified view

a b/edsnlp/data/converters.py
1
"""
2
Converters are used to convert documents between python dictionaries and Doc objects.
3
There are two types of converters: readers and writers. Readers convert dictionaries to
4
Doc objects, and writers convert Doc objects to dictionaries.
5
"""
6
7
import inspect
8
import warnings
9
from copy import copy
10
from types import FunctionType
11
from typing import (
12
    TYPE_CHECKING,
13
    Any,
14
    Callable,
15
    Dict,
16
    Optional,
17
    Sequence,
18
    Tuple,
19
    Union,
20
)
21
22
import pydantic
23
import spacy
24
from confit.registry import ValidatedFunction
25
from spacy.tokenizer import Tokenizer
26
from spacy.tokens import Doc, Span
27
28
import edsnlp
29
from edsnlp import registry
30
from edsnlp.core.stream import CONTEXT
31
from edsnlp.utils.bindings import BINDING_GETTERS
32
from edsnlp.utils.span_getters import (
33
    SpanGetterArg,
34
    SpanSetterArg,
35
    get_spans,
36
    get_spans_with_group,
37
    set_spans,
38
)
39
from edsnlp.utils.typing import AsList, Validated
40
41
FILENAME = "__FILENAME__"
42
SPAN_BUILTIN_ATTRS = ("sent", "label_", "kb_id_", "text")
43
44
SCHEMA = {}
45
46
_DEFAULT_TOKENIZER = None
47
48
# For backward compatibility
49
SequenceStr = AsList[str]
50
51
52
def without_filename(d):
    """Remove the internal filename key from a record dict, in place.

    Returns the same dict so the call can be chained in pipelines.
    """
    if FILENAME in d:
        del d[FILENAME]
    return d
55
56
57
def validate_kwargs(func, kwargs):
    """
    Validate and filter `kwargs` against the signature of `func` with pydantic.

    Only keys that appear in `func`'s argspec are validated. The first
    positional parameter (and the second one for bound methods) is temporarily
    made optional (`Optional[Any]`, default `None`) so the pydantic model can
    be built without supplying the converter's main input (the doc or dict).
    The function's `__annotations__` / `__defaults__` are patched for the
    duration of the call and restored in the `finally` block.

    Returns the validated keyword arguments as a plain dict.
    """
    # Callable instances (objects defining __call__ but no __defaults__):
    # validate against the bound __call__ method instead.
    if (
        hasattr(func, "__call__")
        and not hasattr(func, "__defaults__")
        and hasattr(func.__call__, "__self__")
    ):
        func = func.__call__
    has_self = restore = False
    spec = inspect.getfullargspec(func)
    try:
        if hasattr(func, "__func__"):
            # Method: re-bind the underlying function so we can patch a copy
            # of its annotations without touching the original mapping.
            has_self = hasattr(func, "__self__")
            func = func.__func__.__get__(None, func.__func__.__class__)
            old_annotations = func.__annotations__
            old_defaults = func.__defaults__
            restore = True
            func.__annotations__ = copy(func.__annotations__)
            # Make `self` (or `cls`) and the main input optional for pydantic.
            func.__annotations__[spec.args[0]] = Optional[Any]
            func.__annotations__[spec.args[1]] = Optional[Any]
            func.__defaults__ = (
                None,
                None,
                *(spec.defaults or ())[-len(spec.args) + 2 :],
            )
        else:
            # Plain function. NOTE(review): `copy` on a function returns the
            # same object and the annotations dict below is not copied, so the
            # `Optional[Any]` entry persists after restore — confirm intended.
            func: FunctionType = copy(func)
            old_annotations = func.__annotations__
            old_defaults = func.__defaults__
            restore = True
            func.__annotations__[spec.args[0]] = Optional[Any]
            func.__defaults__ = (None, *(spec.defaults or ())[-len(spec.args) + 1 :])
        vd = ValidatedFunction(func, {"arbitrary_types_allowed": True})
        model = vd.init_model_instance(
            **{k: v for k, v in kwargs.items() if k in spec.args}
        )
        # Field registry location differs between pydantic v1 and v2.
        fields = (
            model.__fields__ if pydantic.__version__ < "2" else vd.model.model_fields
        )
        d = {
            k: v
            for k, v in model.__dict__.items()
            # NOTE(review): `fields[k]` is only evaluated when `k not in
            # fields`, which would raise KeyError — confirm this branch is
            # unreachable or intentional.
            if (k in fields or fields[k].default_factory)
        }
        d.pop("v__duplicate_kwargs", None)  # see pydantic ValidatedFunction code
        d.pop(vd.v_args_name, None)
        # Drop the placeholder main-input (and `self`) entries added above.
        d.pop(spec.args[0], None)
        if has_self:
            d.pop(spec.args[1], None)
        return {**(d.pop(vd.v_kwargs_name, None) or {}), **d}
    finally:
        if restore:
            func.__annotations__ = old_annotations
            func.__defaults__ = old_defaults
110
111
112
class AttributesMappingArg(Validated):
    """
    Validated type for attribute mappings.

    Accepts a single attribute name, a list of names (each name is then mapped
    to itself), or an explicit mapping. For example:

    - `doc_attributes="note_datetime"` maps the `note_datetime` JSON attribute
      to the `note_datetime` extension.
    - `span_attributes=["negation", "family"]` maps the `negation` and
      `family` JSON attributes to extensions of the same names.
    """

    @classmethod
    def validate(cls, value, config=None) -> Dict[str, str]:
        # Normalization lives in a module-level helper so it can also be
        # called directly.
        return validate_attributes_mapping(value)
127
128
129
if TYPE_CHECKING:
    # For static type checkers only: present the argument as the union of
    # accepted input shapes instead of the runtime Validated class. The
    # redefinition is deliberate, hence the F811 suppression.
    AttributesMappingArg = Union[str, Sequence[str], Dict[str, str]]  # noqa: F811
131
132
133
def validate_attributes_mapping(value: AttributesMappingArg) -> Dict[str, str]:
    """
    Normalize an attribute-mapping argument into a `{source: target}` dict.

    A single string maps to itself, a list maps every item to itself, and any
    other value (assumed to already be a mapping) is returned unchanged.
    """
    if isinstance(value, str):
        return {value: value}
    if isinstance(value, list):
        return {name: name for name in value}
    return value
140
141
142
def get_current_tokenizer():
    """
    Return the tokenizer of the current stream context, if any.

    When no pipeline context is active, fall back to a lazily created (and
    module-cached) `eds` tokenizer.
    """
    global _DEFAULT_TOKENIZER
    ctx = CONTEXT[0]
    if "tokenizer" in ctx:
        return ctx["tokenizer"]
    if _DEFAULT_TOKENIZER is None:
        _DEFAULT_TOKENIZER = edsnlp.blank("eds").tokenizer
    return _DEFAULT_TOKENIZER
149
150
151
@registry.factory.register("eds.standoff_dict2doc", spacy_compatible=False)
class StandoffDict2DocConverter:
    """
    Convert a BRAT/standoff dictionary into a spaCy Doc.

    !!! note "Why does BRAT/Standoff need a converter ?"

        You may wonder : why do I need a converter ? Since BRAT is already a NLP
        oriented format, it should be straightforward to convert it to a Doc object.

        Indeed, we do provide a default converter for the BRAT standoff format, but we
        also acknowledge that there may be more than one way to convert a standoff
        document to a Doc object. For instance, an annotated span may be used to
        represent a relation between two smaller included entities, or another entity
        scope, etc.

        In such cases, we recommend you use a custom converter as described
        [here](/data/converters/#custom-converter).

    Examples
    --------

    ```{ .python .no-check }
    # Any kind of reader (`edsnlp.data.read/from_...`) can be used here
    docs = edsnlp.data.read_standoff(
        "path/to/standoff",
        converter="standoff",  # set by default

        # Optional parameters
        tokenizer=tokenizer,
        span_setter={"ents": True, "*": True},
        span_attributes={"negation": "negated"},
        keep_raw_attribute_values=False,
        default_attributes={"negated": False, "temporality": "present"},
    )
    ```

    Parameters
    ----------
    nlp: Optional[PipelineProtocol]
        The pipeline object (optional and likely not needed, prefer to use the
        `tokenizer` directly argument instead).
    tokenizer: Optional[Tokenizer]
        The tokenizer instance used to tokenize the documents. Likely not needed since
        by default it uses the current context tokenizer :

        - the tokenizer of the next pipeline run by `.map_pipeline` in a
          [Stream][edsnlp.core.stream.Stream].
        - or the `eds` tokenizer by default.
    span_setter : SpanSetterArg
        The span setter to use when setting the spans in the documents. Defaults to
        setting the spans in the `ents` attribute, and creates a new span group for
        each JSON entity label.
    span_attributes : Optional[AttributesMappingArg]
        Mapping from BRAT attributes to Span extensions (can be a list too).
        By default, all attributes are imported as Span extensions with the same name.
    keep_raw_attribute_values : bool
        Whether to keep the raw attribute values (as strings) or to convert them to
        Python objects (e.g. booleans).
    bool_attributes : AsList[str]
        Shorthand to mark attributes as boolean with a `False` default value
        (merged into `default_attributes`).
    default_attributes : AttributesMappingArg
        How to set attributes on spans for which no attribute value was found in the
        input format. This is especially useful for negation, or frequent attributes
        values (e.g. "negated" is often False, "temporal" is often "present"), that
        annotators may not want to annotate every time.
    notes_as_span_attribute : Optional[str]
        If set, the AnnotatorNote annotations will be concatenated and stored in a span
        attribute with this name.
    split_fragments : bool
        Whether to split the fragments into separate spans or not. If set to False, the
        fragments will be concatenated into a single span.
    """

    def __init__(
        self,
        *,
        tokenizer: Optional[Tokenizer] = None,
        span_setter: SpanSetterArg = {"ents": True, "*": True},
        span_attributes: Optional[AttributesMappingArg] = None,
        keep_raw_attribute_values: bool = False,
        bool_attributes: AsList[str] = [],
        default_attributes: AttributesMappingArg = {},
        notes_as_span_attribute: Optional[str] = None,
        split_fragments: bool = True,
    ):
        self.tokenizer = tokenizer
        self.span_setter = span_setter
        self.span_attributes = span_attributes  # type: ignore
        self.keep_raw_attribute_values = keep_raw_attribute_values
        # Copy before mutating: the `{}` default is shared between calls, so
        # writing `bool_attributes` into it directly would leak defaults
        # across converter instances.
        self.default_attributes = dict(default_attributes)
        self.notes_as_span_attribute = notes_as_span_attribute
        self.split_fragments = split_fragments
        for attr in bool_attributes:
            self.default_attributes[attr] = False

    def __call__(self, obj, tokenizer=None):
        # Priority: explicit argument, then init-time tokenizer, then the
        # current stream context tokenizer.
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        doc = tok(obj["text"] or "")
        doc._.note_id = obj.get("doc_id", obj.get(FILENAME))

        spans = []

        # Ensure every target extension exists before values are assigned.
        for dst in (
            *(() if self.span_attributes is None else self.span_attributes.values()),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        for ent in obj.get("entities") or ():
            # Optionally merge discontinuous fragments into one covering span.
            fragments = (
                [
                    {
                        "begin": min(f["begin"] for f in ent["fragments"]),
                        "end": max(f["end"] for f in ent["fragments"]),
                    }
                ]
                if not self.split_fragments
                else ent["fragments"]
            )
            # Normalize attributes to a {label: value} dict. We copy so the
            # input object is never mutated; this is also loop-invariant, so
            # it is computed once per entity rather than once per fragment.
            attributes = (
                {a["label"]: a["value"] for a in ent["attributes"]}
                if isinstance(ent["attributes"], list)
                else dict(ent["attributes"])
            )
            # BUGFIX: notes used to be written into `ent["attributes"]`,
            # which raised TypeError when attributes were provided as a list
            # (integer-indexed) and mutated the caller's dict otherwise.
            # Write into the local normalized mapping instead.
            if self.notes_as_span_attribute and ent.get("notes"):
                attributes[self.notes_as_span_attribute] = "|".join(
                    note["value"] for note in ent["notes"]
                )
            for fragment in fragments:
                span = doc.char_span(
                    fragment["begin"],
                    fragment["end"],
                    label=ent["label"],
                    alignment_mode="expand",
                )
                for label, value in attributes.items():
                    new_name = (
                        self.span_attributes.get(label, None)
                        if self.span_attributes is not None
                        else label
                    )
                    # Without an explicit mapping, import every attribute and
                    # create its extension on the fly.
                    if self.span_attributes is None and not Span.has_extension(
                        new_name
                    ):
                        Span.set_extension(new_name, default=None)

                    if new_name:
                        # A bare BRAT attribute (no value) means "present".
                        value = True if value is None else value
                        if not self.keep_raw_attribute_values:
                            # Coerce string booleans to real booleans.
                            value = (
                                True
                                if value in ("True", "true")
                                else False
                                if value in ("False", "false")
                                else value
                            )
                        span._.set(new_name, value)

                spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)
        # Fill in defaults for attributes that were never annotated.
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)

        return doc
317
318
319
@registry.factory.register("eds.standoff_doc2dict", spacy_compatible=False)
class StandoffDoc2DictConverter:
    """
    Convert a spaCy Doc into a BRAT/standoff-style dictionary.

    Examples
    --------

    ```{ .python .no-check }
    # Any kind of writer (`edsnlp.data.read/from_...`) can be used here
    edsnlp.data.write_standoff(
        docs,
        converter="standoff",  # set by default

        # Optional parameters
        span_getter={"ents": True},
        span_attributes=["negation"],
    )
    # or docs.to_standoff(...) if it's already a
    # [Stream][edsnlp.core.stream.Stream]
    ```

    Parameters
    ----------
    span_getter: SpanGetterArg
        The span getter to use when getting the spans from the documents. Defaults to
        getting the spans in the `ents` attribute.
    span_attributes: AttributesMappingArg
        Mapping from Span extensions to JSON attributes (can be a list too).
        By default, no attribute is exported, except `note_id`.
    """

    def __init__(
        self,
        *,
        span_getter: Optional[SpanGetterArg] = {"ents": True},
        span_attributes: AttributesMappingArg = {},
    ):
        self.span_getter = span_getter
        self.span_attributes = span_attributes

    def __call__(self, doc):
        spans = get_spans(doc, self.span_getter)
        # Builtin span attributes are read directly; custom ones go through
        # the underscore extension namespace.
        getters = {}
        for ext_name, obj_name in self.span_attributes.items():
            binding = (
                ext_name
                if ext_name.split(".")[0] in SPAN_BUILTIN_ATTRS
                else "_." + ext_name
            )
            getters[obj_name] = BINDING_GETTERS[binding]

        entities = []
        # Deduplicate spans while keeping a deterministic (sorted) order.
        for i, ent in enumerate(sorted(dict.fromkeys(spans))):
            attributes = {}
            for obj_name, getter in getters.items():
                value = getter(ent)
                # Unset attributes are omitted from the output.
                if value is not None:
                    attributes[obj_name] = value
            entities.append(
                {
                    "entity_id": i,
                    "fragments": [
                        {
                            "begin": ent.start_char,
                            "end": ent.end_char,
                        }
                    ],
                    "attributes": attributes,
                    "label": ent.label_,
                }
            )

        return {
            FILENAME: doc._.note_id,
            "doc_id": doc._.note_id,
            "text": doc.text,
            "entities": entities,
        }
395
396
397
@registry.factory.register("eds.conll_dict2doc", spacy_compatible=False)
class ConllDict2DocConverter:
    """
    Convert a CoNLL-style dictionary (one `words` list of column dicts, as
    produced when reading CoNLL-U files) into a spaCy Doc.

    Multi-word token ranges (IDs such as `"1-2"`) are skipped. For each
    remaining word, the LEMMA, UPOS, XPOS, FEATS, HEAD and DEPREL columns are
    mapped onto the corresponding token attributes; unknown columns trigger a
    warning.

    Parameters
    ----------
    tokenizer: Optional[Tokenizer]
        Tokenizer whose vocab is used to build the Doc. Defaults to the
        current context tokenizer.
    """

    def __init__(
        self,
        *,
        tokenizer: Optional[Tokenizer] = None,
    ):
        self.tokenizer = tokenizer

    def __call__(self, obj, tokenizer=None):
        # BUGFIX: the `tokenizer` argument used to be ignored. Resolve it
        # consistently with the other converters: explicit argument first,
        # then the init-time tokenizer, then the current context tokenizer.
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        vocab = tok.vocab
        # Skip multi-word token ranges ("1-2"), keep actual words only.
        words_data = [word for word in obj["words"] if "-" not in word["ID"]]
        words = [word["FORM"] for word in words_data]
        spaces = ["SpaceAfter=No" not in w.get("MISC", "") for w in words_data]
        doc = Doc(vocab, words=words, spaces=spaces)

        id_to_word = {word["ID"]: i for i, word in enumerate(words_data)}
        for word_data, word in zip(words_data, doc):
            for key, value in word_data.items():
                if key in ("ID", "FORM", "MISC"):
                    pass
                elif value == "_":
                    # CoNLL-U uses "_" for unspecified fields; previously a
                    # FEATS value of "_" crashed the morphology parsing.
                    pass
                elif key == "LEMMA":
                    word.lemma_ = value
                elif key == "UPOS":
                    word.pos_ = value
                elif key == "XPOS":
                    word.tag_ = value
                elif key == "FEATS":
                    word.morph = spacy.tokens.morphanalysis.MorphAnalysis(
                        tok.vocab,
                        dict(feat.split("=") for feat in value.split("|")),
                    )
                elif key == "HEAD":
                    # HEAD "0" denotes the root, which spaCy models as a
                    # self-attached token, so we leave it untouched.
                    if value != "0":
                        word.head = doc[id_to_word[value]]
                elif key == "DEPREL":
                    word.dep_ = value
                else:
                    warnings.warn(f"Unused key {key} in CoNLL dict, ignoring it.")

        return doc
443
444
445
@registry.factory.register("eds.omop_dict2doc", spacy_compatible=False)
class OmopDict2DocConverter:
    """
    Convert an OMOP-style dictionary (`note_text`, `note_id`, optional
    pre-annotated `entities`) into a spaCy Doc.

    Examples
    --------

    ```{ .python .no-check }
    # Any kind of reader (`edsnlp.data.read/from_...`) can be used here
    docs = edsnlp.data.from_pandas(
        df,
        converter="omop",

        # Optional parameters
        tokenizer=tokenizer,
        doc_attributes=["note_datetime"],

        # Parameters below should only matter if you plan to import entities
        # from the dataframe. If the data doesn't contain pre-annotated
        # entities, you can ignore these.
        span_setter={"ents": True, "*": True},
        span_attributes={"negation": "negated"},
        default_attributes={"negated": False, "temporality": "present"},
    )
    ```

    Parameters
    ----------
    nlp: Optional[PipelineProtocol]
        The pipeline object (optional and likely not needed, prefer to use the
        `tokenizer` directly argument instead).
    tokenizer: Optional[Tokenizer]
        The tokenizer instance used to tokenize the documents. Likely not needed since
        by default it uses the current context tokenizer :

        - the tokenizer of the next pipeline run by `.map_pipeline` in a
          [Stream][edsnlp.core.stream.Stream].
        - or the `eds` tokenizer by default.
    span_setter: SpanSetterArg
        The span setter to use when setting the spans in the documents. Defaults to
        setting the spans in the `ents` attribute, and creates a new span group for
        each JSON entity label.
    doc_attributes: AttributesMappingArg
        Mapping from JSON attributes to additional Span extensions (can be a list too).
        By default, all attributes are imported as Doc extensions with the same name.
    span_attributes: Optional[AttributesMappingArg]
        Mapping from JSON attributes to Span extensions (can be a list too).
        By default, all attributes are imported as Span extensions with the same name.
    default_attributes: AttributesMappingArg
        How to set attributes on spans for which no attribute value was found in the
        input format. This is especially useful for negation, or frequent attributes
        values (e.g. "negated" is often False, "temporal" is often "present"), that
        annotators may not want to annotate every time.
    bool_attributes: AsList[str]
        Shorthand to mark attributes as boolean with a `False` default value
        (merged into `default_attributes`).
    """

    def __init__(
        self,
        *,
        tokenizer: Optional[Tokenizer] = None,
        span_setter: SpanSetterArg = {"ents": True, "*": True},
        doc_attributes: AttributesMappingArg = {"note_datetime": "note_datetime"},
        span_attributes: Optional[AttributesMappingArg] = None,
        default_attributes: AttributesMappingArg = {},
        bool_attributes: AsList[str] = [],
    ):
        self.tokenizer = tokenizer
        self.span_setter = span_setter
        self.doc_attributes = doc_attributes
        self.span_attributes = span_attributes
        # BUGFIX: copy before mutating. The `{}` default is a single shared
        # object, so writing `bool_attributes` into it directly leaked
        # defaults across converter instances.
        self.default_attributes = dict(default_attributes)
        for attr in bool_attributes:
            self.default_attributes[attr] = False

    def __call__(self, obj, tokenizer=None):
        # Priority: explicit argument, then init-time tokenizer, then the
        # current stream context tokenizer.
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        doc = tok(obj["note_text"] or "")
        doc._.note_id = obj.get("note_id", obj.get(FILENAME))
        # Copy requested document-level attributes, creating extensions on
        # the fly when needed.
        for obj_name, ext_name in self.doc_attributes.items():
            if not Doc.has_extension(ext_name):
                Doc.set_extension(ext_name, default=None)
            doc._.set(ext_name, obj.get(obj_name))

        spans = []

        # Ensure every target extension exists before values are assigned.
        for dst in (
            *(() if self.span_attributes is None else self.span_attributes.values()),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        for ent in obj.get("entities") or ():
            # Work on a shallow copy so we can pop the positional fields and
            # treat every remaining key as a span attribute.
            ent = dict(ent)
            span = doc.char_span(
                ent.pop("start_char"),
                ent.pop("end_char"),
                label=ent.pop("note_nlp_source_value"),
                alignment_mode="expand",
            )
            for label, value in ent.items():
                new_name = (
                    self.span_attributes.get(label, None)
                    if self.span_attributes is not None
                    else label
                )
                # Without an explicit mapping, import every attribute and
                # create its extension on the fly.
                if self.span_attributes is None and not Span.has_extension(new_name):
                    Span.set_extension(new_name, default=None)

                if new_name:
                    span._.set(new_name, value)
            spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)
        # Fill in defaults for attributes that were never annotated.
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)
        return doc
563
564
565
@registry.factory.register("eds.omop_doc2dict", spacy_compatible=False)
class OmopDoc2DictConverter:
    """
    Convert a spaCy Doc into an OMOP-style dictionary.

    Examples
    --------

    ```{ .python .no-check }
    # Any kind of writer (`edsnlp.data.write/to_...`) can be used here
    df = edsnlp.data.to_pandas(
        docs,
        converter="omop",

        # Optional parameters
        span_getter={"ents": True},
        doc_attributes=["note_datetime"],
        span_attributes=["negation", "family"],
    )
    # or docs.to_pandas(...) if it's already a
    # [Stream][edsnlp.core.stream.Stream]
    ```

    Parameters
    ----------
    span_getter: SpanGetterArg
        The span getter to use when getting the spans from the documents. Defaults to
        getting the spans in the `ents` attribute.
    doc_attributes: AttributesMappingArg
        Mapping from Doc extensions to JSON attributes (can be a list too).
        By default, no doc attribute is exported, except `note_id`.
    span_attributes: AttributesMappingArg
        Mapping from Span extensions to JSON attributes (can be a list too).
        By default, no attribute is exported.
    """

    def __init__(
        self,
        *,
        span_getter: SpanGetterArg = {"ents": True},
        doc_attributes: AttributesMappingArg = {},
        span_attributes: AttributesMappingArg = {},
    ):
        self.span_getter = span_getter
        self.doc_attributes = doc_attributes
        self.span_attributes = span_attributes

    def __call__(self, doc):
        spans = get_spans(doc, self.span_getter)
        # Builtin span attributes are read directly; custom ones go through
        # the underscore extension namespace.
        getters = {}
        for ext_name, obj_name in self.span_attributes.items():
            binding = (
                ext_name
                if ext_name.split(".")[0] in SPAN_BUILTIN_ATTRS
                else "_." + ext_name
            )
            getters[obj_name] = BINDING_GETTERS[binding]

        obj = {
            FILENAME: doc._.note_id,
            "note_id": doc._.note_id,
            "note_text": doc.text,
        }
        # Only export doc attributes whose extension is actually set.
        for ext_name, obj_name in self.doc_attributes.items():
            if doc._.has(ext_name):
                obj[obj_name] = getattr(doc._, ext_name)

        entities = []
        # Deduplicate spans while keeping a deterministic (sorted) order.
        for i, ent in enumerate(sorted(dict.fromkeys(spans))):
            row = {
                "note_nlp_id": i,
                "start_char": ent.start_char,
                "end_char": ent.end_char,
                "lexical_variant": ent.text,
                "note_nlp_source_value": ent.label_,
            }
            for obj_name, getter in getters.items():
                row[obj_name] = getter(ent)
            entities.append(row)
        obj["entities"] = entities
        return obj
648
649
650
@registry.factory.register("eds.ents_doc2dict", spacy_compatible=False)
class EntsDoc2DictConverter:
    """
    Convert a spaCy Doc into a flat list of entity dictionaries, one row per
    span (suitable for tabular exports).

    Parameters
    ----------
    span_getter: SpanGetterArg
        The span getter to use when getting the spans from the documents. Defaults to
        getting the spans in the `ents` attribute.
    doc_attributes: AttributesMappingArg
        Mapping from Doc extensions to JSON attributes (can be a list too).
        By default, no doc attribute is exported, except `note_id`.
    span_attributes: AttributesMappingArg
        Mapping from Span extensions to JSON attributes (can be a list too).
        By default, no attribute is exported.
    """

    def __init__(
        self,
        *,
        span_getter: SpanGetterArg = {"ents": True},
        doc_attributes: AttributesMappingArg = {},
        span_attributes: AttributesMappingArg = {},
    ):
        self.span_getter = span_getter
        self.doc_attributes = doc_attributes
        self.span_attributes = span_attributes

    def __call__(self, doc):
        # Builtin span attributes are read directly; custom ones go through
        # the underscore extension namespace.
        getters = {}
        for ext_name, obj_name in self.span_attributes.items():
            binding = (
                ext_name
                if ext_name.split(".")[0] in SPAN_BUILTIN_ATTRS
                else "_." + ext_name
            )
            getters[obj_name] = BINDING_GETTERS[binding]
        # Document-level values are identical for every row; compute once.
        doc_values = {
            obj_name: BINDING_GETTERS["_." + ext_name](doc)
            for ext_name, obj_name in self.doc_attributes.items()
        }

        rows = []
        # Deduplicate (span, group) pairs and sort for a deterministic order.
        for ent, group in sorted(
            dict(get_spans_with_group(doc, self.span_getter)).items()
        ):
            row = {
                "note_id": doc._.note_id,
                "start": ent.start_char,
                "end": ent.end_char,
                "label": ent.label_,
                "lexical_variant": ent.text,
                "span_type": group,  # for backward compatibility
            }
            for obj_name, getter in getters.items():
                row[obj_name] = getter(ent)
            row.update(doc_values)
            rows.append(row)
        return rows
708
709
710
def get_dict2doc_converter(
    converter: Union[str, Callable], kwargs
) -> Tuple[Callable, Dict]:
    """
    Resolve a dict -> Doc converter.

    If `converter` is a string, look it up in the factory registry (exact name
    or a `dict2doc` factory containing the string), instantiate it with
    `kwargs` (an `nlp` kwarg is translated into its tokenizer), and return it
    with empty remaining kwargs. Callable classes are instantiated with
    `kwargs`; plain callables are returned with their validated kwargs.

    Raises
    ------
    ValueError
        If no registered converter matches the given string.
    """
    if not callable(converter):
        available = edsnlp.registry.factory.get_available()
        # Keep the `try` minimal: only the registry lookup should be able to
        # raise KeyError/IndexError here. Previously, such errors raised while
        # *instantiating* the converter were masked as "cannot find converter".
        try:
            filtered = [
                name
                for name in available
                if converter == name or (converter in name and "dict2doc" in name)
            ]
            factory = edsnlp.registry.factory.get(filtered[0])
        except (KeyError, IndexError):
            available = [v for v in available if "dict2doc" in v]
            raise ValueError(
                f"Cannot find converter for format {converter}. "
                f"Available converters are {', '.join(available)}"
            )
        nlp = kwargs.pop("nlp", None)
        # Prefer an explicitly given tokenizer over the pipeline's.
        if nlp is not None and "tokenizer" not in kwargs:
            kwargs["tokenizer"] = nlp.tokenizer
        return factory(**kwargs), {}
    if isinstance(converter, type):
        return converter(**kwargs), {}
    return converter, validate_kwargs(converter, kwargs)
737
738
739
def get_doc2dict_converter(
    converter: Union[str, Callable], kwargs
) -> Tuple[Callable, Dict]:
    """
    Resolve a Doc -> dict converter.

    If `converter` is a string, look it up in the factory registry (exact name
    or a `doc2dict` factory containing the string), instantiate it with
    `kwargs`, and return it with empty remaining kwargs. Callables are
    returned with their validated kwargs.

    Raises
    ------
    ValueError
        If no registered converter matches the given string.
    """
    if not callable(converter):
        available = edsnlp.registry.factory.get_available()
        # Keep the `try` minimal: only the registry lookup should be able to
        # raise KeyError/IndexError here. Previously, such errors raised while
        # *instantiating* the converter were masked as "cannot find converter".
        try:
            filtered = [
                name
                for name in available
                if converter == name or (converter in name and "doc2dict" in name)
            ]
            factory = edsnlp.registry.factory.get(filtered[0])
        except (KeyError, IndexError):
            available = [v for v in available if "doc2dict" in v]
            raise ValueError(
                f"Cannot find converter for format {converter}. "
                f"Available converters are {', '.join(available)}"
            )
        return factory(**kwargs), {}
    return converter, validate_kwargs(converter, kwargs)