a b/edsnlp/connectors/brat.py
1
import warnings
2
from pathlib import Path
3
from typing import List, Optional, Union
4
5
from spacy.tokens import Doc
6
7
from edsnlp.core import PipelineProtocol
8
from edsnlp.data.converters import AttributesMappingArg
9
from edsnlp.data.standoff import (
10
    dump_standoff_file,
11
    parse_standoff_file,
12
    read_standoff,
13
    write_standoff,
14
)
15
from edsnlp.utils.span_getters import (
16
    SpanSetterArg,
17
    validate_span_setter,
18
)
19
20
21
class BratConnector(object):
    """
    Deprecated. Use `edsnlp.data.read_standoff` and `edsnlp.data.write_standoff`
    instead.

    Two-way connector with BRAT. Supports entities only.

    Parameters
    ----------
    directory : Union[str, Path]
        Directory containing the BRAT files.
    n_jobs : int, optional
        Number of jobs for multiprocessing, by default 1.
        The value is accepted but not used by this class.
    attributes: Optional[Union[Sequence[str], Mapping[str, str]]]
        Mapping from BRAT attributes to spaCy Span extensions.
        Extensions / attributes that are not in the mapping are not imported or exported
        If left to None, the mapping is filled with all BRAT attributes.
        When exporting, None is replaced by an empty mapping before being
        handed to `write_standoff`.
    bool_attributes: Optional[List[str]]
        Forwarded as `bool_attributes` to `read_standoff` when importing.
        None is treated as an empty list.
    span_groups: Optional[Sequence[str]]
        Additional span groups to look for entities in spaCy documents when exporting.
        Missing label (resp. span group) names are not imported (resp. exported)
        If left to None, the sequence is filled with all BRAT entity labels.
    keep_raw_attribute_values: bool
        Forwarded as `keep_raw_attribute_values` to `read_standoff` when
        importing, by default False.
    """

    def __init__(
        self,
        directory: Union[str, Path],
        n_jobs: int = 1,
        attributes: Optional[AttributesMappingArg] = None,
        bool_attributes: Optional[List[str]] = [],
        span_groups: SpanSetterArg = ["ents", "*"],
        keep_raw_attribute_values: bool = False,
    ):
        # stacklevel=2 attributes the warning to the code instantiating the
        # connector instead of to this module, so users see where to migrate.
        warnings.warn(
            "This connector is deprecated and will be removed in a future version.\n"
            "Use `edsnlp.data.read_standoff` and `edsnlp.data.write_standoff` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        self.directory: Path = Path(directory)
        self.attr_map = attributes
        self.span_setter = validate_span_setter(span_groups)
        self.keep_raw_attribute_values = keep_raw_attribute_values
        # The annotation allows None, so normalise it instead of crashing in
        # list(None). Copying also avoids aliasing the shared default list or
        # a list owned by the caller.
        self.bool_attributes = (
            [] if bool_attributes is None else list(bool_attributes)
        )

    def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]:
        """
        Reads the BRAT directory and returns the documents as a list.

        Parameters
        ----------
        nlp : PipelineProtocol
            Pipeline handed to `read_standoff`, and used to process the
            documents when `run_pipe` is True.
        run_pipe : bool
            If True, the documents are passed through `nlp.pipe` before being
            collected, by default False.

        Returns
        -------
        List[Doc]
        """
        res = read_standoff(
            path=self.directory,
            nlp=nlp,
            keep_txt_only_docs=True,
            span_attributes=self.attr_map,
            span_setter=self.span_setter,
            keep_raw_attribute_values=self.keep_raw_attribute_values,
            bool_attributes=self.bool_attributes,
        )
        return list(nlp.pipe(res) if run_pipe else res)

    def docs2brat(self, docs: List[Doc]) -> None:
        """
        Writes a list of spaCy documents to file.

        The documents are written to `self.directory` through
        `write_standoff`, with `overwrite=True`.

        Parameters
        ----------
        docs : List[Doc]
            Documents to export.
        """
        write_standoff(
            docs,
            self.directory,
            span_getter=self.span_setter,
            # write_standoff is always given a mapping: None becomes {}.
            span_attributes=self.attr_map or {},
            overwrite=True,
        )
86
87
88
# Exposes `parse_standoff_file` under the name `load_from_brat`
# (presumably kept for backward compatibility with older callers; confirm).
load_from_brat = parse_standoff_file
89
# Exposes `dump_standoff_file` under the name `export_to_brat`
# (presumably kept for backward compatibility with older callers; confirm).
export_to_brat = dump_standoff_file