Diff of /src/models/entity.py [000000] .. [735bb5]

Switch to unified view

a b/src/models/entity.py
1
# Base Dependencies
# -----------------
from typing import List, Optional

# 3rd-Party Dependencies
# ----------------------
from spacy.language import Language
from spacy.tokens import Doc

# Local Dependencies
# ------------------
from nlp_pipeline import get_pipeline
13
14
15
class Entity:
    """
    Entity

    Representation of a biomedical entity.

    An entity is a typed fragment of text that belongs to a document and is
    located by its ``start`` / ``end`` character offsets. Instances can be built
    directly or through the ``from_n2c2_annotation`` / ``from_ddi_annotation``
    alternate constructors.
    """

    # Spacy's pipeline, shared at class level. It stays ``None`` until it is
    # configured with ``set_nlp`` or lazily created by ``tokenize``.
    NLP: Optional[Language] = None

    # Pipes disabled by ``tokenize`` when the caller does not pass ``disable``.
    # Kept immutable so the default can never be altered between calls.
    _DEFAULT_DISABLED_PIPES = ("parser", "negex")

    def __init__(
        self, id: str, text: str, type: str, doc_id: str, start: int, end: int
    ):
        """
        Args:
            id (str): identifier
            text (str): text (stored without leading/trailing whitespace)
            type (str): entity type (stored without leading/trailing whitespace)
            doc_id (str): the identifier of the document the entity belongs to
            start (int): start character in the sentence
            end (int): end character in the sentence
        """
        self.id = id
        self.type = type.strip()
        self.text = text.strip()
        self.doc_id = doc_id
        self.start = start
        self.end = end
        # Cache for the ``tokens`` property; filled on first access.
        self._tokens: Optional[Doc] = None

    def __len__(self) -> int:
        """Number of characters of the entity's (stripped) text.

        Note that, as no ``__bool__`` is defined, an entity whose text is empty
        evaluates as falsy.
        """
        return len(self.text)

    def __str__(self) -> str:
        return (
            f"Entity(id: {self.id}, type: {self.type}, text: {self.text}, "
            f"start: {self.start}, end: {self.end})"
        )

    @property
    def uid(self) -> str:
        """Unique identifier, built as ``<doc_id>-<id>``"""
        return f"{self.doc_id}-{self.id}"

    @property
    def tokens(self) -> Doc:
        """Obtains the tokenized text of the entity's text

        The text is processed on first access only; the result is cached on the
        instance and returned as-is afterwards.

        Returns:
            Doc: processed text through Spacy's pipeline
        """
        if self._tokens is None:
            self._tokens = Entity.tokenize(self.text)
        return self._tokens

    # Class Methods
    # -------------
    @classmethod
    def set_nlp(cls, nlp: Language):
        """Sets the Entity Class' Spacy's pipeline

        Args:
            nlp (Language): pipeline
        """
        cls.NLP = nlp

    @classmethod
    def tokenize(cls, text: str, disable: Optional[List[str]] = None) -> Doc:
        """Tokenizes a text fragment with the configured Spacy's pipeline

        If no pipeline has been configured yet, one is obtained from
        ``get_pipeline`` and stored on the class for later calls.

        Args:
            text (str): text fragment
            disable (List[str], optional): pipes of the Spacy's pipeline to be
                disabled while processing the text. Defaults to
                ["parser", "negex"].

        Returns:
            Doc: tokenized text
        """
        if cls.NLP is None:
            cls.NLP = get_pipeline()

        # Build a fresh list for the default instead of sharing one mutable
        # default object across calls; explicit values are forwarded untouched.
        if disable is None:
            disable = list(cls._DEFAULT_DISABLED_PIPES)

        with cls.NLP.select_pipes(disable=disable):
            doc = cls.NLP(text)
        return doc

    @classmethod
    def from_n2c2_annotation(cls, doc_id: str, annotation: str) -> "Entity":
        """Creates an Entity instance from an n2c2 annotation line

        The line is expected to hold three tab-separated fields: the entity
        identifier, its definition (entity type followed by the character
        offsets) and its text.

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (str): the entity description in the n2c2 corpus' format

        Returns:
            Entity: the annotated entity

        Raises:
            ValueError: if the line does not have exactly three tab-separated
                fields or the offsets are not integers
            IndexError: if the definition has fewer than two fields
        """
        entity_id, definition, text = annotation.strip().split("\t")
        fields = definition.split()  # entity type and location in text
        entity_type = fields[0]
        start = int(fields[1])
        # The last field is taken as the end, so a definition listing more than
        # two offsets yields the span from the first start to the final end.
        end = int(fields[-1])

        return cls(entity_id, text, entity_type, doc_id, start, end)

    @classmethod
    def from_ddi_annotation(cls, doc_id: str, annotation: dict) -> "Entity":
        """Creates an Entity instance from a ddi xml annotation

        Reads the "id", "type", "text" and "charOffset" keys of the annotation.
        The entity type is upper-cased.

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (dict): the entity description in the DDi Extraction Corpus' format

        Returns:
            Entity: the annotated entity

        Raises:
            KeyError: if any of the expected keys is missing
            ValueError: if the offsets are not integers
        """
        entity_id = annotation["id"]
        entity_type = annotation["type"].upper()
        text = annotation["text"]
        offsets = annotation["charOffset"].split("-")
        start = int(offsets[0])
        # One is added to the last offset; presumably the corpus stores an
        # inclusive end position -- TODO confirm against the corpus' guidelines.
        end = int(offsets[-1]) + 1

        return cls(entity_id, text, entity_type, doc_id, start, end)

    # Instance methods
    # ----------------
    def todict(self) -> dict:
        """Dict representation of an entity

        Returns:
            dict: the entity's id, type, text, doc_id, start and end
        """
        return {
            "id": self.id,
            "type": self.type,
            "text": self.text,
            "doc_id": self.doc_id,
            "start": self.start,
            "end": self.end,
        }