[735bb5]: / src / models / entity.py

Download this file

154 lines (124 with data), 4.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Base Dependencies
# -----------------
from typing import Iterable, List

# 3rd-Party Dependencies
# ----------------------
from spacy.language import Language
from spacy.tokens import Doc

# Local Dependencies
# ------------------
from nlp_pipeline import get_pipeline
class Entity:
    """
    Entity

    Representation of a biomedical entity.
    """

    # Spacy's pipeline, shared by every Entity instance; lazily created by
    # `tokenize` (via `get_pipeline`) or injected with `set_nlp`.
    NLP: Language = None

    def __init__(
        self, id: str, text: str, type: str, doc_id: str, start: int, end: int
    ):
        """
        Args:
            id (str): identifier
            text (str): text
            type (str): entity type
            doc_id (str): the identifier of the document the entity belongs to
            start (int): start character in the sentence
            end (int): end character in the sentence
        """
        self.id = id
        self.type = type.strip()
        self.text = text.strip()
        self.doc_id = doc_id
        self.start = start
        self.end = end
        # Cache for the `tokens` property; filled lazily on first access.
        self._tokens = None

    def __len__(self) -> int:
        """Length of the entity's (stripped) text."""
        return len(self.text)

    def __str__(self) -> str:
        return "Entity(id: {}, type: {}, text: {}, start: {}, end: {})".format(
            self.id, self.type, self.text, self.start, self.end
        )

    @property
    def uid(self) -> str:
        """Unique identifier (document id combined with entity id)"""
        return "{}-{}".format(self.doc_id, self.id)

    @property
    def tokens(self) -> Doc:
        """Obtains the tokenized text of the entity's text

        The result is computed once and cached in `_tokens`.

        Returns:
            Doc: processed text through Spacy's pipeline
        """
        if self._tokens is None:
            self._tokens = Entity.tokenize(self.text)
        return self._tokens

    # Class Methods
    # -------------
    @classmethod
    def set_nlp(cls, nlp: Language):
        """Sets the Entity Class' Spacy's pipeline

        Args:
            nlp (Language): pipeline
        """
        cls.NLP = nlp

    @classmethod
    def tokenize(cls, text: str, disable: Iterable[str] = ("parser", "negex")) -> Doc:
        """Tokenizes a text fragment with the configured Spacy's pipeline

        Args:
            text (str): text fragment
            disable (Iterable[str], optional): pipes of the Spacy's pipeline to
                be disabled. Defaults to ("parser", "negex").

        Returns:
            Doc: tokenized text
        """
        # NOTE: the default is a tuple rather than a list so the shared
        # default object is immutable (avoids the mutable-default-argument
        # pitfall); callers may still pass a list.
        if cls.NLP is None:
            cls.NLP = get_pipeline()
        # Temporarily disable the requested pipes for this call only.
        with cls.NLP.select_pipes(disable=disable):
            doc = cls.NLP(text)
        return doc

    @classmethod
    def from_n2c2_annotation(cls, doc_id: str, annotation: str) -> "Entity":
        """Creates an Entity instance from an n2c2 annotation line

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (str): the entity description in the n2c2 corpus' format
                (tab-separated: id, "type start [...] end", text)

        Returns:
            Entity: the annotated entity
        """
        ent_id, definition, text = annotation.strip().split("\t")
        # definition: entity type followed by its character offsets; taking
        # the first and last offsets spans discontinuous annotations as well.
        parts = definition.split()
        ent_type = parts[0]
        start = int(parts[1])
        end = int(parts[-1])
        return cls(ent_id, text, ent_type, doc_id, start, end)

    @classmethod
    def from_ddi_annotation(cls, doc_id: str, annotation: dict) -> "Entity":
        """Creates an Entity instance from a DDI xml annotation

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (dict): the entity description in the DDI Extraction
                Corpus' format (keys: "id", "type", "text", "charOffset")

        Returns:
            Entity: the annotated entity
        """
        ent_id = annotation["id"]
        ent_type = annotation["type"].upper()
        text = annotation["text"]
        # "charOffset" has the form "start-end"; the +1 converts the corpus'
        # (presumably inclusive) end offset to this class' exclusive end.
        char_offset = annotation["charOffset"].split("-")
        start = int(char_offset[0])
        end = int(char_offset[-1]) + 1
        return cls(ent_id, text, ent_type, doc_id, start, end)

    # Instance methods
    # ----------------
    def todict(self) -> dict:
        """Dict representation of an entity

        Returns:
            dict: representation of the Entity
        """
        return {
            "id": self.id,
            "type": self.type,
            "text": self.text,
            "doc_id": self.doc_id,
            "start": self.start,
            "end": self.end,
        }