# Base Dependencies
# -----------------
from typing import List, Optional

# Local Dependencies
# ------------------
from nlp_pipeline import get_pipeline

# 3rd-Party Dependencies
# ----------------------
from spacy.language import Language
from spacy.tokens import Doc


# Pipes disabled by `Entity.tokenize` when the caller does not pass `disable`.
# Kept as an immutable tuple so the default can never be mutated between calls.
_DEFAULT_DISABLED_PIPES = ("parser", "negex")


class Entity:
    """
    Entity

    Representation of a biomedical entity.

    An entity stores its identifier, type, surface text, the identifier of
    the document it belongs to and its character span. The tokenized form of
    the text is computed on first access of `tokens` and cached.
    """

    # Spacy's pipeline, shared at class level. It stays `None` until it is
    # set through `set_nlp` or lazily created by `tokenize`.
    NLP: Optional[Language] = None

    def __init__(
        self, id: str, text: str, type: str, doc_id: str, start: int, end: int
    ):
        """
        Args:
            id (str): identifier
            text (str): text (surrounding whitespace is stripped)
            type (str): entity type (surrounding whitespace is stripped)
            doc_id (str): the identifier of the document the entity belongs to
            start (int): start character in the sentence
            end (int): end character in the sentence
        """
        # NOTE: `id` and `type` shadow builtins, but they are part of the
        # public signature (callers may pass them by keyword), so they stay.
        self.id = id
        self.type = type.strip()
        self.text = text.strip()
        self.doc_id = doc_id
        self.start = start
        self.end = end
        # Cache for the `tokens` property; filled on first access.
        self._tokens: Optional[Doc] = None

    def __len__(self) -> int:
        """Number of characters of the (stripped) entity text"""
        return len(self.text)

    def __str__(self) -> str:
        """Human readable description of the entity"""
        return (
            f"Entity(id: {self.id}, type: {self.type}, text: {self.text}, "
            f"start: {self.start}, end: {self.end})"
        )

    @property
    def uid(self) -> str:
        """Unique identifier, built as `<doc_id>-<id>`"""
        return f"{self.doc_id}-{self.id}"

    @property
    def tokens(self) -> Doc:
        """Obtains the tokenized text of the entity's text

        The text is processed only once; later accesses return the cached Doc.

        Returns:
            Doc: processed text through Spacy's pipeline
        """
        if self._tokens is None:
            self._tokens = Entity.tokenize(self.text)
        return self._tokens

    # Class Methods
    # -------------
    @classmethod
    def set_nlp(cls, nlp: Language):
        """Sets the Entity Class' Spacy's pipeline

        Args:
            nlp (Language): pipeline
        """
        cls.NLP = nlp

    @classmethod
    def tokenize(cls, text: str, disable: Optional[List[str]] = None) -> Doc:
        """Tokenizes a text fragment with the configured Spacy's pipeline

        If no pipeline has been configured yet, one is obtained from
        `get_pipeline()` and stored on the class for later calls.

        Args:
            text (str): text fragment
            disable (List[str], optional): pipes of the Spacy's pipeline to be
                disabled. Defaults to ["parser", "negex"].

        Returns:
            Doc: tokenized text
        """
        # `None` sentinel instead of a mutable list default: a fresh list is
        # built on every call, so no state is shared between calls.
        if disable is None:
            disable = list(_DEFAULT_DISABLED_PIPES)

        if cls.NLP is None:
            cls.NLP = get_pipeline()

        # NOTE(review): the given names presumably have to be components of
        # the configured pipeline -- confirm against spaCy's `select_pipes`.
        with cls.NLP.select_pipes(disable=disable):
            return cls.NLP(text)

    @classmethod
    def from_n2c2_annotation(cls, doc_id: str, annotation: str) -> "Entity":
        """Creates an Entity instance from an n2c2 annotation line

        The line is expected to hold three tab-separated fields: the entity
        identifier, its definition (entity type followed by the location in
        the text) and the entity text. The first number of the definition is
        used as start and the last token as end.

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (str): the entity description in the n2c2 corpus' format

        Returns:
            Entity: the annotated entity

        Raises:
            ValueError: if the line has fewer than three tab-separated fields
                or the offsets are not integers
        """
        # At most two splits: any further tab is kept as part of the text.
        fields = annotation.strip().split("\t", 2)
        if len(fields) != 3:
            raise ValueError(
                "Malformed n2c2 entity annotation (expected 3 tab-separated "
                f"fields, got {len(fields)}): {annotation!r}"
            )
        entity_id, definition, text = fields

        # definition: entity type and location in text
        parts = definition.split()
        entity_type = parts[0]
        start = int(parts[1])
        end = int(parts[-1])

        return cls(entity_id, text, entity_type, doc_id, start, end)

    @classmethod
    def from_ddi_annotation(cls, doc_id: str, annotation: dict) -> "Entity":
        """Creates an Entity instance from an ddi xml annotation

        Reads the "id", "type", "text" and "charOffset" keys of the
        annotation. The entity type is upper-cased.

        Args:
            doc_id (str): the identifier of the document the entity belongs to
            annotation (dict): the entity description in the DDi Extraction Corpus' format

        Returns:
            Entity: the annotated entity
        """
        entity_id = annotation["id"]
        entity_type = annotation["type"].upper()
        text = annotation["text"]

        # First and last dash-separated pieces of "charOffset" give the span.
        char_offset = annotation["charOffset"].split("-")
        start = int(char_offset[0])
        # One is added to the last offset; presumably the corpus' end offset
        # is inclusive while `end` is stored as exclusive -- TODO confirm.
        end = int(char_offset[-1]) + 1

        return cls(entity_id, text, entity_type, doc_id, start, end)

    # Instance methods
    # ----------------
    def todict(self) -> dict:
        """Dict representation of an entity

        Returns:
            dict: representation of the Entity, with the keys "id", "type",
                "text", "doc_id", "start" and "end"
        """
        return {
            "id": self.id,
            "type": self.type,
            "text": self.text,
            "doc_id": self.doc_id,
            "start": self.start,
            "end": self.end,
        }