al-medical-RE / Git / Diff of /src/models/entity.py

Models:
philipB/
al-medical-RE
Downloads: 1
Diff of /src/models/entity.py [000000] .. [735bb5]
Switch to side-by-side view

--- a
+++ b/src/models/entity.py
@@ -0,0 +1,153 @@
+# Base Dependencies
+# -----------------
+from typing import List
+
+# Local Dependencies 
+# ------------------
+from nlp_pipeline import get_pipeline
+
+# 3rd-Party Dependencies
+# ----------------------
+from spacy.language import Language
+from spacy.tokens import Doc
+
+
+class Entity:
+    """
+    Entity
+
+    Representation of a biomedical entity.
+    """
+
+    # Spacy's pipeline
+    NLP: Language = None
+
+    def __init__(
+        self, id: str, text: str, type: str, doc_id: str, start: int, end: int
+    ):
+        """
+        Args:
+            id (str): identifier
+            text (str): text
+            type (str): entity type
+            doc_id (str): the identifier of the document the entity belongs to
+            start (int): start character in the sentence
+            end (int): end character in the sentence
+        """
+        self.id = id
+        self.type = type.strip()
+        self.text = text.strip()
+        self.doc_id = doc_id
+        self.start = start
+        self.end = end
+        self._tokens = None
+
+    def __len__(self) -> int:
+        return len(self.text)
+
+    def __str__(self) -> str:
+        return "Entity(id: {}, type: {}, text: {}, start: {}, end: {})".format(
+            self.id, self.type, self.text, self.start, self.end
+        )
+
+    @property
+    def uid(self) -> str:
+        """Unique idenfitifer"""
+        return "{}-{}".format(self.doc_id, self.id)
+
+    @property
+    def tokens(self) -> Doc:
+        """Obtains the tokenized text of the entity's text
+
+        Returns:
+            Doc: processed text through Spacy's pipeline
+        """
+        if self._tokens is None:
+            self._tokens = Entity.tokenize(self.text)
+        return self._tokens
+
+    # Class Methods
+    # -------------
+    @classmethod
+    def set_nlp(cls, nlp: Language):
+        """Sets the Entity Class' Spacy's pipeline
+
+        Args:
+            nlp (Language): pipeline
+        """
+        cls.NLP = nlp
+
+    @classmethod
+    def tokenize(cls, text: str, disable: List[str] = ["parser", "negex"]) -> Doc:
+        """Tokenizes a text fragment with the configured Spacy's pipeline
+
+        Args:
+            text (str): text fragment
+            disable (List[str], optional): pipes of the Spacy's pipeline to be disabled. Defaults to ["parser"].
+
+        Returns:
+            Doc: tokenized text
+        """
+        if cls.NLP is None:
+            cls.NLP = get_pipeline()
+
+        with cls.NLP.select_pipes(disable=disable):
+            doc = cls.NLP(text)
+        return doc
+
+    @classmethod
+    def from_n2c2_annotation(cls, doc_id: str, annotation: str) -> "Entity":
+        """Creates an Entity instance from an n2c2 annotation line
+
+        Args:
+            doc_id (str): the identifier of the document the entity belongs to
+            annotation (str): the entity description in the n2c2 corpus' format
+
+        Returns:
+            Entity: the annotated entity
+
+        """
+        id, definition, text = annotation.strip().split("\t")
+        definition = definition.split()  # definition: entity type and location in text
+        type = definition[0]
+        start = int(definition[1])
+        end = int(definition[-1])
+
+        return cls(id, text, type, doc_id, start, end)
+
+    @classmethod
+    def from_ddi_annotation(cls, doc_id: str, annotation: dict) -> "Entity":
+        """Creates an Entity instance from an ddi xml annotation
+
+        Args:
+            doc_id (str): the identifier of the document the entity belongs to
+            annotation (dict): the entity description in the DDi Extraction Corpus' format
+
+        Returns:
+            Entity: the annotated entity
+        """
+        id = annotation["id"]
+        type = annotation["type"].upper()
+        text = annotation["text"]
+        char_offset = annotation["charOffset"].split("-")
+        start = int(char_offset[0])
+        end = int(char_offset[-1]) + 1
+
+        return cls(id, text, type, doc_id, start, end)
+
+    # Instance methods
+    # ----------------
+    def todict(self) -> dict:
+        """Dict representation of an entity
+
+        Returns:
+            dict: representation of the Entity
+        """
+        return {
+            "id": self.id,
+            "type": self.type,
+            "text": self.text,
+            "doc_id": self.doc_id,
+            "start": self.start,
+            "end": self.end,
+        }