|
a |
|
b/src/nlp_pipeline.py |
|
|
1 |
# Base Dependencies |
|
|
2 |
# ----------------- |
|
|
3 |
import logging |
|
|
4 |
from typing import List |
|
|
5 |
|
|
|
6 |
# Spacy Dependencies |
|
|
7 |
# ------------------ |
|
|
8 |
from negspacy.negation import Negex |
|
|
9 |
from spacy import load as spacy_load |
|
|
10 |
from spacy.language import Language |
|
|
11 |
from spacy.tokens import Doc, Span |
|
|
12 |
|
|
|
13 |
# Constants |
|
|
14 |
# --------- |
|
|
15 |
from constants import N2C2_ENTITY_TYPES, DDI_ENTITY_TYPES |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
# Spacy's pipeline singleton: stays None until get_pipeline() first loads it.
# The annotation is quoted so that it is never evaluated at import time and
# needs no extra typing import, while still telling type checkers that the
# unloaded state (None) is a legal value.
NLP: "Language | None" = None
|
|
20 |
|
|
|
21 |
|
|
|
22 |
# Auxiliary functions |
|
|
23 |
# ------------------ |
|
|
24 |
def get_pipeline() -> Language:
    """Return the shared Spacy pipeline, building it on the first call.

    On the first call the ``en_core_sci_sm`` model is loaded with its
    ``ner`` component excluded, and a ``negex`` component is appended,
    configured with the upper-cased N2C2 and DDI entity types. The result
    is stored in the module-level ``NLP`` and handed back unchanged on
    every later call.

    Returns:
        Language: The module-wide pipeline instance.
    """
    global NLP

    # Fast path: an earlier call already built the pipeline.
    if NLP is not None:
        return NLP

    logging.warning("Loading Spacy's pipeline...")

    # Scispacy model, loaded without its NER component.
    NLP = spacy_load("en_core_sci_sm", exclude=["ner"])

    # Negation detection is restricted to these labels: the upper-cased
    # N2C2 entity types first, then the upper-cased DDI entity types.
    negex_labels = [
        label.upper()
        for group in (N2C2_ENTITY_TYPES, DDI_ENTITY_TYPES)
        for label in group
    ]
    NLP.add_pipe("negex", config={"ent_types": negex_labels})

    logging.warning("Spacy loaded!")
    return NLP
|
|
47 |
|
|
|
48 |
|
|
|
49 |
def set_spacy_entities( |
|
|
50 |
relation: Doc, |
|
|
51 |
left_tokens: Doc, |
|
|
52 |
entity1_tokens: Doc, |
|
|
53 |
entity1_type: str, |
|
|
54 |
middle_tokens: Doc, |
|
|
55 |
entity2_tokens: Doc, |
|
|
56 |
entity2_type: str, |
|
|
57 |
right_tokens: Doc, |
|
|
58 |
) -> List[Span]: |
|
|
59 |
"""_summary_ |
|
|
60 |
|
|
|
61 |
Args: |
|
|
62 |
relation (Doc): _description_ |
|
|
63 |
left_tokens (Doc): _description_ |
|
|
64 |
entity1_tokens (Doc): _description_ |
|
|
65 |
entity1_type (str): _description_ |
|
|
66 |
middle_tokens (Doc): _description_ |
|
|
67 |
entity2_tokens (Doc): _description_ |
|
|
68 |
entity2_type (str): _description_ |
|
|
69 |
right_tokens (Doc): _description_ |
|
|
70 |
|
|
|
71 |
Returns: |
|
|
72 |
List[Span]: _description_ |
|
|
73 |
""" |
|
|
74 |
|
|
|
75 |
begin_e1 = len(left_tokens) |
|
|
76 |
end_e1 = begin_e1 + len(entity1_tokens) |
|
|
77 |
|
|
|
78 |
begin_e2 = end_e1 + len(middle_tokens) |
|
|
79 |
end_e2 = begin_e2 + len(entity2_tokens) |
|
|
80 |
|
|
|
81 |
e1 = Span(relation, begin_e1, end_e1, label=entity1_type) |
|
|
82 |
e2 = Span(relation, begin_e2, end_e2, label=entity2_type) |
|
|
83 |
|
|
|
84 |
relation.ents = [e1, e2] |