wrapper_functions/scispacy_functions.py

import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
from scispacy.linking import EntityLinker
from negspacy.negation import Negex

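# NOTE: importing AbbreviationDetector, HyponymDetector, EntityLinker and Negex is what
# registers the "abbreviation_detector", "hyponym_detector", "scispacy_linker" and "negex"
# pipeline factories used via nlp.add_pipe() below. The scispaCy models passed in as
# `model` (e.g. "en_core_sci_sm") are installed separately from the scispaCy release page.
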
def get_abbreviations(model, text):
    """
    returns a list of tuples in the form (abbreviation, expanded form), each element being a str
    """

    # logging
    print(f"Identifying abbreviations using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")

    # abbreviation detection with scispacy
    nlp = spacy.load(model)
    nlp.add_pipe("abbreviation_detector")
    doc = nlp(text)
    abbreviations = [(abrv.text, abrv._.long_form.text) for abrv in doc._.abbreviations]

    return abbreviations

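# Illustrative usage (not part of the original module); assumes a scispaCy model such as
# "en_core_sci_sm" is installed:
#   get_abbreviations("en_core_sci_sm",
#                     "Spinal muscular atrophy (SMA) is an autosomal recessive disease.")
#   expected to include the pair ("SMA", "Spinal muscular atrophy")
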
def get_hyponyms(model, text):
    """
    returns a list of tuples in the form (hearst_pattern, entity_1, entity_2, ...), each element being a str
    """

    # logging
    print(f"Extracting hyponyms using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")

    # hyponym detection with scispacy
    nlp = spacy.load(model)
    nlp.add_pipe("hyponym_detector", last=True, config={"extended": True})
    doc = nlp(text)
    hearst_patterns = [tuple([str(element) for element in pattern]) for pattern in doc._.hearst_patterns]

    return hearst_patterns

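# Illustrative usage (not part of the original module); assumes "en_core_sci_sm" is installed:
#   get_hyponyms("en_core_sci_sm",
#                "Antibiotics such as penicillin and erythromycin were administered.")
#   expected to yield Hearst-pattern tuples, e.g. one relating "antibiotics" to "penicillin"
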
def get_linked_entities(model, text):
    """
    returns a dictionary in the form {named entity: list of strings each describing one piece of linked information}
    """

    # logging
    print(f"Entity linking using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")

    # entity linking with scispacy
    output = {}

    nlp = spacy.load(model)
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
    doc = nlp(text)

    ents = doc.ents
    linker = nlp.get_pipe("scispacy_linker")

    for entity in ents:
        cur = []
        for umls_ent in entity._.kb_ents:
            cur.append(str(linker.kb.cui_to_entity[umls_ent[0]]))
        # key by the entity text so the result is a plain {str: list of str} mapping, as documented
        output[str(entity)] = cur

    return output

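# Illustrative usage (not part of the original module); assumes "en_core_sci_sm" is installed.
# Note that the first call downloads the UMLS knowledge-base files used by the linker,
# which are large (on the order of a gigabyte):
#   get_linked_entities("en_core_sci_sm", "The patient was treated for myocardial infarction.")
#   expected to map "myocardial infarction" to strings describing candidate UMLS concepts
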
def get_named_entities(model, text):
    """
    returns a list of strings, each string is an identified named entity
    """

    # logging
    print(f"Extracting named entities using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")

    # named entity recognition with scispacy
    nlp = spacy.load(model)
    doc = nlp(text)
    named_entities = [str(ent) for ent in doc.ents]

    return named_entities

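# Illustrative usage (not part of the original module); assumes "en_core_sci_sm" is installed:
#   get_named_entities("en_core_sci_sm", "Treatment with aspirin reduced the patient's fever.")
#   expected to return entity strings such as "aspirin" and "fever"
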
def get_negation_entities(model, text):
    """
    returns a list of (is_negated, entity_text) pairs; the default model is "en_core_web_sm"
    Negspacy is a spaCy pipeline component that evaluates whether Named Entities are negated in text.
    Example:
    >>> test = get_negation_entities("en_core_web_sm", "She does not like Steve Jobs but likes Apple products.")
    >>> print(test)
    [(True, 'Steve Jobs'), (False, 'Apple')]
    """

    # logging
    print(f"Extracting whether Named Entities are negated using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")

    # negation detection with negspacy
    nlp = spacy.load(model)
    nlp.add_pipe("negex", config={"ent_types": ["PERSON", "ORG", "NORP", "GPE"]})
    doc = nlp(text)
    pairs = [(ent._.negex, ent.text) for ent in doc.ents]

    return pairs
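

# Minimal runnable sketch (not part of the original module); assumes the general-purpose
# spaCy model "en_core_web_sm" is installed (python -m spacy download en_core_web_sm).
# Mirrors the docstring example above and only runs when the file is executed directly.
if __name__ == "__main__":
    demo_pairs = get_negation_entities(
        "en_core_web_sm",
        "She does not like Steve Jobs but likes Apple products.",
    )
    print(demo_pairs)  # expected: [(True, 'Steve Jobs'), (False, 'Apple')]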