import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
from scispacy.linking import EntityLinker
from negspacy.negation import Negex


def _log_input(action, model, text):
    """
    Print a short progress message: the action/model being run and the first
    five lines of the input text (truncated preview for long documents).
    """
    print(f"{action} using {model}")
    partial_input = '\n'.join(text.split('\n')[:5])
    print(f"Input text (truncated): {partial_input}\n...")


def get_abbreviations(model, text):
    """
    Detect abbreviations in *text* with scispacy's AbbreviationDetector.

    :param model: name of the spaCy/scispacy model to load
    :param text: input text to scan
    :return: list of tuples in the form (abbreviation, expanded form), each element being a str
    """
    _log_input("Identifying abbreviations", model, text)

    # abbreviation detection with scispacy
    nlp = spacy.load(model)
    nlp.add_pipe("abbreviation_detector")
    doc = nlp(text)

    return [(abrv.text, abrv._.long_form.text) for abrv in doc._.abbreviations]


def get_hyponyms(model, text):
    """
    Extract Hearst-pattern hyponym relations from *text* with scispacy's HyponymDetector.

    :param model: name of the spaCy/scispacy model to load
    :param text: input text to scan
    :return: list of tuples in the form (hearst_pattern, entity_1, entity_2, ...), each element being a str
    """
    _log_input("Extracting hyponyms", model, text)

    # hyponym detection with scispacy (extended=True enables the larger pattern set)
    nlp = spacy.load(model)
    nlp.add_pipe("hyponym_detector", last=True, config={"extended": True})
    doc = nlp(text)

    return [tuple(str(element) for element in pattern) for pattern in doc._.hearst_patterns]


def get_linked_entities(model, text):
    """
    Link named entities in *text* to the UMLS knowledge base with scispacy's EntityLinker.

    :param model: name of the spaCy/scispacy model to load
    :param text: input text to scan
    :return: dictionary in the form {named entity: list of strings each describing one piece of linked information}
    """
    _log_input("Entity linking", model, text)

    # entity linking with scispacy; resolve_abbreviations expands short forms before linking
    nlp = spacy.load(model)
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
    doc = nlp(text)

    linker = nlp.get_pipe("scispacy_linker")

    output = {}
    for entity in doc.ents:
        # NOTE(review): the dict key is the spaCy Span object itself, not its text,
        # even though the docstring says "named entity" — confirm callers expect Spans
        # before changing this to str(entity).
        output[entity] = [
            str(linker.kb.cui_to_entity[umls_ent[0]])
            for umls_ent in entity._.kb_ents
        ]

    return output


def get_named_entities(model, text):
    """
    Run named-entity recognition on *text* with the given spaCy/scispacy model.

    :param model: name of the spaCy/scispacy model to load
    :param text: input text to scan
    :return: list of strings, each string is an identified named entity
    """
    _log_input("Extracting named entities", model, text)

    # named entity recognition with scispacy
    nlp = spacy.load(model)
    doc = nlp(text)

    return [str(ent) for ent in doc.ents]


def get_negation_entities(model, text):
    """
    Determine which named entities in *text* are negated, using negspacy's Negex
    pipeline component. Default model is "en_core_web_sm".

    :param model: name of the spaCy model to load
    :param text: input text to scan
    :return: list of (is_negated, entity_text) pairs

    Example:
    >> test = get_negation_entities("en_core_web_sm","She does not like Steve Jobs but likes Apple products.")
    >> print (test)
    [(True, 'Steve Jobs'), (False, 'Apple')]
    """
    _log_input("Extracting whether Named Entities are negated", model, text)

    # negation detection with negspacy, restricted to the listed entity types
    nlp = spacy.load(model)
    nlp.add_pipe("negex", config={"ent_types": ["PERSON", "ORG", "NORP", "GPE"]})
    doc = nlp(text)

    return [(ent._.negex, ent.text) for ent in doc.ents]