"""Wrapper functions around Stanza biomedical/clinical English pipelines.

Each helper annotates raw text with a Stanza pipeline and flattens the
resulting Document into plain Python lists/tuples.  Pipelines are built
lazily and cached per configuration, so models are downloaded and loaded
at most once per process instead of on every call.
"""
import functools

import stanza


@functools.lru_cache(maxsize=None)
def _pipeline(package, ner_processor=None):
    """Download (if needed) and build a cached English Stanza pipeline.

    package: Stanza model package name, e.g. 'mimic' or 'craft'.
    ner_processor: optional NER model name (e.g. 'i2b2'); when given it is
        passed through a ``processors`` mapping to both ``stanza.download``
        and ``stanza.Pipeline``.  Passed as a plain string (not a dict) so
        the arguments stay hashable for ``lru_cache``.
    """
    if ner_processor is None:
        stanza.download('en', package=package)
        return stanza.Pipeline('en', package=package)
    processors = {'ner': ner_processor}
    stanza.download('en', package=package, processors=processors)
    return stanza.Pipeline('en', package=package, processors=processors)


def get_named_entities_stanza_biomed(text):
    """Return named entities of ``text`` as a list of (entity_text, type) tuples.

    Uses the clinical 'mimic' package with the 'i2b2' NER model; both
    elements of each tuple are str.
    """
    doc = _pipeline('mimic', ner_processor='i2b2')(text)
    return [(ent.text, ent.type) for ent in doc.entities]


def get_sents_stanza_biomed(text):
    """Return the sentences of ``text`` as a list of strings ('craft' package)."""
    doc = _pipeline('craft')(text)
    return [sentence.text for sentence in doc.sentences]


def get_tokens_stanza_biomed(text):
    """Return tokens as one list of token strings per sentence."""
    doc = _pipeline('craft')(text)
    return [[token.text for token in sentence.tokens] for sentence in doc.sentences]


def get_part_of_speech_and_morphological_features(text):
    """Return per-sentence lists of 4-tuples describing each word.

    Each tuple is (word text, universal POS (UPOS) tag, treebank-specific
    POS (XPOS) tag, universal morphological features (UFeats) string, with
    '_' substituted when the word has no features).
    """
    doc = _pipeline('craft')(text)
    return [[(word.text, word.upos, word.xpos, word.feats if word.feats else '_')
             for word in sent.words] for sent in doc.sentences]


def get_lemmas_stanza_biomed(text):
    """Return per-sentence lists of (word text, lemma) tuples."""
    doc = _pipeline('craft')(text)
    return [[(word.text, word.lemma) for word in sent.words] for sent in doc.sentences]


def get_dependency_stanza_biomed(text):
    """Return per-sentence dependency edges as 5-tuples.

    Each tuple is (word id, word text, head id, head text — or "root" when
    the head id is 0, i.e. the word attaches to the artificial root — and
    the dependency relation label).
    """
    doc = _pipeline('craft')(text)
    return [[(word.id, word.text, word.head,
              sent.words[word.head - 1].text if word.head > 0 else "root",
              word.deprel)
             for word in sent.words] for sent in doc.sentences]


def get_denpendencies(text):
    """Print each sentence's dependency parse via Stanza's built-in helper.

    NOTE(review): ``print_dependencies()`` prints to stdout and appears to
    return None, so the returned value is a list of Nones; kept as-is for
    backward compatibility.  The typo'd public name is also kept — a
    correctly spelled alias is provided in this module.
    """
    doc = _pipeline('craft')(text)
    return [sent.print_dependencies() for sent in doc.sentences]


# Correctly spelled, backward-compatible alias for the typo'd name above.
get_dependencies = get_denpendencies