"""Wrapper functions around Stanza biomedical/clinical English pipelines.

Each helper annotates raw text with a Stanza pipeline and flattens the
resulting Document into plain Python lists/tuples.  Pipelines are built
lazily and cached per configuration, so models are downloaded and loaded
at most once per process instead of on every call.
"""
import functools

import stanza


@functools.lru_cache(maxsize=None)
def _pipeline(package, ner_processor=None):
    """Download (if needed) and build a cached English Stanza pipeline.

    package: Stanza model package name, e.g. 'mimic' or 'craft'.
    ner_processor: optional NER model name (e.g. 'i2b2'); when given it is
        passed through a ``processors`` mapping to both ``stanza.download``
        and ``stanza.Pipeline``.  Passed as a plain string (not a dict) so
        the arguments stay hashable for ``lru_cache``.
    """
    if ner_processor is None:
        stanza.download('en', package=package)
        return stanza.Pipeline('en', package=package)
    processors = {'ner': ner_processor}
    stanza.download('en', package=package, processors=processors)
    return stanza.Pipeline('en', package=package, processors=processors)


def get_named_entities_stanza_biomed(text):
    """Return named entities of ``text`` as a list of (entity_text, type) tuples.

    Uses the clinical 'mimic' package with the 'i2b2' NER model; both
    elements of each tuple are str.
    """
    doc = _pipeline('mimic', ner_processor='i2b2')(text)
    return [(ent.text, ent.type) for ent in doc.entities]


def get_sents_stanza_biomed(text):
    """Return the sentences of ``text`` as a list of strings ('craft' package)."""
    doc = _pipeline('craft')(text)
    return [sentence.text for sentence in doc.sentences]


def get_tokens_stanza_biomed(text):
    """Return tokens as one list of token strings per sentence."""
    doc = _pipeline('craft')(text)
    return [[token.text for token in sentence.tokens] for sentence in doc.sentences]


def get_part_of_speech_and_morphological_features(text):
    """Return per-sentence lists of 4-tuples describing each word.

    Each tuple is (word text, universal POS (UPOS) tag, treebank-specific
    POS (XPOS) tag, universal morphological features (UFeats) string, with
    '_' substituted when the word has no features).
    """
    doc = _pipeline('craft')(text)
    return [[(word.text, word.upos, word.xpos, word.feats if word.feats else '_')
             for word in sent.words] for sent in doc.sentences]


def get_lemmas_stanza_biomed(text):
    """Return per-sentence lists of (word text, lemma) tuples."""
    doc = _pipeline('craft')(text)
    return [[(word.text, word.lemma) for word in sent.words] for sent in doc.sentences]


def get_dependency_stanza_biomed(text):
    """Return per-sentence dependency edges as 5-tuples.

    Each tuple is (word id, word text, head id, head text — or "root" when
    the head id is 0, i.e. the word attaches to the artificial root — and
    the dependency relation label).
    """
    doc = _pipeline('craft')(text)
    return [[(word.id, word.text, word.head,
              sent.words[word.head - 1].text if word.head > 0 else "root",
              word.deprel)
             for word in sent.words] for sent in doc.sentences]


def get_denpendencies(text):
    """Print each sentence's dependency parse via Stanza's built-in helper.

    NOTE(review): ``print_dependencies()`` prints to stdout and appears to
    return None, so the returned value is a list of Nones; kept as-is for
    backward compatibility.  The typo'd public name is also kept — a
    correctly spelled alias is provided in this module.
    """
    doc = _pipeline('craft')(text)
    return [sent.print_dependencies() for sent in doc.sentences]


# Correctly spelled, backward-compatible alias for the typo'd name above.
get_dependencies = get_denpendencies