|
a |
|
b/wrapper_functions/utils.py |
|
|
1 |
from PyRuSH import RuSH |
|
|
2 |
import stanza |
|
|
3 |
import scispacy |
|
|
4 |
import spacy |
|
|
5 |
|
|
|
6 |
def get_sents_pyrush(text, rules_path='conf/rush_rules.tsv'):
    """Segment *text* into sentence spans using PyRuSH.

    Args:
        text: The raw document text to segment.
        rules_path: Path to the RuSH rule file (TSV). Defaults to the
            project's bundled ``conf/rush_rules.tsv`` for backward
            compatibility.

    Returns:
        A list of PyRuSH sentence span objects (offsets into *text*),
        not the sentence strings themselves.
    """
    print("Segment into sentences using PyRuSH")
    rush = RuSH(rules_path)
    sentences = rush.segToSentenceSpans(text)
    return sentences
|
|
11 |
|
|
|
12 |
def get_sents_stanza(text):
    """Segment *text* into sentences using the Stanza English tokenizer.

    The English model download and `stanza.Pipeline` construction are
    expensive (network check + model load), so they are performed only on
    the first call and cached on the function object; subsequent calls
    reuse the same pipeline.

    Args:
        text: The raw document text to segment.

    Returns:
        A list of sentence strings.
    """
    if not hasattr(get_sents_stanza, "_nlp"):
        # First call only: fetch the English model (no-op if already
        # present on disk) and build the tokenize-only pipeline once.
        stanza.download('en')
        get_sents_stanza._nlp = stanza.Pipeline(lang='en', processors='tokenize')
    nlp = get_sents_stanza._nlp
    return [sentence.text for sentence in nlp(text).sentences]
|
|
17 |
|
|
|
18 |
def get_multiple_sents_stanza(texts):
    """Segment each document in *texts* into sentences using Stanza.

    The English model download and `stanza.Pipeline` construction are
    expensive (network check + model load), so they are performed only on
    the first call and cached on the function object; subsequent calls
    reuse the same pipeline.

    Args:
        texts: An iterable of raw document strings.

    Returns:
        A list with one entry per document, each entry being the list of
        sentence strings for that document.
    """
    if not hasattr(get_multiple_sents_stanza, "_nlp"):
        # First call only: fetch the English model (no-op if already
        # present on disk) and build the tokenize-only pipeline once.
        stanza.download('en')
        get_multiple_sents_stanza._nlp = stanza.Pipeline(lang='en', processors='tokenize')
    nlp = get_multiple_sents_stanza._nlp
    return [[sentence.text for sentence in nlp(text).sentences] for text in texts]
|
|
23 |
|
|
|
24 |
def get_sents_scispacy(text):
    """Segment *text* into sentences using the scispaCy small model.

    `spacy.load("en_core_sci_sm")` reads the whole model from disk, so it
    is performed only on the first call and cached on the function object;
    subsequent calls reuse the loaded pipeline.

    Args:
        text: The raw document text to segment.

    Returns:
        A list of sentence strings.
    """
    if not hasattr(get_sents_scispacy, "_nlp"):
        # Load the scispaCy model once; this is the expensive step.
        get_sents_scispacy._nlp = spacy.load("en_core_sci_sm")
    doc = get_sents_scispacy._nlp(text)
    return [sentence.text for sentence in doc.sents]