[7fc5df]: tests/tokenizer/test_tokenizer_ons.py

from deidentify.tokenizer.tokenizer_ons import TokenizerOns

tokenizer = TokenizerOns()


def test_tokenizer():
    # Metadata headers such as '=== Answer: 1234 ===\n' are kept as single tokens
    # (including the trailing newline), and the abbreviation 'Mw.' is not split.
    text = '=== Answer: 1234 ===\ntest a b c d.\n=== Report: 1234 ===\nMw. test test test'
    doc = tokenizer.parse_text(text)
    tokens = [t.text for t in doc]
    assert tokens == [
        '=== Answer: 1234 ===\n', 'test', 'a', 'b', 'c', 'd.', '\n', '=== Report: 1234 ===\n',
        'Mw.', 'test', 'test', 'test'
    ]


def test_sentence_segmentation():
    # Each metadata header forms its own sentence, and the abbreviation 'Mw.'
    # does not end a sentence.
    text = '=== Answer: 1234 ===\nDit is een zin.\n=== Report: 1234 ===\nMw. heeft goed gegeten.'
    doc = tokenizer.parse_text(text)
    sents = [sent.text for sent in doc.sents]
    assert sents == [
        '=== Answer: 1234 ===\n',
        'Dit is een zin.\n',
        '=== Report: 1234 ===\n',
        'Mw. heeft goed gegeten.'
    ]

    sents = list(doc.sents)
    assert [token.text for token in sents[0]] == ['=== Answer: 1234 ===\n']
    assert [token.text for token in sents[1]] == ['Dit', 'is', 'een', 'zin', '.', '\n']
    assert [token.text for token in sents[2]] == ['=== Report: 1234 ===\n']
    assert [token.text for token in sents[3]] == ['Mw.', 'heeft', 'goed', 'gegeten', '.']


def test_infix_split_on_parenthesis():
    # Parentheses are split off as separate tokens.
    text = 'GRZ(12-12-2020).'
    doc = tokenizer.parse_text(text)
    tokens = [t.text for t in doc]
    assert tokens == 'GRZ ( 12-12-2020 ) .'.split()


def test_infix_split_on_forward_slash():
    # A forward slash between two words is split into its own token.
    text = 'Groot/Kempers'
    doc = tokenizer.parse_text(text)
    tokens = [t.text for t in doc]
    assert tokens == 'Groot / Kempers'.split()


def test_infix_split_on_forward_slash_exclude_dates():
    # Slashes inside dates are not treated as infixes; the date stays a single token.
    text = '13/01/2020'
    doc = tokenizer.parse_text(text)
    tokens = [t.text for t in doc]
    assert tokens == ['13/01/2020']
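
The file follows the usual pytest conventions (module-level test_ functions with bare assert statements), so it can be run on its own from the repository root, for example:

    pytest tests/tokenizer/test_tokenizer_ons.py

As a minimal sketch of the same API the tests exercise (only TokenizerOns, parse_text, doc.sents and token.text, all of which appear above; it assumes the deidentify package and whatever spaCy model it requires are installed), the tokenizer can also be inspected interactively:

    from deidentify.tokenizer.tokenizer_ons import TokenizerOns

    tokenizer = TokenizerOns()
    doc = tokenizer.parse_text('Mw. is gezien op 13/01/2020 in GRZ(12-12-2020).')
    print([t.text for t in doc])        # token boundaries
    print([s.text for s in doc.sents])  # sentence boundaries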