diff --git a/tests/unit/test_tokenizer.py b/tests/unit/test_tokenizer.py
import docdeid as dd
import pytest

from deduce.tokenizer import DeduceTokenizer


@pytest.fixture
def tokens():
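    # Pre-tokenized "Patient was eerder opgenomen (vorig jaar) alhier.", used by test_join_tokens.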
    return [
        dd.Token(text="Patient", start_char=0, end_char=7),
        dd.Token(text="was", start_char=8, end_char=11),
        dd.Token(text="eerder", start_char=12, end_char=18),
        dd.Token(text="opgenomen", start_char=19, end_char=28),
        dd.Token(text="(", start_char=29, end_char=30),
        dd.Token(text="vorig", start_char=30, end_char=35),
        dd.Token(text="jaar", start_char=36, end_char=40),
        dd.Token(text=")", start_char=40, end_char=41),
        dd.Token(text="alhier", start_char=42, end_char=48),
        dd.Token(text=".", start_char=48, end_char=49),
    ]


class TestTokenizer:
    def test_split_alpha(self):
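        # Words separated by single spaces become individual tokens; the spaces themselves are dropped.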
        tokenizer = DeduceTokenizer()
        text = "Pieter van der Zee"
        expected_tokens = [
            dd.Token(text="Pieter", start_char=0, end_char=6),
            dd.Token(text="van", start_char=7, end_char=10),
            dd.Token(text="der", start_char=11, end_char=14),
            dd.Token(text="Zee", start_char=15, end_char=18),
        ]

        assert tokenizer._split_text(text=text) == expected_tokens

    def test_split_nonalpha(self):
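        # Punctuation and symbols ("(", "<", ")") are split off as separate tokens.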
        tokenizer = DeduceTokenizer()
        text = "prematuur (<p3)"

        expected_tokens = [
            dd.Token(text="prematuur", start_char=0, end_char=9),
            dd.Token(text="(", start_char=10, end_char=11),
            dd.Token(text="<", start_char=11, end_char=12),
            dd.Token(text="p3", start_char=12, end_char=14),
            dd.Token(text=")", start_char=14, end_char=15),
        ]

        assert tokenizer._split_text(text=text) == expected_tokens

    def test_split_multiple_spaces(self):
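        # The five-space run is expected as a token of its own; the shorter gaps between words are not.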
        tokenizer = DeduceTokenizer()
        text = "Pieter van der Zee     Bergen Op  Zoom"
        expected_tokens = [
            dd.Token(text="Pieter", start_char=0, end_char=6),
            dd.Token(text="van", start_char=7, end_char=10),
            dd.Token(text="der", start_char=11, end_char=14),
            dd.Token(text="Zee", start_char=15, end_char=18),
            dd.Token(text="     ", start_char=18, end_char=23),
            dd.Token(text="Bergen", start_char=23, end_char=29),
            dd.Token(text="Op", start_char=30, end_char=32),
            dd.Token(text="Zoom", start_char=34, end_char=38),
        ]

        assert tokenizer._split_text(text=text) == expected_tokens

    def test_split_newline(self):
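        # The newline is expected as a separate token; the single spaces around it are dropped.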
        tokenizer = DeduceTokenizer()
        text = "regel 1 \n gevolgd door regel 2"

        expected_tokens = [
            dd.Token(text="regel", start_char=0, end_char=5),
            dd.Token(text="1", start_char=6, end_char=7),
            dd.Token(text="\n", start_char=8, end_char=9),
            dd.Token(text="gevolgd", start_char=10, end_char=17),
            dd.Token(text="door", start_char=18, end_char=22),
            dd.Token(text="regel", start_char=23, end_char=28),
            dd.Token(text="2", start_char=29, end_char=30),
        ]

        assert tokenizer._split_text(text=text) == expected_tokens

    def test_join_tokens(self, tokens):
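        # Joining the first four fixture tokens should yield one token spanning the original character range 0-28.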
        text = "Patient was eerder opgenomen"
        joined_token = DeduceTokenizer()._join_tokens(text, tokens[0:4])
        expected_token = dd.Token(text=text, start_char=0, end_char=28)

        assert joined_token == expected_token

    def test_split_with_merge(self):
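        # With merge_terms, "van" and "der" are expected to come back as the single token "van der".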
        tokenizer = DeduceTokenizer(merge_terms=["van der"])
        text = "Pieter van der Zee"
        expected_tokens = [
            dd.Token(text="Pieter", start_char=0, end_char=6),
            dd.Token(text="van der", start_char=7, end_char=14),
            dd.Token(text="Zee", start_char=15, end_char=18),
        ]

        assert tokenizer._split_text(text=text) == expected_tokens