tests/unit/test_annotator.py

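"""Unit tests for deduce's annotators and the pattern position matcher."""
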
import re
from unittest.mock import patch

import docdeid as dd
import pytest

from deduce.annotator import (
    BsnAnnotator,
    ContextAnnotator,
    PatientNameAnnotator,
    PhoneNumberAnnotator,
    RegexpPseudoAnnotator,
    TokenPatternAnnotator,
    _PatternPositionMatcher,
)
from deduce.person import Person
from deduce.tokenizer import DeduceTokenizer
from tests.helpers import linked_tokens


@pytest.fixture
def ds():
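    """Build a small DsCollection with first-name and surname lookup sets."""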
    ds = dd.ds.DsCollection()

    first_names = ["Andries", "pieter", "Aziz", "Bernard"]
    surnames = ["Meijer", "Smit", "Bakker", "Heerma"]

    ds["first_names"] = dd.ds.LookupSet()
    ds["first_names"].add_items_from_iterable(items=first_names)

    ds["surnames"] = dd.ds.LookupSet()
    ds["surnames"].add_items_from_iterable(items=surnames)

    return ds


@pytest.fixture
def tokenizer():
    return DeduceTokenizer()


@pytest.fixture
def regexp_pseudo_doc(tokenizer):

    return dd.Document(
        text="De patient is Na 12 jaar gestopt met medicijnen.",
        tokenizers={"default": tokenizer},
    )


@pytest.fixture
def pattern_doc(tokenizer):
    return dd.Document(
        text="De man heet Andries Meijer-Heerma, voornaam Andries.",
        tokenizers={"default": tokenizer},
    )


@pytest.fixture
def bsn_doc():
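    # 111222333 and 123456782 pass the elfproef; 01234 and 01234567890 do not
    # have the nine digits required of a BSN.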
    d = dd.DocDeid()

    return d.deidentify(
        text="Geldige voorbeelden zijn: 111222333 en 123456782. "
        "Patientnummer is 01234, en ander id 01234567890."
    )


@pytest.fixture
def phone_number_doc():
    d = dd.DocDeid()

    return d.deidentify(
        text="Telefoonnummers zijn 0314-555555, (088 755 55 55) of (06)55555555, "
        "maar 065555 is te kort en 065555555555 is te lang. "
        "Verwijsnummer is 0800-9003."
    )


@pytest.fixture
def surname_pattern():
    return linked_tokens(["Van der", "Heide", "-", "Ginkel"])


def token(text: str):
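    """Wrap a string in a standalone dd.Token starting at character 0."""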
    return dd.Token(text=text, start_char=0, end_char=len(text))


class TestPositionMatcher:
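    """Tests for the individual _PatternPositionMatcher predicates."""
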
    def test_equal(self):
        assert _PatternPositionMatcher.match({"equal": "test"}, token=token("test"))
        assert not _PatternPositionMatcher.match({"equal": "_"}, token=token("test"))

    def test_re_match(self):
        assert _PatternPositionMatcher.match({"re_match": "[a-z]"}, token=token("abc"))
        assert _PatternPositionMatcher.match(
            {"re_match": "[a-z]"}, token=token("abc123")
        )
        assert not _PatternPositionMatcher.match({"re_match": "[a-z]"}, token=token(""))
        assert not _PatternPositionMatcher.match(
            {"re_match": "[a-z]"}, token=token("123")
        )
        assert not _PatternPositionMatcher.match(
            {"re_match": "[a-z]"}, token=token("123abc")
        )

    def test_is_initials(self):

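        # The matcher treats one to four uppercase letters as initials.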
        assert _PatternPositionMatcher.match({"is_initials": True}, token=token("A"))
        assert _PatternPositionMatcher.match({"is_initials": True}, token=token("AB"))
        assert _PatternPositionMatcher.match({"is_initials": True}, token=token("ABC"))
        assert _PatternPositionMatcher.match({"is_initials": True}, token=token("ABCD"))
        assert not _PatternPositionMatcher.match(
            {"is_initials": True}, token=token("ABCDE")
        )
        assert not _PatternPositionMatcher.match({"is_initials": True}, token=token(""))
        assert not _PatternPositionMatcher.match(
            {"is_initials": True}, token=token("abcd")
        )
        assert not _PatternPositionMatcher.match(
            {"is_initials": True}, token=token("abcde")
        )

    def test_match_like_name(self):
        pattern_position = {"like_name": True}

        assert _PatternPositionMatcher.match(pattern_position, token=token("Diederik"))
        assert not _PatternPositionMatcher.match(pattern_position, token=token("Le"))
        assert not _PatternPositionMatcher.match(
            pattern_position, token=token("diederik")
        )
        assert not _PatternPositionMatcher.match(
            pattern_position, token=token("Diederik3")
        )

    def test_match_lookup(self, ds):
        assert _PatternPositionMatcher.match(
            {"lookup": "first_names"}, token=token("Andries"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"lookup": "first_names"}, token=token("andries"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"lookup": "surnames"}, token=token("Andries"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"lookup": "first_names"}, token=token("Smit"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"lookup": "surnames"}, token=token("Smit"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"lookup": "surnames"}, token=token("smit"), ds=ds
        )

    def test_match_neg_lookup(self, ds):
        assert not _PatternPositionMatcher.match(
            {"neg_lookup": "first_names"}, token=token("Andries"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"neg_lookup": "first_names"}, token=token("andries"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"neg_lookup": "surnames"}, token=token("Andries"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"neg_lookup": "first_names"}, token=token("Smit"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"neg_lookup": "surnames"}, token=token("Smit"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"neg_lookup": "surnames"}, token=token("smit"), ds=ds
        )

    def test_match_and(self, ds):
        assert _PatternPositionMatcher.match(
            {"and": [{"equal": "Abcd"}, {"like_name": True}]},
            token=token("Abcd"),
            ds=ds,
        )
        assert not _PatternPositionMatcher.match(
            {"and": [{"equal": "dcef"}, {"like_name": True}]},
            token=token("Abcd"),
            ds=ds,
        )
        assert not _PatternPositionMatcher.match(
            {"and": [{"equal": "A"}, {"like_name": True}]}, token=token("A"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"and": [{"equal": "b"}, {"like_name": True}]}, token=token("a"), ds=ds
        )

    def test_match_or(self, ds):
        assert _PatternPositionMatcher.match(
            {"or": [{"equal": "Abcd"}, {"like_name": True}]}, token=token("Abcd"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"or": [{"equal": "dcef"}, {"like_name": True}]}, token=token("Abcd"), ds=ds
        )
        assert _PatternPositionMatcher.match(
            {"or": [{"equal": "A"}, {"like_name": True}]}, token=token("A"), ds=ds
        )
        assert not _PatternPositionMatcher.match(
            {"or": [{"equal": "b"}, {"like_name": True}]}, token=token("a"), ds=ds
        )


class TestTokenPatternAnnotator:
    def test_match_sequence(self, pattern_doc, ds):
        pattern = [{"lookup": "first_names"}, {"like_name": True}]

        tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_")

        assert tpa._match_sequence(
            pattern_doc.text, start_token=pattern_doc.get_tokens()[3], pattern=pattern
        ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_")
        assert (
            tpa._match_sequence(
                pattern_doc.text,
                start_token=pattern_doc.get_tokens()[7],
                pattern=pattern,
            )
            is None
        )

    def test_match_sequence_left(self, pattern_doc, ds):
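        # direction="left" applies the pattern backwards: the match ends at
        # start_token and extends over the tokens before it.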
        pattern = [{"lookup": "first_names"}, {"like_name": True}]

        tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_")

        assert tpa._match_sequence(
            pattern_doc.text,
            start_token=pattern_doc.get_tokens()[4],
            pattern=pattern,
            direction="left",
        ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_")

        assert (
            tpa._match_sequence(
                pattern_doc.text,
                start_token=pattern_doc.get_tokens()[8],
                direction="left",
                pattern=pattern,
            )
            is None
        )

    def test_match_sequence_skip(self, pattern_doc, ds):
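        # Tokens listed in `skip` (here the hyphen) may be stepped over, so
        # "Meijer-Heerma" satisfies a two-position pattern.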
        pattern = [{"lookup": "surnames"}, {"like_name": True}]

        tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_")

        assert tpa._match_sequence(
            pattern_doc.text,
            start_token=pattern_doc.get_tokens()[4],
            pattern=pattern,
            skip={"-"},
        ) == dd.Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_")
        assert (
            tpa._match_sequence(
                pattern_doc.text,
                start_token=pattern_doc.get_tokens()[4],
                pattern=pattern,
                skip=set(),
            )
            is None
        )

    def test_annotate(self, pattern_doc, ds):
        pattern = [{"lookup": "first_names"}, {"like_name": True}]

        tpa = TokenPatternAnnotator(pattern=pattern, ds=ds, tag="_")

        assert tpa.annotate(pattern_doc) == [
            dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_")
        ]


class TestContextAnnotator:
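    # ContextAnnotator extends annotations whose tag matches pre_tag with
    # neighbouring tokens that fit the context pattern, rewriting the tag via
    # the "{tag}" template (e.g. "voornaam" -> "voornaam+naam").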
    def test_apply_context_pattern(self, pattern_doc):
        annotator = ContextAnnotator(pattern=[])

        annotations = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Andries",
                    start_char=12,
                    end_char=19,
                    tag="voornaam",
                    start_token=pattern_doc.get_tokens()[3],
                    end_token=pattern_doc.get_tokens()[3],
                )
            ]
        )

        assert annotator._apply_context_pattern(
            pattern_doc.text,
            annotations,
            {
                "pattern": [{"like_name": True}],
                "direction": "right",
                "pre_tag": "voornaam",
                "tag": "{tag}+naam",
            },
        ) == dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Andries Meijer",
                    start_char=12,
                    end_char=26,
                    tag="voornaam+naam",
                )
            ]
        )

    def test_apply_context_pattern_left(self, pattern_doc):
        annotator = ContextAnnotator(pattern=[])

        annotations = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Meijer",
                    start_char=20,
                    end_char=26,
                    tag="achternaam",
                    start_token=pattern_doc.get_tokens()[4],
                    end_token=pattern_doc.get_tokens()[4],
                )
            ]
        )

        assert annotator._apply_context_pattern(
            pattern_doc.text,
            annotations,
            {
                "pattern": [{"like_name": True}],
                "direction": "left",
                "pre_tag": "achternaam",
                "tag": "naam+{tag}",
            },
        ) == dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Andries Meijer",
                    start_char=12,
                    end_char=26,
                    tag="naam+achternaam",
                )
            ]
        )

    def test_apply_context_pattern_skip(self, pattern_doc):
        annotator = ContextAnnotator(pattern=[])

        annotations = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Meijer",
                    start_char=20,
                    end_char=26,
                    tag="achternaam",
                    start_token=pattern_doc.get_tokens()[4],
                    end_token=pattern_doc.get_tokens()[4],
                )
            ]
        )

        assert annotator._apply_context_pattern(
            pattern_doc.text,
            annotations,
            {
                "pattern": [{"like_name": True}],
                "direction": "right",
                "skip": ["-"],
                "pre_tag": "achternaam",
                "tag": "{tag}+naam",
            },
        ) == dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Meijer-Heerma",
                    start_char=20,
                    end_char=33,
                    tag="achternaam+naam",
                )
            ]
        )

    def test_annotate_multiple(self, pattern_doc):
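        # With iterative=False the pattern list is applied in a single pass;
        # the two patterns together extend "Andries" to "Andries Meijer-Heerma".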
        pattern = [
            {
                "pattern": [{"like_name": True}],
                "direction": "right",
                "pre_tag": "voornaam",
                "tag": "{tag}+naam",
            },
            {
                "pattern": [{"like_name": True}],
                "direction": "right",
                "skip": ["-"],
                "pre_tag": "achternaam",
                "tag": "{tag}+naam",
            },
        ]

        annotator = ContextAnnotator(pattern=pattern, iterative=False)

        annotations = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Andries",
                    start_char=12,
                    end_char=19,
                    tag="voornaam",
                    start_token=pattern_doc.get_tokens()[3],
                    end_token=pattern_doc.get_tokens()[3],
                )
            ]
        )

        assert annotator._annotate(pattern_doc.text, annotations) == dd.AnnotationSet(
            {
                dd.Annotation(
                    text="Andries Meijer-Heerma",
                    start_char=12,
                    end_char=33,
                    tag="voornaam+naam+naam",
                )
            }
        )

    def test_annotate_iterative(self, pattern_doc):
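        # With iterative=True a single context pattern is reapplied until the
        # annotation set stops changing, extending the match token by token.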
        pattern = [
            {
                "pattern": [{"like_name": True}],
                "direction": "right",
                "skip": ["-"],
                "pre_tag": ["naam", "voornaam"],
                "tag": "{tag}+naam",
            }
        ]

        annotator = ContextAnnotator(pattern=pattern, iterative=True)

        annotations = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Andries",
                    start_char=12,
                    end_char=19,
                    tag="voornaam",
                    start_token=pattern_doc.get_tokens()[3],
                    end_token=pattern_doc.get_tokens()[3],
                )
            ]
        )

        assert annotator._annotate(pattern_doc.text, annotations) == dd.AnnotationSet(
            {
                dd.Annotation(
                    text="Andries Meijer-Heerma",
                    start_char=12,
                    end_char=33,
                    tag="voornaam+naam+naam",
                )
            }
        )


class TestPatientNameAnnotator:
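    # PatientNameAnnotator matches the patient's own name, taken from the
    # Person object in the document metadata rather than from lookup lists.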
    def test_match_first_name_multiple(self, tokenizer):

        metadata = {"patient": Person(first_names=["Jan", "Adriaan"])}
        tokens = linked_tokens(["Jan", "Adriaan"])
        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_first_names(doc=doc, token=tokens[0]) == (
            tokens[0],
            tokens[0],
        )

        assert ann._match_first_names(doc=doc, token=tokens[1]) == (
            tokens[1],
            tokens[1],
        )

    def test_match_first_name_fuzzy(self, tokenizer):

        metadata = {"patient": Person(first_names=["Adriaan"])}
        tokens = linked_tokens(["Adriana"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_first_names(doc=doc, token=tokens[0]) == (
            tokens[0],
            tokens[0],
        )

    def test_match_first_name_fuzzy_short(self, tokenizer):

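        # Fuzzy matching is disabled for short names: "Dan" must not match "Jan".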
        metadata = {"patient": Person(first_names=["Jan"])}
        tokens = linked_tokens(["Dan"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_first_names(doc=doc, token=tokens[0]) is None

    def test_match_initial_from_name(self, tokenizer):

        metadata = {"patient": Person(first_names=["Jan", "Adriaan"])}
        tokens = linked_tokens(["A", "J"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_initial_from_name(doc=doc, token=tokens[0]) == (
            tokens[0],
            tokens[0],
        )

        assert ann._match_initial_from_name(doc=doc, token=tokens[1]) == (
            tokens[1],
            tokens[1],
        )

    def test_match_initial_from_name_with_period(self, tokenizer):

        metadata = {"patient": Person(first_names=["Jan", "Adriaan"])}
        tokens = linked_tokens(["J", ".", "A", "."])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_initial_from_name(doc=doc, token=tokens[0]) == (
            tokens[0],
            tokens[1],
        )

        assert ann._match_initial_from_name(doc=doc, token=tokens[2]) == (
            tokens[2],
            tokens[3],
        )

    def test_match_initial_from_name_no_match(self, tokenizer):

        metadata = {"patient": Person(first_names=["Jan", "Adriaan"])}
        tokens = linked_tokens(["F", "T"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_initial_from_name(doc=doc, token=tokens[0]) is None
        assert ann._match_initial_from_name(doc=doc, token=tokens[1]) is None

    def test_match_initials(self, tokenizer):

        metadata = {"patient": Person(initials="AFTH")}
        tokens = linked_tokens(["AFTH", "THFA"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        assert ann._match_initials(doc=doc, token=tokens[0]) == (tokens[0], tokens[0])
        assert ann._match_initials(doc=doc, token=tokens[1]) is None

    def test_match_surname_equal(self, tokenizer, surname_pattern):

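        # The tokenizer is patched so that tokenizing the patient's surname
        # yields the prepared surname_pattern tokens.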
        metadata = {"surname_pattern": surname_pattern}
        tokens = linked_tokens(["Van der", "Heide", "-", "Ginkel", "is", "de", "naam"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        with patch.object(tokenizer, "tokenize", return_value=surname_pattern):

            assert ann._match_surname(doc=doc, token=tokens[0]) == (
                tokens[0],
                tokens[3],
            )

    def test_match_surname_longer_than_tokens(self, tokenizer, surname_pattern):

        metadata = {"surname_pattern": surname_pattern}
        tokens = linked_tokens(["Van der", "Heide"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        with patch.object(tokenizer, "tokenize", return_value=surname_pattern):

            assert ann._match_surname(doc=doc, token=tokens[0]) is None

    def test_match_surname_fuzzy(self, tokenizer, surname_pattern):

        metadata = {"surname_pattern": surname_pattern}
        tokens = linked_tokens(["Van der", "Heijde", "-", "Ginkle", "is", "de", "naam"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        with patch.object(tokenizer, "tokenize", return_value=surname_pattern):

            assert ann._match_surname(doc=doc, token=tokens[0]) == (
                tokens[0],
                tokens[3],
            )

    def test_match_surname_unequal_first(self, tokenizer, surname_pattern):

        metadata = {"surname_pattern": surname_pattern}
        tokens = linked_tokens(["v/der", "Heide", "-", "Ginkel", "is", "de", "naam"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        with patch.object(tokenizer, "tokenize", return_value=surname_pattern):

            assert ann._match_surname(doc=doc, token=tokens[0]) is None

    def test_match_surname_unequal_first_fuzzy(self, tokenizer, surname_pattern):

        metadata = {"surname_pattern": surname_pattern}
        tokens = linked_tokens(["Van den", "Heide", "-", "Ginkel", "is", "de", "naam"])

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text="_", metadata=metadata)

        with patch.object(tokenizer, "tokenize", return_value=surname_pattern):

            assert ann._match_surname(doc=doc, token=tokens[0]) == (
                tokens[0],
                tokens[3],
            )

    def test_annotate_first_name(self, tokenizer):

        metadata = {
            "patient": Person(
                first_names=["Jan", "Johan"], initials="JJ", surname="Jansen"
            )
        }
        text = "De patient heet Jan"
        tokens = tokenizer.tokenize(text)

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text=text, metadata=metadata)

        with patch.object(doc, "get_tokens", return_value=tokens):
            with patch.object(
                tokenizer, "tokenize", return_value=linked_tokens(["Jansen"])
            ):
                annotations = ann.annotate(doc)

        assert annotations == [
            dd.Annotation(
                text="Jan",
                start_char=16,
                end_char=19,
                tag="voornaam_patient",
            )
        ]

    def test_annotate_initials_from_name(self, tokenizer):

        metadata = {
            "patient": Person(
                first_names=["Jan", "Johan"], initials="JJ", surname="Jansen"
            )
        }
        text = "De patient heet JJ"
        tokens = tokenizer.tokenize(text)

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text=text, metadata=metadata)

        with patch.object(doc, "get_tokens", return_value=tokens):
            with patch.object(
                tokenizer, "tokenize", return_value=linked_tokens(["Jansen"])
            ):
                annotations = ann.annotate(doc)

        assert annotations == [
            dd.Annotation(
                text="JJ",
                start_char=16,
                end_char=18,
                tag="initiaal_patient",
            )
        ]

    def test_annotate_initial(self, tokenizer):

        metadata = {
            "patient": Person(
                first_names=["Jan", "Johan"], initials="JJ", surname="Jansen"
            )
        }
        text = "De patient heet J."
        tokens = tokenizer.tokenize(text)

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text=text, metadata=metadata)

        with patch.object(doc, "get_tokens", return_value=tokens):
            with patch.object(
                tokenizer, "tokenize", return_value=linked_tokens(["Jansen"])
            ):
                annotations = ann.annotate(doc)

        assert annotations == [
            dd.Annotation(
                text="J.",
                start_char=16,
                end_char=18,
                tag="initiaal_patient",
            )
        ]

    def test_annotate_surname(self, tokenizer):

        metadata = {
            "patient": Person(
                first_names=["Jan", "Johan"], initials="JJ", surname="Jansen"
            )
        }
        text = "De patient heet Jansen"
        tokens = tokenizer.tokenize(text)

        ann = PatientNameAnnotator(tokenizer=tokenizer, tag="_")
        doc = dd.Document(text=text, metadata=metadata)

        with patch.object(doc, "get_tokens", return_value=tokens):
            with patch.object(
                tokenizer, "tokenize", return_value=linked_tokens(["Jansen"])
            ):
                annotations = ann.annotate(doc)

        assert annotations == [
            dd.Annotation(
                text="Jansen",
                start_char=16,
                end_char=22,
                tag="achternaam_patient",
            )
        ]


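# RegexpPseudoAnnotator drops regexp matches whose neighbouring words are
# configured "pseudo" triggers (pre_pseudo before the match, post_pseudo after).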
class TestRegexpPseudoAnnotator:
    def test_is_word_char(self):

        assert RegexpPseudoAnnotator._is_word_char("a")
        assert RegexpPseudoAnnotator._is_word_char("abc")
        assert not RegexpPseudoAnnotator._is_word_char("123")
        assert not RegexpPseudoAnnotator._is_word_char(" ")
        assert not RegexpPseudoAnnotator._is_word_char("\n")
        assert not RegexpPseudoAnnotator._is_word_char(".")

    def test_get_previous_word(self):

        r = RegexpPseudoAnnotator(regexp_pattern="_", tag="_")

        assert r._get_previous_word(0, "12 jaar") == ""
        assert r._get_previous_word(1, "<12 jaar") == ""
        assert r._get_previous_word(8, "patient 12 jaar") == "patient"
        assert r._get_previous_word(7, "(sinds 12 jaar)") == "sinds"
        assert r._get_previous_word(11, "patient is 12 jaar)") == "is"

    def test_get_next_word(self):

        r = RegexpPseudoAnnotator(regexp_pattern="_", tag="_")

        assert r._get_next_word(7, "12 jaar") == ""
        assert r._get_next_word(7, "12 jaar, geleden") == ""
        assert r._get_next_word(7, "12 jaar geleden") == "geleden"
        assert r._get_next_word(7, "12 jaar geleden geopereerd") == "geleden"

    def test_validate_match(self, regexp_pseudo_doc):

        r = RegexpPseudoAnnotator(regexp_pattern="_", tag="_")
        pattern = re.compile(r"\d+ jaar")

        match = list(pattern.finditer(regexp_pseudo_doc.text))[0]

        assert r._validate_match(match, regexp_pseudo_doc)

    def test_validate_match_pre(self, regexp_pseudo_doc):

        r = RegexpPseudoAnnotator(
            regexp_pattern="_", tag="_", pre_pseudo=["sinds", "al", "vanaf"]
        )
        pattern = re.compile(r"\d+ jaar")

        match = list(pattern.finditer(regexp_pseudo_doc.text))[0]

        assert r._validate_match(match, regexp_pseudo_doc)

    def test_validate_match_post(self, regexp_pseudo_doc):

        r = RegexpPseudoAnnotator(
            regexp_pattern="_", tag="_", post_pseudo=["geleden", "getrouwd", "gestopt"]
        )
        pattern = re.compile(r"\d+ jaar")

        match = list(pattern.finditer(regexp_pseudo_doc.text))[0]

        assert not r._validate_match(match, regexp_pseudo_doc)

    def test_validate_match_lower(self, regexp_pseudo_doc):

        r = RegexpPseudoAnnotator(
            regexp_pattern="_", tag="_", pre_pseudo=["na"], lowercase=True
        )
        pattern = re.compile(r"\d+ jaar")

        match = list(pattern.finditer(regexp_pseudo_doc.text))[0]

        assert not r._validate_match(match, regexp_pseudo_doc)


class TestBsnAnnotator:
    def test_elfproef(self):
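        # The "elfproef" (11-test) is the weighted checksum that a valid
        # 9-digit BSN must satisfy.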
        an = BsnAnnotator(bsn_regexp=r"(\D|^)(\d{9})(\D|$)", capture_group=2, tag="_")

        assert an._elfproef("111222333")
        assert not an._elfproef("111222334")
        assert an._elfproef("123456782")
        assert not an._elfproef("123456783")

    def test_elfproef_wrong_length(self):
        an = BsnAnnotator(bsn_regexp=r"(\D|^)(\d{9})(\D|$)", capture_group=2, tag="_")

        with pytest.raises(ValueError):
            an._elfproef("12345678")

    def test_elfproef_non_numeric(self):
        an = BsnAnnotator(bsn_regexp=r"(\D|^)(\d{9})(\D|$)", capture_group=2, tag="_")

        with pytest.raises(ValueError):
            an._elfproef("test")

    def test_annotate(self, bsn_doc):
        an = BsnAnnotator(bsn_regexp=r"(\D|^)(\d{9})(\D|$)", capture_group=2, tag="_")
        annotations = an.annotate(bsn_doc)

        expected_annotations = [
            dd.Annotation(text="111222333", start_char=26, end_char=35, tag="_"),
            dd.Annotation(text="123456782", start_char=39, end_char=48, tag="_"),
        ]

        assert annotations == expected_annotations

    def test_annotate_with_nondigits(self):
        an = BsnAnnotator(bsn_regexp=r"\d{4}\.\d{2}\.\d{3}", tag="_")
        doc = dd.Document("1234.56.782")
        annotations = an.annotate(doc)

        expected_annotations = [
            dd.Annotation(text="1234.56.782", start_char=0, end_char=11, tag="_"),
        ]

        assert annotations == expected_annotations


class TestPhoneNumberAnnotator:
    def test_annotate_defaults(self, phone_number_doc):
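        # The regexp accepts a Dutch prefix (0031, +31, or a leading 0) plus a
        # known area code, followed by groups of subscriber digits.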
        an = PhoneNumberAnnotator(
            phone_regexp=r"(?<!\d)"
            r"(\(?(0031|\+31|0)"
            r"(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|"
            r"[1-5]\d{2})\)?)"
            r" ?-? ?"
            r"((\d{2,4}[ -]?)+\d{2,4})",
            tag="_",
        )
        annotations = an.annotate(phone_number_doc)

        expected_annotations = [
            dd.Annotation(text="0314-555555", start_char=21, end_char=32, tag="_"),
            dd.Annotation(text="088 755 55 55", start_char=35, end_char=48, tag="_"),
            dd.Annotation(text="(06)55555555", start_char=53, end_char=65, tag="_"),
            dd.Annotation(text="0800-9003", start_char=135, end_char=144, tag="_"),
        ]

        assert annotations == expected_annotations

    def test_annotate_short(self, phone_number_doc):
        an = PhoneNumberAnnotator(
            phone_regexp=r"(?<!\d)"
            r"(\(?(0031|\+31|0)"
            r"(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|"
            r"[1-5]\d{2})\)?)"
            r" ?-? ?"
            r"((\d{2,4}[ -]?)+\d{2,4})",
            min_digits=4,
            max_digits=8,
            tag="_",
        )
        annotations = an.annotate(phone_number_doc)

        expected_annotations = [
            dd.Annotation(text="065555", start_char=72, end_char=78, tag="_")
        ]

        assert annotations == expected_annotations

    def test_annotate_long(self, phone_number_doc):
        an = PhoneNumberAnnotator(
            phone_regexp=r"(?<!\d)"
            r"(\(?(0031|\+31|0)"
            r"(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|"
            r"[1-5]\d{2})\)?)"
            r" ?-? ?"
            r"((\d{2,4}[ -]?)+\d{2,4})",
            min_digits=11,
            max_digits=12,
            tag="_",
        )
        annotations = an.annotate(phone_number_doc)

        expected_annotations = [
            dd.Annotation(text="065555555555", start_char=93, end_char=105, tag="_")
        ]

        assert annotations == expected_annotations