# tests/surrogates/test_dataset_deidentifier.py
1
from os.path import dirname, join
2
3
from deidentify.base import Annotation
4
from deidentify.dataset.brat import load_brat_document
5
from deidentify.surrogates.dataset_deidentifier import DatasetDeidentifier, Document
6
7
8
def _load_documents():
    """Load the example BRAT documents used by the tests in this module.

    Reads ``example-1`` and ``example-2`` from the ``data/original``
    directory located next to this file and wraps each
    ``(annotations, text)`` pair in a ``Document``.

    Returns:
        A list of ``Document`` objects, in the order listed above.
    """

    def _read(name):
        # The directory is resolved relative to this module.
        annotations, text = load_brat_document(join(dirname(__file__), 'data/original'), name)
        return Document(annotations, text)

    return [_read(name) for name in ('example-1', 'example-2')]
16
17
18
def test_dataset_deidentifier():
    """Check surrogate generation on the example documents.

    For every annotation/surrogate pair, the surrogate must differ from the
    original text, except for the ``Other`` and ``Age`` tags, where it must
    be identical. Each pair is also printed to ease debugging.
    """
    loaded_docs = _load_documents()
    deidentifier = DatasetDeidentifier()
    surrogate_docs = deidentifier.generate_surrogates(loaded_docs)

    print()
    for surrogate_doc in surrogate_docs:
        print('===================')
        pairs = surrogate_doc.annotation_surrogate_pairs()
        for annotation, surrogate in zip(*pairs):
            print('{:<30} => {:<20} ({})'.format(annotation.text, surrogate, annotation.tag))

            if annotation.tag not in ('Other', 'Age'):
                assert annotation.text != surrogate
            else:
                # 'Other' will be replaced manually, and there is no age
                # above 89 in the test corpus, so both stay unchanged.
                assert annotation.text == surrogate
34
35
36
def test_generate_surrogates_without_choices():
    """Check the single-document, single-annotation case.

    The dataset holds one document with one ``Hospital`` annotation; the
    test asserts that the generated surrogate equals the original text.
    """
    text = 'Patient is being treated at UMCU.'
    start = text.index('UMCU')
    hospital = Annotation('UMCU', start, start + 4, 'Hospital')
    doc = Document([hospital], text)

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc])
    original_annotations, surrogates = surrogate_docs[0].annotation_surrogate_pairs()

    assert len(original_annotations) == 1
    assert len(surrogates) == 1
    assert original_annotations[0].text == 'UMCU'
    assert surrogates[0] == 'UMCU'
48
49
50
def test_generate_surrogates_shuffle_choices():
    """Check the two-document case with one hospital name each.

    Each document carries a single ``Hospital`` annotation (``UMCU`` and
    ``MST``); the test asserts that each document receives the other
    document's hospital name as its surrogate.
    """

    def _hospital_doc(sentence, hospital):
        # Build a document whose only annotation spans the hospital name.
        start = sentence.index(hospital)
        annotation = Annotation(hospital, start, start + len(hospital), 'Hospital')
        return Document([annotation], sentence)

    doc_1 = _hospital_doc('Patient is being treated at UMCU.', 'UMCU')
    doc_2 = _hospital_doc('Patient is being treated at MST.', 'MST')

    surrogate_docs = DatasetDeidentifier().generate_surrogates([doc_1, doc_2])

    # (original text, expected surrogate) for each document, in input order.
    expected = [('UMCU', 'MST'), ('MST', 'UMCU')]
    for position, (original_text, expected_surrogate) in enumerate(expected):
        original_annotations, surrogates = surrogate_docs[position].annotation_surrogate_pairs()
        assert len(original_annotations) == 1 and len(surrogates) == 1
        assert original_annotations[0].text == original_text
        assert surrogates[0] == expected_surrogate