|
a |
|
b/tests/surrogates/test_dataset_deidentifier.py |
|
|
1 |
from os.path import dirname, join |
|
|
2 |
|
|
|
3 |
from deidentify.base import Annotation |
|
|
4 |
from deidentify.dataset.brat import load_brat_document |
|
|
5 |
from deidentify.surrogates.dataset_deidentifier import DatasetDeidentifier, Document |
|
|
6 |
|
|
|
7 |
|
|
|
8 |
def _load_documents():
    """Load the two example BRAT documents used by the tests.

    The documents 'example-1' and 'example-2' are read from the
    ``data/original`` directory located next to this test module.

    Returns:
        A list with one ``Document`` per example, in the order listed above.
    """
    data_dir = join(dirname(__file__), 'data/original')
    # load_brat_document yields (annotations, text), which is exactly the
    # positional argument order Document is constructed with.
    return [
        Document(*load_brat_document(data_dir, name))
        for name in ('example-1', 'example-2')
    ]
|
|
16 |
|
|
|
17 |
|
|
|
18 |
def test_dataset_deidentifier():
    """Generate surrogates for the example corpus and check each replacement.

    Every annotation/surrogate pair is printed for manual inspection. The
    surrogate must differ from the annotated text, except for the tags listed
    in ``unchanged_tags`` where it must be identical.
    """
    # 'Other' will be replaced manually, and there is no age above 89 in the
    # test corpus, so both tags are expected to keep their original text.
    unchanged_tags = ('Other', 'Age')
    row_format = '{:<30} => {:<20} ({})'

    source_docs = _load_documents()
    deidentifier = DatasetDeidentifier()
    surrogate_docs = deidentifier.generate_surrogates(source_docs)

    print()
    for surrogate_doc in surrogate_docs:
        print('===================')
        pairs = zip(*surrogate_doc.annotation_surrogate_pairs())
        for original, replacement in pairs:
            print(row_format.format(original.text, replacement, original.tag))

            if original.tag not in unchanged_tags:
                assert original.text != replacement
            else:
                assert original.text == replacement
|
|
34 |
|
|
|
35 |
|
|
|
36 |
def test_generate_surrogates_without_choices():
    """Check surrogate generation for a single document with one annotation.

    With only one 'Hospital' annotation in the whole input, the test expects
    the surrogate to be the original text 'UMCU' itself.
    """
    hospital = 'UMCU'
    text = 'Patient is being treated at UMCU.'
    start = text.index(hospital)
    annotation = Annotation(hospital, start, start + len(hospital), 'Hospital')

    generated = DatasetDeidentifier().generate_surrogates([Document([annotation], text)])
    surrogate_doc = generated[0]

    original_annotations, surrogates = surrogate_doc.annotation_surrogate_pairs()
    assert len(original_annotations) == 1
    assert len(surrogates) == 1
    assert original_annotations[0].text == 'UMCU'
    assert surrogates[0] == 'UMCU'
|
|
48 |
|
|
|
49 |
|
|
|
50 |
def test_generate_surrogates_shuffle_choices():
    """Check surrogate generation for two documents with one hospital each.

    Each document carries a single 'Hospital' annotation ('UMCU' and 'MST').
    The test expects the surrogates to be swapped between the two documents.
    """
    # (document text, annotated hospital, expected surrogate)
    cases = [
        ('Patient is being treated at UMCU.', 'UMCU', 'MST'),
        ('Patient is being treated at MST.', 'MST', 'UMCU'),
    ]

    docs = []
    for text, hospital, _ in cases:
        start = text.index(hospital)
        annotation = Annotation(hospital, start, start + len(hospital), 'Hospital')
        docs.append(Document([annotation], text))

    surrogate_docs = DatasetDeidentifier().generate_surrogates(docs)

    # Index into the result (rather than zip) so a short result list fails loudly.
    for position, (_, hospital, expected_surrogate) in enumerate(cases):
        original_annotations, surrogates = surrogate_docs[position].annotation_surrogate_pairs()
        assert len(original_annotations) == 1 and len(surrogates) == 1
        assert original_annotations[0].text == hospital
        assert surrogates[0] == expected_surrogate