[7fc5df]: / tests / surrogates / test_rewrite_dataset.py

Download this file

114 lines (91 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import argparse
import filecmp
import glob
from os.path import basename, dirname, join
import pytest
from deidentify.base import Annotation
from deidentify.surrogates import rewrite_dataset
def test_apply_surrogates():
    """Swap every annotated span for its surrogate and re-align the offsets.

    The three surrogates are shorter than, as long as, and longer than the
    spans they replace, so the expected offsets cover each kind of shift.
    """
    original_text = 'ccc cc ccc c c ccc cccccc cccc'
    original_annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B'),
    ]
    replacements = ['a', 'dd', 'bbbbb']

    expected_text = 'a dd ccc c c bbbbb cccccc cccc'
    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B'),
    ]

    result = rewrite_dataset.apply_surrogates(
        original_text, original_annotations, replacements
    )

    assert result.text == expected_text
    assert result.annotations == expected_annotations
    # Every annotation received a surrogate, so nothing is reported as missing.
    assert result.annotations_without_surrogates == []
def test_apply_surrogates_no_annotations():
    """Return the text unchanged when there are no annotations or surrogates."""
    untouched_text = 'ccc cc ccc'

    result = rewrite_dataset.apply_surrogates(
        untouched_text, annotations=[], surrogates=[]
    )

    assert result.text == untouched_text
    assert result.annotations == []
    assert result.annotations_without_surrogates == []
def test_apply_surrogates_errors_raise():
    """Expect ``ValueError`` when a surrogate is missing.

    The error must be raised both when ``errors`` is omitted and when it is
    passed explicitly as ``'raise'``.
    """
    source_text = 'ccc cc ccc'
    spans = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The middle annotation has no surrogate (None).
    replacements = ['a', None, 'b']

    # First the call without ``errors``, then the explicit ``errors='raise'``.
    for extra_kwargs in ({}, {'errors': 'raise'}):
        with pytest.raises(ValueError):
            rewrite_dataset.apply_surrogates(
                source_text, spans, replacements, **extra_kwargs
            )
def test_apply_surrogates_errors_ignore():
    """Keep the original span when its surrogate is missing and ``errors='ignore'``.

    The span without a surrogate stays in the rewritten text with shifted
    offsets, and the original annotation (with its original offsets) is
    reported in ``annotations_without_surrogates``.
    """
    source_text = 'ccc cc ccc'
    spans = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The middle annotation has no surrogate (None).
    replacements = ['a', None, 'b']

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('cc', start=2, end=4, tag='A'),
        Annotation('b', start=5, end=6, tag='B'),
    ]
    expected_missing = [Annotation('cc', start=4, end=6, tag='A')]

    result = rewrite_dataset.apply_surrogates(
        source_text, spans, replacements, errors='ignore'
    )

    assert result.text == 'a cc b'
    assert result.annotations == expected_annotations
    assert result.annotations_without_surrogates == expected_missing
def test_apply_surrogates_errors_coerce():
    """Insert a placeholder for a missing surrogate when ``errors='coerce'``.

    Here the span tagged ``A`` that lacks a surrogate is expected to become
    ``[A]`` in the rewritten text, and the original annotation (with its
    original offsets) is reported in ``annotations_without_surrogates``.
    """
    source_text = 'ccc cc ccc'
    spans = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B'),
    ]
    # The middle annotation has no surrogate (None).
    replacements = ['a', None, 'b']

    expected_annotations = [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B'),
    ]
    expected_missing = [Annotation('cc', start=4, end=6, tag='A')]

    result = rewrite_dataset.apply_surrogates(
        source_text, spans, replacements, errors='coerce'
    )

    assert result.text == 'a [A] b'
    assert result.annotations == expected_annotations
    assert result.annotations_without_surrogates == expected_missing
def test_main(tmpdir):
    """Run ``rewrite_dataset.main`` end to end and compare output with fixtures.

    ``main`` is given the surrogate table and the original documents stored
    next to this test file, and ``tmpdir`` as its output path. Every ``.ann``
    and ``.txt`` file in ``data/rewritten`` must then have a same-named file
    in ``tmpdir`` with identical content.

    Args:
        tmpdir: pytest's temporary-directory fixture, used as the output path.
    """
    test_dir = dirname(__file__)
    args = argparse.Namespace(
        surrogate_table=join(test_dir, 'data/annotations-rewrite-table.csv'),
        data_path=join(test_dir, 'data/original'),
        output_path=tmpdir,
    )

    ann_files = glob.glob(join(test_dir, 'data/rewritten/*.ann'))
    txt_files = glob.glob(join(test_dir, 'data/rewritten/*.txt'))
    # Sorted so the files are always checked in the same order.
    to_compare = sorted(basename(f) for f in ann_files + txt_files)
    # Without fixtures the loop below never runs and the test would pass
    # without checking anything.
    assert to_compare, 'no expected .ann/.txt fixtures found in data/rewritten'

    rewrite_dataset.main(args)

    for filename in to_compare:
        expected = join(test_dir, 'data/rewritten/', filename)
        actual = join(tmpdir, filename)
        # shallow=False compares file contents; the default shallow mode
        # accepts files whose os.stat signatures match without reading them.
        assert filecmp.cmp(expected, actual, shallow=False)