deidentify / Git / [7fc5df] /tests/surrogates/test_rewrite

Models:
philipB/
deidentify
Downloads: 1
[7fc5df]: / tests / surrogates / test_rewrite_dataset.py
History
Download this file
114 lines (91 with data), 3.9 kB

import argparse
import filecmp
import glob
from os.path import basename, dirname, join

import pytest

from deidentify.base import Annotation
from deidentify.surrogates import rewrite_dataset


def test_apply_surrogates():
    text = 'ccc cc ccc c c ccc cccccc cccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=15, end=18, tag='B')
    ]
    surrogates = ['a', 'dd', 'bbbbb']

    surrogate_doc = rewrite_dataset.apply_surrogates(text, annotations, surrogates)
    assert surrogate_doc.text == 'a dd ccc c c bbbbb cccccc cccc'
    assert surrogate_doc.annotations == [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('dd', start=2, end=4, tag='A'),
        Annotation('bbbbb', start=13, end=18, tag='B')
    ]
    assert surrogate_doc.annotations_without_surrogates == []


def test_apply_surrogates_no_annotations():
    surrogate_doc = rewrite_dataset.apply_surrogates('ccc cc ccc', annotations=[], surrogates=[])
    assert surrogate_doc.text == 'ccc cc ccc'
    assert surrogate_doc.annotations == []
    assert surrogate_doc.annotations_without_surrogates == []


def test_apply_surrogates_errors_raise():
    text = 'ccc cc ccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B')
    ]
    surrogates = ['a', None, 'b']

    with pytest.raises(ValueError):
        rewrite_dataset.apply_surrogates(text, annotations, surrogates)

    with pytest.raises(ValueError):
        rewrite_dataset.apply_surrogates(text, annotations, surrogates, errors='raise')


def test_apply_surrogates_errors_ignore():
    text = 'ccc cc ccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B')
    ]
    surrogates = ['a', None, 'b']

    surrogate_doc = rewrite_dataset.apply_surrogates(text, annotations, surrogates, errors='ignore')
    assert surrogate_doc.text == 'a cc b'
    assert surrogate_doc.annotations == [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('cc', start=2, end=4, tag='A'),
        Annotation('b', start=5, end=6, tag='B')
    ]
    assert surrogate_doc.annotations_without_surrogates == [
        Annotation('cc', start=4, end=6, tag='A'),
    ]


def test_apply_surrogates_errors_coerce():
    text = 'ccc cc ccc'
    annotations = [
        Annotation('ccc', start=0, end=3, tag='A'),
        Annotation('cc', start=4, end=6, tag='A'),
        Annotation('ccc', start=7, end=10, tag='B')
    ]
    surrogates = ['a', None, 'b']

    surrogate_doc = rewrite_dataset.apply_surrogates(text, annotations, surrogates, errors='coerce')
    assert surrogate_doc.text == 'a [A] b'
    assert surrogate_doc.annotations == [
        Annotation('a', start=0, end=1, tag='A'),
        Annotation('[A]', start=2, end=5, tag='A'),
        Annotation('b', start=6, end=7, tag='B')
    ]
    assert surrogate_doc.annotations_without_surrogates == [
        Annotation('cc', start=4, end=6, tag='A'),
    ]


def test_main(tmpdir):
    args = argparse.Namespace(
        surrogate_table=join(dirname(__file__), 'data/annotations-rewrite-table.csv'),
        data_path=join(dirname(__file__), 'data/original'),
        output_path=tmpdir
    )

    ann_files = glob.glob(join(dirname(__file__), 'data/rewritten/*.ann'))
    txt_files = glob.glob(join(dirname(__file__), 'data/rewritten/*.txt'))
    to_compare = ann_files + txt_files
    to_compare = [basename(f) for f in to_compare]

    rewrite_dataset.main(args)

    for file in to_compare:
        expected = join(dirname(__file__), 'data/rewritten/', file)
        actual = join(tmpdir, file)
        assert filecmp.cmp(expected, actual)