deduce / Git / [79668b] /tests/unit/test_annotation

Models:
philipB/
deduce
Downloads: 1
[79668b]: / tests / unit / test_annotation_processor.py
History
Download this file
238 lines (195 with data), 7.7 kB

import docdeid as dd

from deduce.annotation_processor import (
    CleanAnnotationTag,
    DeduceMergeAdjacentAnnotations,
    PersonAnnotationConverter,
    RemoveAnnotations,
)


class TestDeduceMergeAdjacent:
    """Unit tests for the private helpers of ``DeduceMergeAdjacentAnnotations``."""

    def test_tags_match(self):
        """Equal tags match, and ``patient``/``persoon`` match in either order."""
        merger = DeduceMergeAdjacentAnnotations()

        matching_pairs = [
            ("a", "a"),
            ("huisnummer", "huisnummer"),
            ("patient", "patient"),
            ("persoon", "persoon"),
            ("patient", "persoon"),
            ("persoon", "patient"),
        ]
        non_matching_pairs = [
            ("a", "b"),
            ("patient", "huisnummer"),
            ("huisnummer", "patient"),
            ("persoon", "huisnummer"),
            ("huisnummer", "persoon"),
        ]

        for left_tag, right_tag in matching_pairs:
            assert merger._tags_match(left_tag, right_tag)

        for left_tag, right_tag in non_matching_pairs:
            assert not merger._tags_match(left_tag, right_tag)

    def test_annotation_replacement_equal_tags(self):
        """Two ``naam`` annotations are replaced by one ``naam`` spanning both."""
        merger = DeduceMergeAdjacentAnnotations()
        doc_text = "Jan Jansen"
        first = dd.Annotation(text="Jan", start_char=0, end_char=3, tag="naam")
        second = dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="naam")
        wanted = dd.Annotation(
            text="Jan Jansen", start_char=0, end_char=10, tag="naam"
        )

        merged = merger._adjacent_annotations_replacement(first, second, doc_text)

        assert merged == wanted

    def test_annotation_replacement_unequal_tags(self):
        """First/last name patient tags are replaced by one ``patient`` annotation."""
        merger = DeduceMergeAdjacentAnnotations()
        doc_text = "Jan Jansen"
        first = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        second = dd.Annotation(
            text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
        )
        wanted = dd.Annotation(
            text="Jan Jansen", start_char=0, end_char=10, tag="patient"
        )

        merged = merger._adjacent_annotations_replacement(first, second, doc_text)

        assert merged == wanted


class TestPersonAnnotationConverter:
    """Unit tests for ``PersonAnnotationConverter.process_annotations``."""

    def test_patient_no_overlap(self):
        """Non-overlapping patient name parts each come back tagged ``patient``."""
        converter = PersonAnnotationConverter()
        doc_text = "Jan Jansen"

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        last_name = dd.Annotation(
            text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
        )
        given = dd.AnnotationSet([first_name, last_name])

        wanted = dd.AnnotationSet(
            [
                dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"),
                dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="patient"),
            ]
        )

        result = converter.process_annotations(given, doc_text)

        assert result == wanted

    def test_patient_with_overlap(self):
        """Overlapping patient annotations come back as one ``patient`` annotation."""
        converter = PersonAnnotationConverter()
        doc_text = "Jan Jansen"

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        full_name = dd.Annotation(
            text="Jan Jansen", start_char=0, end_char=10, tag="naam_patient"
        )
        given = dd.AnnotationSet([first_name, full_name])

        wanted = dd.AnnotationSet(
            [dd.Annotation(text="Jan Jansen", start_char=0, end_char=10, tag="patient")]
        )

        result = converter.process_annotations(given, doc_text)

        assert result == wanted

    def test_mixed_no_overlap(self):
        """A patient part and an unknown part map to ``patient`` and ``persoon``."""
        converter = PersonAnnotationConverter()
        doc_text = "Jan Jansen"

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        last_name = dd.Annotation(
            text="Jansen", start_char=4, end_char=10, tag="achternaam_onbekend"
        )
        given = dd.AnnotationSet([first_name, last_name])

        wanted = dd.AnnotationSet(
            [
                dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"),
                dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="persoon"),
            ]
        )

        result = converter.process_annotations(given, doc_text)

        assert result == wanted

    def test_mixed_with_overlap(self):
        """An unknown name overlapping a patient first name keeps only the remainder.

        The expected ``persoon`` annotation covers characters 3-10, i.e. the
        text ``" Jansen"`` including the leading space.
        """
        converter = PersonAnnotationConverter()
        doc_text = "Jan Jansen"

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        full_name = dd.Annotation(
            text="Jan Jansen", start_char=0, end_char=10, tag="naam_onbekend"
        )
        given = dd.AnnotationSet([first_name, full_name])

        wanted = dd.AnnotationSet(
            [
                dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"),
                dd.Annotation(text=" Jansen", start_char=3, end_char=10, tag="persoon"),
            ]
        )

        result = converter.process_annotations(given, doc_text)

        assert result == wanted

    def test_pseudo(self):
        """A ``voornaam`` overlapped by a ``pseudo_naam`` yields an empty set."""
        converter = PersonAnnotationConverter()
        doc_text = "Henoch Schonlein"

        first_name = dd.Annotation(
            text="Henoch", start_char=0, end_char=6, tag="voornaam"
        )
        pseudo_name = dd.Annotation(
            text="Henoch Schonlein",
            start_char=0,
            end_char=16,
            tag="pseudo_naam",
        )
        given = dd.AnnotationSet([first_name, pseudo_name])

        result = converter.process_annotations(given, doc_text)

        assert result == dd.AnnotationSet()


class TestRemoveAnnotations:
    """Unit tests for ``RemoveAnnotations``."""

    def test_remove_annotations(self):
        """Annotations with a listed tag are dropped; the other one is kept.

        The tag list also contains a tag that no annotation carries.
        """
        remover = RemoveAnnotations(tags=["voornaam_patient", "nonexisting_tag"])

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        last_name = dd.Annotation(
            text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
        )
        given = dd.AnnotationSet([first_name, last_name])

        remaining = remover.process_annotations(given, text="_")

        wanted = dd.AnnotationSet(
            [
                dd.Annotation(
                    text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
                )
            ]
        )
        assert remaining == wanted


class TestCleanAnnotationTag:
    """Unit tests for ``CleanAnnotationTag``."""

    # NOTE(review): the method name matches the one in TestRemoveAnnotations,
    # although this test exercises tag remapping -- possibly a copy-paste
    # leftover. Kept as-is so the pytest node id does not change.
    def test_remove_annotations(self):
        """Tags present in ``tag_map`` are replaced; other tags stay unchanged.

        The map also contains a key that no annotation carries.
        """
        cleaner = CleanAnnotationTag(
            tag_map={"voornaam_patient": "voornaam", "nonexistent": "test"}
        )

        first_name = dd.Annotation(
            text="Jan", start_char=0, end_char=3, tag="voornaam_patient"
        )
        last_name = dd.Annotation(
            text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
        )
        given = dd.AnnotationSet([first_name, last_name])

        cleaned = cleaner.process_annotations(given, text="_")

        wanted = dd.AnnotationSet(
            [
                dd.Annotation(text="Jan", start_char=0, end_char=3, tag="voornaam"),
                dd.Annotation(
                    text="Jansen", start_char=4, end_char=10, tag="achternaam_patient"
                ),
            ]
        )
        assert cleaned == wanted