edsnlp / Git / [cad161] /tests/utils/test_span

Models:
philipB/
edsnlp
Downloads: 1
[cad161]: / tests / utils / test_span_getters.py
History
Download this file
173 lines (144 with data), 5.4 kB

import pytest
from confit import validate_arguments

import edsnlp
import edsnlp.pipes as eds
from edsnlp.utils.span_getters import (
    ContextWindow,
    get_spans,
    make_span_context_getter,
    validate_span_setter,
)


def test_span_context_getter(lang):
    """Check context getters built from integer word/sentence window sizes.

    A pipeline with the normalizer, the sentence splitter and a matcher on the
    term "sentence" is run over a four-sentence text. For each
    (`context_words`, `context_sents`) pair, the text of the context returned
    for every entity is compared with the expected snippets.
    """
    pipeline = edsnlp.blank(lang)
    for pipe_name in ("eds.normalizer", "eds.sentences"):
        pipeline.add_pipe(pipe_name)
    pipeline.add_pipe("eds.matcher", config={"terms": {"sentence": "sentence"}})

    text = (
        "This is a sentence. "
        "This is another sentence. "
        "This is a third one. "
        "Last sentence."
    )
    doc = pipeline(text)

    # Each entry: (getter keyword arguments, expected context text per entity).
    scenarios = [
        (
            {"context_words": 2, "context_sents": 1},
            [
                "This is a sentence. This",
                "This is another sentence. This",
                ". Last sentence.",
            ],
        ),
        (
            {"context_words": 0, "context_sents": 1},
            [
                "This is a sentence.",
                "This is another sentence.",
                "Last sentence.",
            ],
        ),
        (
            {"context_words": 0, "context_sents": 2},
            [
                "This is a sentence. This is another sentence.",
                "This is a sentence. This is another sentence. This is a third one.",
                "This is a third one. Last sentence.",
            ],
        ),
    ]

    for getter_kwargs, expected_texts in scenarios:
        context_of = make_span_context_getter(**getter_kwargs)
        observed_texts = [context_of(ent).text for ent in doc.ents]
        assert observed_texts == expected_texts


def test_span_getter_on_span():
    """Check `get_spans` when it is given a span instead of a whole document.

    The matcher stores its "animal" matches both in `ents` and in the
    "animals" span group; the test then queries the first sentence and the
    `doc[5:]` slice with several span-getter specifications.
    """
    pipeline = edsnlp.blank("eds")
    pipeline.add_pipe(eds.sentences())
    animal_matcher = eds.matcher(
        terms={"animal": ["snake", "dog"]},
        span_setter=["ents", "animals"],
    )
    pipeline.add_pipe(animal_matcher)

    text = (
        "There was a snake. "
        "His friend was a dog. "
        "He liked baking cakes. "
        "But since he had no hands, he was a bad baker. "
    )
    doc = pipeline(text)
    sentences = list(doc.sents)

    # Each entry: (span to search in, span-getter spec, expected repr).
    cases = [
        (sentences[0], "ents", "[snake]"),
        (sentences[0], "animals", "[snake]"),
        (doc[5:], "animals", "[dog]"),
        (doc[5:], "*", "[dog]"),
    ]
    for container, getter_spec, expected_repr in cases:
        found = list(get_spans(container, validate_span_setter(getter_spec)))
        assert str(found) == expected_repr


def test_span_context_getter_asymmetric(lang):
    """Check context getters built from int or (before, after) tuple sizes.

    A single "kangaroo" entity is matched in a four-sentence text. Every
    combination of `context_words` / `context_sents` listed below is turned
    into a getter, and the text it returns for the entity is compared with
    the expected snippet.
    """
    pipeline = edsnlp.blank(lang)
    for pipe_name in ("eds.normalizer", "eds.sentences"):
        pipeline.add_pipe(pipe_name)
    pipeline.add_pipe("eds.matcher", config={"terms": {"animal": "kangaroo"}})

    text = (
        "This is a sentence. "
        "This is another sentence with a kangaroo. "
        "This is a third one. "
        "Last sentence."
    )
    doc = pipeline(text)

    # Each entry: (context_words, context_sents, expected context texts).
    scenarios = [
        (2, 0, ["with a kangaroo. This"]),
        ((2, 1), 0, ["with a kangaroo."]),
        ((1, 2), 0, ["a kangaroo. This"]),
        (
            0,
            (1, 2),
            ["This is another sentence with a kangaroo. This is a third one."],
        ),
        (
            0,
            (2, 2),
            [
                "This is a sentence. This is another sentence with a kangaroo. This is a third one."  # noqa: E501
            ],
        ),
        (0, (1, 1), ["This is another sentence with a kangaroo."]),
        (
            (1000, 0),
            0,
            ["This is a sentence. This is another sentence with a kangaroo"],
        ),
        (
            (1000, 0),
            (1, 2),
            [
                "This is a sentence. This is another sentence with a kangaroo. This is a third one."  # noqa: E501
            ],
        ),
    ]

    for words, sents, expected_texts in scenarios:
        context_of = make_span_context_getter(
            context_words=words,
            context_sents=sents,
        )
        observed_texts = [context_of(ent).text for ent in doc.ents]
        assert observed_texts == expected_texts


def test_context_getter_syntax():
    """Check context windows written as string expressions.

    A helper annotated with `ContextWindow` and wrapped in
    `validate_arguments` receives expressions such as "words[-5:5]" and
    applies the resulting window to the first entity of the document.
    """

    @validate_arguments
    def snippet_for(span, context: ContextWindow):
        # Callers pass `context` as a string, yet it is called like a function
        # here: the decorator is relied upon to build the window object.
        window = context(span)
        return window

    pipeline = edsnlp.blank("eds")
    for pipe_name in ("eds.normalizer", "eds.sentences"):
        pipeline.add_pipe(pipe_name)
    pipeline.add_pipe("eds.matcher", config={"terms": {"dog": "dog"}})

    text = (
        "There was a snake. "
        "His friend was a dog. "
        "He liked baking cakes. "
        "But since he had no hands, he was a bad baker. "
    )
    doc = pipeline(text)

    # Each entry: (context expression, expected text of the returned span).
    cases = [
        ("words[-5:5]", ". His friend was a dog. He liked baking cakes"),
        ("words[-5:5] & sent", "His friend was a dog"),
        (
            "words[-5:8] | sents[-1:1]",
            (
                "There was a snake. His friend was a dog. He liked baking cakes. "
                "But since"
            ),
        ),
    ]
    for expression, expected_text in cases:
        snippet = snippet_for(doc.ents[0], expression)
        assert snippet.text == expected_text


def test_invalid_context_getter_syntax():
    """Check that a malformed context expression is rejected at validation.

    "sents[-2:2]" must be accepted by a `ContextWindow`-annotated parameter,
    whereas "stuff[-2:2]" is expected to raise a `ValueError`.
    """

    @validate_arguments
    def accept_window(context: ContextWindow):
        """Do nothing: only the argument validation matters here."""

    accepted_expr = "sents[-2:2]"
    rejected_expr = "stuff[-2:2]"

    # Must go through validation without raising.
    accept_window(accepted_expr)

    # Must be refused while validating the argument.
    with pytest.raises(ValueError):
        accept_window(rejected_expr)