from flair.data import Sentence

from deidentify.base import Annotation, Document
from deidentify.dataset.corpus_loader import DUMMY_CORPUS, CorpusLoader
from deidentify.methods.bilstmcrf import flair_utils
from deidentify.tokenizer import TokenizerFactory


def test_standoff_to_flair_sents():
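    """Standoff documents from the dummy corpus should be split into Flair
    sentences whose tokens carry BIO-encoded 'ner' tags."""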
corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
tokenizer = TokenizerFactory().tokenizer('ons')
docs = corpus.train
sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)
assert len(sents) == 14
    assert len(parsed_docs) == 14

bio_tags = [token.get_tag('ner').value for token in sents[0]]
token_texts = [token.text for token in sents[0]]
assert token_texts == [
'Linders',
',',
'Xandro',
'<'
]
assert bio_tags == [
'B-Name',
'I-Name',
'I-Name',
'O'
    ]

bio_tags = [token.get_tag('ner').value for token in sents[1]]
token_texts = [token.text for token in sents[1]]
assert token_texts == [
't.njg.nmmeso@rcrmb.nl'
]
assert bio_tags == [
'B-Email'
    ]

bio_tags = [token.get_tag('ner').value for token in sents[2]]
token_texts = [token.text for token in sents[2]]
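    # Runs of whitespace are preserved as literal '<SPACE>' placeholder tokens.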
assert token_texts == [
'>',
'<SPACE>',
'07',
'apr',
'.',
'<SPACE>'
]
assert bio_tags == [
'O',
'O',
'B-Date',
'I-Date',
'O',
'O',
    ]


def test_flair_sents_to_standoff():
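    """Round-tripping standoff -> Flair -> standoff should preserve every annotation."""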
corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
tokenizer = TokenizerFactory().tokenizer('ons')
docs_expected = corpus.train
sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs_expected, tokenizer)
    docs_actual = flair_utils.flair_sents_to_standoff(sents, parsed_docs)

assert len(docs_actual) == 1
assert len(docs_expected) == 1
assert len(docs_actual[0].annotations) == 16
    assert len(docs_expected[0].annotations) == 16

for ann_expected, ann_actual in zip(docs_expected[0].annotations, docs_actual[0].annotations):
assert ann_expected.text == ann_actual.text
        assert ann_expected.tag == ann_actual.tag


def test_filtered_corpus():
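    """FilteredCorpus should drop sentences matching the predicate and expose them
    via the *_ignored attributes."""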
def ignore_sentence(sent):
        return sent[0].text.startswith('===')

filtered_corpus = flair_utils.FilteredCorpus(
        train=[Sentence('=== Answer: 123 ==='), Sentence('this should be included')],
        dev=[Sentence('this should be included'), Sentence('=== Answer: 456 ===')],
        test=[Sentence('this should be included'), Sentence('and this as well')],
ignore_sentence=ignore_sentence
    )

    assert len(filtered_corpus.train) == 1
    assert filtered_corpus.train[0].to_plain_string() == 'this should be included'

    assert len(filtered_corpus.dev) == 1
    assert filtered_corpus.dev[0].to_plain_string() == 'this should be included'

    assert len(filtered_corpus.test) == 2
    assert filtered_corpus.test[0].to_plain_string() == 'this should be included'
    assert filtered_corpus.test[1].to_plain_string() == 'and this as well'

assert len(filtered_corpus.train_ignored) == 1
assert filtered_corpus.train_ignored[0].to_plain_string() == '=== Answer: 123 ==='
assert len(filtered_corpus.dev_ignored) == 1
assert filtered_corpus.dev_ignored[0].to_plain_string() == '=== Answer: 456 ==='
    assert len(filtered_corpus.test_ignored) == 0


def test_flair_sentence_with_whitespace_tokens():
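    """Consecutive whitespace must survive tokenization as '<SPACE>' tokens so the
    original document can be reconstructed."""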
text = 'Mw geniet zichtbaar. Maarten is de afgelopen periode veelal afwezig.'
annotation = Annotation(
text='Maarten',
start=text.index('Maarten'),
end=text.index('Maarten') + len('Maarten'),
tag='PERSON'
)
doc = Document(name='', text=text, annotations=[annotation])
tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy collapses a run of consecutive whitespace into a single whitespace token. These
    # tokens have to be retained in the Flair sentence (as '<SPACE>'), otherwise the original
    # document cannot be reconstructed from the tokenized representation.
    assert [token.text for token in flair_sents[0]] == ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)

assert len(flair_sents) == 2
assert len(spacy_sents) == 2
assert len(flair_sents[0]) == 5
assert len(spacy_sents[0]) == 5
assert len(flair_sents[1]) == 8
assert len(spacy_sents[1]) == 8