
tests/methods/test_flair_utils.py
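"""Tests for deidentify.methods.bilstmcrf.flair_utils: conversion between
standoff-annotated documents and Flair sentences."""
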
from flair.data import Sentence

from deidentify.base import Annotation, Document
from deidentify.dataset.corpus_loader import DUMMY_CORPUS, CorpusLoader
from deidentify.methods.bilstmcrf import flair_utils
from deidentify.tokenizer import TokenizerFactory


def test_standoff_to_flair_sents():
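    """standoff_to_flair_sents should tokenize standoff-annotated documents
    into Flair sentences with a BIO-encoded NER tag on each token."""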
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs = corpus.train
    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)

    assert len(sents) == 14
    assert len(parsed_docs) == 14

    bio_tags = [token.get_tag('ner').value for token in sents[0]]
    token_texts = [token.text for token in sents[0]]
    assert token_texts == [
        'Linders',
        ',',
        'Xandro',
        '<'
    ]
    assert bio_tags == [
        'B-Name',
        'I-Name',
        'I-Name',
        'O'
    ]

    bio_tags = [token.get_tag('ner').value for token in sents[1]]
    token_texts = [token.text for token in sents[1]]
    assert token_texts == [
        't.njg.nmmeso@rcrmb.nl'
    ]
    assert bio_tags == [
        'B-Email'
    ]

    bio_tags = [token.get_tag('ner').value for token in sents[2]]
    token_texts = [token.text for token in sents[2]]
    assert token_texts == [
        '>',
        '<SPACE>',
        '07',
        'apr',
        '.',
        '<SPACE>'
    ]
    assert bio_tags == [
        'O',
        'O',
        'B-Date',
        'I-Date',
        'O',
        'O',
    ]


def test_flair_sents_to_standoff():
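    """Round trip: converting documents to Flair sentences and back should
    preserve all annotation texts and tags."""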
    corpus = CorpusLoader().load_corpus(DUMMY_CORPUS)
    tokenizer = TokenizerFactory().tokenizer('ons')
    docs_expected = corpus.train

    sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs_expected, tokenizer)
    docs_actual = flair_utils.flair_sents_to_standoff(sents, parsed_docs)

    assert len(docs_actual) == 1
    assert len(docs_expected) == 1

    assert len(docs_actual[0].annotations) == 16
    assert len(docs_expected[0].annotations) == 16

    for ann_expected, ann_actual in zip(docs_expected[0].annotations, docs_actual[0].annotations):
        assert ann_expected.text == ann_actual.text
        assert ann_expected.tag == ann_actual.tag


def test_filtered_corpus():
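    """FilteredCorpus should drop sentences matching the ignore_sentence
    predicate and expose them via the *_ignored attributes."""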
    def ignore_sentence(sent):
        return sent[0].text.startswith('===')

    filtered_corpus = flair_utils.FilteredCorpus(
        train=[Sentence('=== Answer: 123 ==='), Sentence('this should be included')],
        dev=[Sentence('this should be included'), Sentence('=== Answer: 456 ===')],
        test=[Sentence('this should be included'), Sentence('and this as well')],
        ignore_sentence=ignore_sentence
    )

    assert len(filtered_corpus.train) == 1
    assert filtered_corpus.train[0].to_plain_string() == 'this should be included'
    assert len(filtered_corpus.dev) == 1
    assert filtered_corpus.dev[0].to_plain_string() == 'this should be included'
    assert len(filtered_corpus.test) == 2
    assert filtered_corpus.test[0].to_plain_string() == 'this should be included'
    assert filtered_corpus.test[1].to_plain_string() == 'and this as well'

    assert len(filtered_corpus.train_ignored) == 1
    assert filtered_corpus.train_ignored[0].to_plain_string() == '=== Answer: 123 ==='
    assert len(filtered_corpus.dev_ignored) == 1
    assert filtered_corpus.dev_ignored[0].to_plain_string() == '=== Answer: 456 ==='
    assert len(filtered_corpus.test_ignored) == 0


def test_flair_sentence_with_whitespace_tokens():
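    """A run of consecutive whitespace must survive tokenization as a <SPACE>
    token so that the original document can be reconstructed."""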
    text = 'Mw geniet zichtbaar.  Maarten is de afgelopen periode veelal afwezig.'
    annotation = Annotation(
        text='Maarten',
        start=text.index('Maarten'),
        end=text.index('Maarten') + len('Maarten'),
        tag='PERSON'
    )
    doc = Document(name='', text=text, annotations=[annotation])

    tokenizer = TokenizerFactory().tokenizer('ons')
    flair_sents, docs = flair_utils.standoff_to_flair_sents([doc], tokenizer)

    # spaCy merges consecutive whitespace into a single whitespace token. This token should be
    # retained in the Flair sentence, otherwise it is not possible to reconstruct the original
    # document from the tokenized representation.
    assert [token.text for token in flair_sents[0]] == ['Mw', 'geniet', 'zichtbaar', '.', '<SPACE>']

    spacy_doc = docs[0].spacy_doc
    spacy_sents = list(spacy_doc.sents)
    assert len(flair_sents) == 2
    assert len(spacy_sents) == 2

    assert len(flair_sents[0]) == 5
    assert len(spacy_sents[0]) == 5
    assert len(flair_sents[1]) == 8
    assert len(spacy_sents[1]) == 8