a b/test/python/test_combined.py
1
from typing import List, Dict, Set, Any
2
from collections import OrderedDict
3
import json
4
from zensols.nlp import FeatureToken, FeatureDocument, FeatureDocumentParser
5
from util import TestBase
6
7
8
class TestCombinedParsers(TestBase):
    """Regression tests that compare parsed token features with stored JSON.

    Each test parses the sentences found in the ``text_1`` and ``text_2``
    attributes of the test case (presumably provided by ``TestBase`` --
    confirm) and compares selected token features with the expected values
    stored in ``test-resources/should/<parser_name>-<idx>.json``.

    """
    # token features compared when a test does not give its own list
    _DEFAULT_ATTRS = 'i i_sent idx norm ent_ ent_iob ent_iob_'.split()

    # features compared by the tests that also check ``cui_`` and ``tuis_``
    _MEDCAT_ATTRS = _DEFAULT_ATTRS + 'cui_ tuis_'.split()

    # debugging switch: when true, only pretty print the parsed token features
    # and skip every assertion
    _TRACE = False

    def _compare_sents(self, parser_name: str, idx: int, write: bool,
                       sent: str, attrs: List[str], missing: Set[str]):
        """Parse ``sent`` and compare its token features with the stored file.

        :param parser_name: the name given to ``_get_doc_parser`` and the
                            prefix of the expected results file name

        :param idx: the index used in the expected results file name

        :param write: if ``True``, write the current output to the expected
                      results file (and print it) before comparing

        :param sent: the text to parse

        :param attrs: the token feature names to compare

        :param missing: feature names that must not be keys of any token's
                        feature dictionary, or ``None`` to skip the check

        """
        def map_tok_features(tok: FeatureToken) -> Dict[str, Any]:
            # sort keys to make diffing easier
            return OrderedDict(
                sorted(tok.asdict().items(), key=lambda item: item[0]))

        should_file: str = f'test-resources/should/{parser_name}-{idx}.json'
        parser: FeatureDocumentParser = self._get_doc_parser(
            'combined', parser_name)
        doc: FeatureDocument = parser(sent)

        actuals = tuple(map_tok_features(tok) for tok in doc.token_iter())
        if self._TRACE:
            from pprint import pprint
            pprint(actuals)
            return
        if write:
            print(sent)
            with open(should_file, 'w', encoding='utf-8') as f:
                json.dump(actuals, f, indent=4, sort_keys=False)
            for attr in attrs:
                vals: str = ', '.join(str(feats[attr]) for feats in actuals)
                print(f'  {attr}: {vals}')
            print('_' * 79)

        with open(should_file, encoding='utf-8') as f:
            shoulds: List[Dict[str, Any]] = json.load(f)

        attr: str
        for attr in attrs:
            actual = tuple(feats[attr] for feats in actuals)
            should = tuple(feats[attr] for feats in shoulds)
            self.assertEqual(should, actual, f'for attribute: <{attr}>')

        if missing is not None:
            for attr in missing:
                for tok in actuals:
                    # ``actuals`` holds feature dictionaries, so the feature
                    # must be tested as a key; ``hasattr`` on a dictionary
                    # never sees its keys and would make this check vacuous
                    self.assertNotIn(attr, tok,
                                     f'expected missing {attr} in {tok}')

    def _compare(self, parser_name: str, write: bool = False,
                 attrs: List[str] = None, missing: Set[str] = None):
        """Run :meth:`_compare_sents` on ``text_1`` and ``text_2``.

        :param parser_name: the name of the parser to test

        :param write: whether to rewrite the expected results files

        :param attrs: the token feature names to compare, which defaults to
                      ``_DEFAULT_ATTRS``

        :param missing: feature names that must be absent from every token

        """
        attrs = TestCombinedParsers._DEFAULT_ATTRS if attrs is None else attrs
        for i in range(2):
            # sentence attributes are 1-based while result files are 0-based
            sent: str = getattr(self, f'text_{i + 1}')
            self._compare_sents(parser_name, i, write, sent, attrs, missing)

    def test_default(self):
        self._compare('doc_parser', missing={'cui_'})

    def test_biomed_ner(self):
        self._compare('mednlp_biomed_doc_parser', missing={'cui_'})

    def test_medcat(self):
        self._compare('mednlp_medcat_doc_parser',
                      attrs=self._MEDCAT_ATTRS)

    def test_biomed_combined(self):
        self._compare('mednlp_combine_biomed_doc_parser',
                      missing={'cui_'})

    def test_medcat_combined(self):
        self._compare('mednlp_combine_medcat_doc_parser',
                      attrs=self._MEDCAT_ATTRS)

    def test_medcat_biomded_combined(self):
        self._compare('mednlp_combine_biomed_medcat_doc_parser',
                      attrs=self._MEDCAT_ATTRS)