|
a |
|
b/test/python/test_combined.py |
|
|
1 |
from typing import List, Dict, Set, Any |
|
|
2 |
from collections import OrderedDict |
|
|
3 |
import json |
|
|
4 |
from zensols.nlp import FeatureToken, FeatureDocument, FeatureDocumentParser |
|
|
5 |
from util import TestBase |
|
|
6 |
|
|
|
7 |
|
|
|
8 |
class TestCombinedParsers(TestBase):
    """Regression tests that compare the token features produced by the
    ``combined`` document parsers with JSON files stored under
    ``test-resources/should``.

    """
    # token features compared when a test does not provide its own list
    _DEFAULT_ATTRS = 'i i_sent idx norm ent_ ent_iob ent_iob_'.split()

    # when True, only pretty print the parsed token features and skip every
    # assertion (debugging aid)
    _TRACE = False

    def _compare_sents(self, parser_name: str, idx: int, write: bool,
                       sent: str, attrs: List[str], missing: Set[str]):
        """Parse ``sent`` and compare its token features with the stored JSON.

        :param parser_name: the name handed to ``self._get_doc_parser`` and
                            used as the prefix of the JSON file name

        :param idx: the sentence index used as the suffix of the JSON file
                    name

        :param write: if ``True``, (re)write the JSON file from the parsed
                      output and print the compared values before asserting

        :param sent: the text to parse

        :param attrs: the token feature names whose values must equal those
                      in the JSON file

        :param missing: feature names that must not be keys of any token's
                        feature dictionary, or ``None`` to skip the check

        """
        def map_tok_features(tok: FeatureToken) -> Dict[str, Any]:
            # sort keys to make diffing easier
            return OrderedDict(
                sorted(tok.asdict().items(), key=lambda item: item[0]))

        actual_file: str = f'test-resources/should/{parser_name}-{idx}.json'
        # NOTE(review): ``_get_doc_parser`` is not defined in this class;
        # presumably inherited from ``TestBase`` -- confirm
        p: FeatureDocumentParser = self._get_doc_parser('combined', parser_name)
        doc: FeatureDocument = p(sent)

        actuals = tuple(map(map_tok_features, doc.token_iter()))
        if self._TRACE:
            from pprint import pprint
            pprint(actuals)
            return
        if write:
            print(sent)
            with open(actual_file, 'w') as f:
                json.dump(actuals, f, indent=4, sort_keys=False)
            for attr in attrs:
                vals: str = ', '.join(str(d[attr]) for d in actuals)
                print(f' {attr}: {vals}')
            print('_' * 79)

        with open(actual_file) as f:
            shoulds: List[Dict[str, Any]] = json.load(f)

        attr: str
        for attr in attrs:
            actual = tuple(d[attr] for d in actuals)
            should = tuple(d[attr] for d in shoulds)
            self.assertEqual(should, actual, f'for attribute: <{attr}>')

        if missing is not None:
            for attr in missing:
                for tok in actuals:
                    # ``tok`` is a dictionary of features, so test key
                    # membership; ``hasattr`` only inspects Python attributes
                    # of the dict object and would never fail for a feature
                    # name
                    self.assertNotIn(attr, tok,
                                     f'expected missing {attr} in {tok}')

    def _compare(self, parser_name: str, write: bool = False,
                 attrs: List[str] = None, missing: Set[str] = None):
        """Run :meth:`_compare_sents` on the ``text_1`` and ``text_2``
        attributes of this test case.

        :param parser_name: the parser name forwarded to
                            :meth:`_compare_sents`

        :param write: whether to (re)write the JSON files

        :param attrs: the feature names to compare, which defaults to
                      ``_DEFAULT_ATTRS`` when ``None``

        :param missing: feature names expected to be absent, or ``None`` to
                        skip that check

        """
        attrs = self._DEFAULT_ATTRS if attrs is None else attrs
        for i in range(2):
            # the text attributes are 1-based while the file index is 0-based
            sent: str = getattr(self, f'text_{i + 1}')
            self._compare_sents(parser_name, i, write, sent, attrs, missing)

    def test_default(self):
        """Check ``doc_parser`` output, which should have no ``cui_``."""
        self._compare('doc_parser', missing='cui_'.split())

    def test_biomed_ner(self):
        """Check ``mednlp_biomed_doc_parser`` output, which should have no
        ``cui_``.

        """
        self._compare('mednlp_biomed_doc_parser', missing='cui_'.split())

    def test_medcat(self):
        """Check ``mednlp_medcat_doc_parser`` output including ``cui_`` and
        ``tuis_``.

        """
        self._compare('mednlp_medcat_doc_parser',
                      attrs=self._DEFAULT_ATTRS + 'cui_ tuis_'.split())

    def test_biomed_combined(self):
        """Check ``mednlp_combine_biomed_doc_parser`` output, which should
        have no ``cui_``.

        """
        self._compare('mednlp_combine_biomed_doc_parser',
                      missing='cui_'.split())

    def test_medcat_combined(self):
        """Check ``mednlp_combine_medcat_doc_parser`` output including
        ``cui_`` and ``tuis_``.

        """
        self._compare('mednlp_combine_medcat_doc_parser',
                      attrs=self._DEFAULT_ATTRS + 'cui_ tuis_'.split())

    def test_medcat_biomded_combined(self):
        """Check ``mednlp_combine_biomed_medcat_doc_parser`` output including
        ``cui_`` and ``tuis_``.

        """
        self._compare('mednlp_combine_biomed_medcat_doc_parser',
                      attrs=self._DEFAULT_ATTRS + 'cui_ tuis_'.split())