# Source: example/features/features.py (83 lines, 2.7 kB)
#!/usr/bin/env python
"""Demonstrates medical term feature generation from spaCy parsed natural
language.
"""
__author__ = 'Paul Landes'
from dataclasses import dataclass, field
import itertools as it
import pandas as pd
# add the project source tree to the path before importing the rest of zensols
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
from zensols.cli import ProgramNameConfigurator
from zensols.nlp import FeatureDocumentParser, FeatureDocument
from zensols.nlp.dataframe import FeatureDataFrameFactory
# fallback sentence used when the ``show`` action gets no input
DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'
# the definition of the application class executed from the CLI glue code
@dataclass
class Application(object):
    """Demonstrates access to UTS.

    """
    # tell the application not mistake the `doc_parser` as an option when
    # generating the online help with the -h option
    CLI_META = {'option_excludes': {'doc_parser'}}

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    def _boundary(self, s: str):
        """Print ``s`` framed by dashes as a visual section divider."""
        print(''.join(['-' * 5, s, '-' * 5]))

    def dump(self, sent: str):
        """Dump all features available to a CSV file.

        :param sent: the sentence to parse and featurize

        """
        doc: FeatureDocument = self.doc_parser(sent)
        # one row per token with every token attribute as a column
        df = pd.DataFrame(map(lambda t: t.asdict(), doc.tokens))
        df.to_csv('features.csv')

    def show(self, sent: str = None):
        """Parse a sentence and print all features for each token.

        :param sent: the sentence to parse and generate features; defaults to
                     :obj:`DEFAULT_SENT` when not given

        """
        if sent is None:
            sent = DEFAULT_SENT
        self._boundary(f'sentence: <{sent}>')
        # parse the text in to a hierarchical language data structure
        doc: FeatureDocument = self.doc_parser(sent)
        print('first three tokens:')
        for tok in it.islice(doc.token_iter(), 3):
            print(tok.norm)
            tok.write_attributes(1, include_type=False)
        # named entities are also stored contiguous tokens at the document
        # level
        self._boundary('named entities:')
        for e in doc.entities:
            print(f'{e}: cui={e[0].cui_}')
        # generate a set of features from the document as a Pandas data frame
        # and print it
        feats = 'idx i norm is_concept cui_ pref_name_ ent_'.split()
        fac = FeatureDataFrameFactory(set(feats), feats)
        df: pd.DataFrame = fac(doc)
        self._boundary('features as a Pandas data frame')
        print(df)
if (__name__ == '__main__'):
    # CLI glue: wire the app config and run the command-line harness; the
    # prototype args/kwargs are used only when run from a REPL for debugging
    CliHarness(
        app_config_resource='features.conf',
        app_config_context=ProgramNameConfigurator(
            None, default='features').create_section(),
        proto_args=['dump', DEFAULT_SENT],
        proto_factory_kwargs={'reload_pattern': '^features'},
    ).run()