|
a |
|
b/example/features/features.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
|
|
|
3 |
"""Demonstrates medical term feature generation from spaCy parsed natural |
|
|
4 |
language.
|
|
5 |
|
|
|
6 |
""" |
|
|
7 |
__author__ = 'Paul Landes' |
|
|
8 |
|
|
|
9 |
from dataclasses import dataclass, field |
|
|
10 |
import itertools as it |
|
|
11 |
import pandas as pd |
|
|
12 |
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python') |
|
|
13 |
from zensols.cli import ProgramNameConfigurator |
|
|
14 |
from zensols.nlp import FeatureDocumentParser, FeatureDocument |
|
|
15 |
from zensols.nlp.dataframe import FeatureDataFrameFactory |
|
|
16 |
|
|
|
17 |
# default sentence: used by ``Application.show`` when no sentence is given
# and passed as the prototype argument to ``dump`` in the CLI harness below
DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'
|
|
18 |
|
|
|
19 |
|
|
|
20 |
# the definition of the application class executed from the CLI glue code |
|
|
21 |
@dataclass
class Application(object):
    """Demonstrate medical term feature generation from parsed natural
    language text.

    """
    # keep the CLI from mistaking `doc_parser` for a command line option
    # when it generates the online help with -h
    CLI_META = {'option_excludes': {'doc_parser'}}

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    def _boundary(self, s: str):
        # bracket the text with a five dash delimiter on each side
        bar: str = '-' * 5
        print(bar + s + bar)

    def dump(self, sent: str):
        """Dump all features available to a CSV file."""
        doc: FeatureDocument = self.doc_parser(sent)
        rows = [tok.asdict() for tok in doc.tokens]
        pd.DataFrame(rows).to_csv('features.csv')

    def show(self, sent: str = None):
        """Parse a sentence and print all features for each token.

        :param sent: the sentence to parse and generate features

        """
        sent = DEFAULT_SENT if sent is None else sent
        self._boundary(f'sentence: <{sent}>')

        # parse the text into a hierarchical language data structure
        doc: FeatureDocument = self.doc_parser(sent)
        print('first three tokens:')
        for tok in it.islice(doc.token_iter(), 3):
            print(tok.norm)
            tok.write_attributes(1, include_type=False)

        # named entities are also stored as contiguous tokens at the
        # document level
        self._boundary('named entities:')
        for ent in doc.entities:
            print(f'{ent}: cui={ent[0].cui_}')

        # generate a set of features from the document as a Pandas data
        # frame and print it
        columns = 'idx i norm is_concept cui_ pref_name_ ent_'.split()
        frame_factory = FeatureDataFrameFactory(set(columns), columns)
        df: pd.DataFrame = frame_factory(doc)
        self._boundary('features as a Pandas data frame')
        print(df)
|
|
73 |
|
|
|
74 |
|
|
|
75 |
if (__name__ == '__main__'):
    # CLI glue: build the harness and dispatch the command line to the
    # Application class via ``run``
    CliHarness(
        # application configuration resource; presumably the app/config
        # sections live in this file -- confirm against the project layout
        app_config_resource='features.conf',
        # seed the configuration context with the program name section
        app_config_context=ProgramNameConfigurator(
            None, default='features').create_section(),
        # arguments used when prototyping (e.g. from a REPL): run the
        # ``dump`` action on the default sentence
        proto_args=['dump', DEFAULT_SENT],
        # NOTE(review): looks like modules matching this regex are reloaded
        # between prototype runs -- confirm in the CliHarness docs
        proto_factory_kwargs={'reload_pattern': '^features'},
    ).run()