Switch to unified view

a b/example/features/features.py
1
#!/usr/bin/env python
2
3
"""Demonstrates medical term feature generation from spaCy parsed natural
4
language.
5
6
"""
7
__author__ = 'Paul Landes'
8
9
from dataclasses import dataclass, field
10
import itertools as it
11
import pandas as pd
12
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
13
from zensols.cli import ProgramNameConfigurator
14
from zensols.nlp import FeatureDocumentParser, FeatureDocument
15
from zensols.nlp.dataframe import FeatureDataFrameFactory
16
17
# sentence used by ``Application.show`` when no sentence is given, and passed
# as the ``dump`` argument in the prototyping arguments at the end of this file
DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'
18
19
20
# the definition of the application class executed from the CLI glue code
21
@dataclass
class Application:
    """Demonstrates access to UTS.

    """
    # NOTE(review): the docstrings in this class are kept verbatim since the
    # CLI glue code presumably uses them for the generated help -- confirm
    # before rewording (the class summary mentions UTS, yet nothing in this
    # class calls it)

    # keep the injected ``doc_parser`` field out of the options listed by the
    # online help generated with the -h option
    CLI_META = {'option_excludes': {'doc_parser'}}

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    def _boundary(self, s: str):
        # print the heading framed by five dashes on each side
        rule = '-' * 5
        print(rule + s + rule)

    def dump(self, sent: str):
        """Dump all features available to a CSV file."""
        parsed: FeatureDocument = self.doc_parser(sent)
        # one dictionary (row) per token of the parsed document
        rows = [tok.asdict() for tok in parsed.tokens]
        # the output file name is fixed and relative to the working directory
        pd.DataFrame(rows).to_csv('features.csv')

    def show(self, sent: str = None):
        """Parse a sentence and print all features for each token.

        :param sent: the sentence to parse and generate features

        """
        # fall back to the module level example sentence when none is given
        text = DEFAULT_SENT if sent is None else sent
        self._boundary(f'sentence: <{text}>')

        # parse the text in to a hierarchical language data structure
        parsed: FeatureDocument = self.doc_parser(text)
        print('first three tokens:')
        for token in it.islice(parsed.token_iter(), 3):
            print(token.norm)
            token.write_attributes(1, include_type=False)

        # named entities are also stored as contiguous tokens at the document
        # level; the CUI is read from the first token of each entity
        self._boundary('named entities:')
        for entity in parsed.entities:
            print(f'{entity}: cui={entity[0].cui_}')

        # generate a set of features from the document as a Pandas data frame
        # and print it; the list is passed along with the set so the factory
        # also receives the features in this order
        columns = [
            'idx', 'i', 'norm', 'is_concept', 'cui_', 'pref_name_', 'ent_',
        ]
        factory = FeatureDataFrameFactory(set(columns), columns)
        frame: pd.DataFrame = factory(parsed)
        self._boundary('features as a Pandas data frame')
        print(frame)
73
74
75
if __name__ == '__main__':
    # build the command line harness for this example: the application
    # context is read from ``features.conf`` and the program name section
    # defaults to ``features``
    harness = CliHarness(
        app_config_resource='features.conf',
        app_config_context=ProgramNameConfigurator(
            None, default='features').create_section(),
        # arguments and reload pattern given to the harness for prototyping
        proto_args=['dump', DEFAULT_SENT],
        proto_factory_kwargs={'reload_pattern': '^features'},
    )
    harness.run()