#!/usr/bin/env python

"""Demonstrates medical term feature generation from spaCy parsed natural
language.

"""
__author__ = 'Paul Landes'

from dataclasses import dataclass, field
import itertools as it
import pandas as pd
# the ``add_sys_path`` call is kept on the import line so that it runs before
# the remaining ``zensols`` imports below are resolved
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
from zensols.cli import ProgramNameConfigurator
from zensols.nlp import FeatureDocumentParser, FeatureDocument
from zensols.nlp.dataframe import FeatureDataFrameFactory

# sentence used by ``show`` when none is given and by the prototyping
# arguments at the bottom of this file
DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'


# the definition of the application class executed from the CLI glue code
@dataclass
class Application:
    """Parse a sentence and report the features generated for its tokens.

    """
    # tell the application not to mistake the `doc_parser` as an option when
    # generating the online help with the -h option
    CLI_META = {'option_excludes': {'doc_parser'}}

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    def _boundary(self, s: str):
        """Print ``s`` framed by five dashes on each side.

        :param s: the text to print between the dashes

        """
        rule = '-' * 5
        print(f'{rule}{s}{rule}')

    def dump(self, sent: str):
        """Write the features of every token in a sentence to ``features.csv``.

        :param sent: the sentence to parse and generate features

        """
        doc: FeatureDocument = self.doc_parser(sent)
        # one dictionary of features per token becomes one data frame row
        rows = [token.asdict() for token in doc.tokens]
        pd.DataFrame(rows).to_csv('features.csv')

    def show(self, sent: str = None):
        """Print token features, named entities and a feature data frame.

        :param sent: the sentence to parse and generate features; the
                     default sentence is used when not given

        """
        text = DEFAULT_SENT if sent is None else sent
        self._boundary(f'sentence: <{text}>')

        # parse the text into a hierarchical language data structure
        doc: FeatureDocument = self.doc_parser(text)
        print('first three tokens:')
        for token in it.islice(doc.token_iter(), 3):
            print(token.norm)
            token.write_attributes(1, include_type=False)

        # named entities are also stored as contiguous tokens at the document
        # level; the ``cui_`` of the first token of each entity is reported
        self._boundary('named entities:')
        for entity in doc.entities:
            print(f'{entity}: cui={entity[0].cui_}')

        # generate a set of features from the document as a Pandas data frame
        # and print it
        columns = [
            'idx', 'i', 'norm', 'is_concept', 'cui_', 'pref_name_', 'ent_',
        ]
        factory = FeatureDataFrameFactory(set(columns), columns)
        frame: pd.DataFrame = factory(doc)
        self._boundary('features as a Pandas data frame')
        print(frame)


if __name__ == '__main__':
    harness = CliHarness(
        app_config_resource='features.conf',
        app_config_context=ProgramNameConfigurator(
            None, default='features').create_section(),
        proto_args=['dump', DEFAULT_SENT],
        proto_factory_kwargs={'reload_pattern': '^features'},
    )
    harness.run()