# Source: example/features/features.py (83 lines, 2.7 kB)
#!/usr/bin/env python
"""Demonstrates medical term feature generation from spaCy parsed natural
language.
"""
__author__ = 'Paul Landes'
from dataclasses import dataclass, field
import itertools as it
import pandas as pd
# add the project source tree to the path before importing the rest of zensols
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
from zensols.cli import ProgramNameConfigurator
from zensols.nlp import FeatureDocumentParser, FeatureDocument
from zensols.nlp.dataframe import FeatureDataFrameFactory
# fallback sentence used when the ``show`` action gets no input
DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'
# the definition of the application class executed from the CLI glue code
@dataclass
class Application(object):
    """Demonstrates access to UTS.

    """
    # tell the application not mistake the `doc_parser` as an option when
    # generating the online help with the -h option
    CLI_META = {'option_excludes': {'doc_parser'}}

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    def _boundary(self, s: str):
        """Print ``s`` framed by dashes as a visual section divider."""
        print(''.join(['-' * 5, s, '-' * 5]))

    def dump(self, sent: str):
        """Dump all features available to a CSV file.

        :param sent: the sentence to parse and featurize

        """
        doc: FeatureDocument = self.doc_parser(sent)
        # one row per token with every token attribute as a column
        df = pd.DataFrame(map(lambda t: t.asdict(), doc.tokens))
        df.to_csv('features.csv')

    def show(self, sent: str = None):
        """Parse a sentence and print all features for each token.

        :param sent: the sentence to parse and generate features; defaults to
                     :obj:`DEFAULT_SENT` when not given

        """
        if sent is None:
            sent = DEFAULT_SENT
        self._boundary(f'sentence: <{sent}>')
        # parse the text in to a hierarchical language data structure
        doc: FeatureDocument = self.doc_parser(sent)
        print('first three tokens:')
        for tok in it.islice(doc.token_iter(), 3):
            print(tok.norm)
            tok.write_attributes(1, include_type=False)
        # named entities are also stored contiguous tokens at the document
        # level
        self._boundary('named entities:')
        for e in doc.entities:
            print(f'{e}: cui={e[0].cui_}')
        # generate a set of features from the document as a Pandas data frame
        # and print it
        feats = 'idx i norm is_concept cui_ pref_name_ ent_'.split()
        fac = FeatureDataFrameFactory(set(feats), feats)
        df: pd.DataFrame = fac(doc)
        self._boundary('features as a Pandas data frame')
        print(df)
if (__name__ == '__main__'):
    # CLI glue: wire the app config and run the command-line harness; the
    # prototype args/kwargs are used only when run from a REPL for debugging
    CliHarness(
        app_config_resource='features.conf',
        app_config_context=ProgramNameConfigurator(
            None, default='features').create_section(),
        proto_args=['dump', DEFAULT_SENT],
        proto_factory_kwargs={'reload_pattern': '^features'},
    ).run()