mednlp / Git / Diff of /example/features/features.py

Models:
philipB/
mednlp
Downloads: 1
Diff of /example/features/features.py [000000] .. [ca4dac]
Switch to side-by-side view

--- a
+++ b/example/features/features.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+"""Demonstrates medical term feature generation from spaCy parsed natural
+language.
+
+"""
+__author__ = 'Paul Landes'
+
+from dataclasses import dataclass, field
+import itertools as it
+import pandas as pd
+from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
+from zensols.cli import ProgramNameConfigurator
+from zensols.nlp import FeatureDocumentParser, FeatureDocument
+from zensols.nlp.dataframe import FeatureDataFrameFactory
+
+DEFAULT_SENT = 'He was diagnosed with kidney failure in the United States.'
+
+
+# the definition of the application class executed from the CLI glue code
+@dataclass
+class Application(object):
+    """Demonstrates access to UTS.
+
+    """
+    # tell the application not mistake the `doc_parser` as an option when
+    # generating the online help with the -h option
+    CLI_META = {'option_excludes': {'doc_parser'}}
+
+    doc_parser: FeatureDocumentParser = field()
+    """Parses and NER tags medical terms."""
+
+    def _boundary(self, s: str):
+        print(''.join(['-' * 5, s, '-' * 5]))
+
+    def dump(self, sent: str):
+        """Dump all features available to a CSV file."""
+        doc: FeatureDocument = self.doc_parser(sent)
+        df = pd.DataFrame(map(lambda t: t.asdict(), doc.tokens))
+        df.to_csv('features.csv')
+
+    def show(self, sent: str = None):
+        """Parse a sentence and print all features for each token.
+
+        :param sent: the sentence to parse and generate features
+
+        """
+        if sent is None:
+            sent = DEFAULT_SENT
+
+        self._boundary(f'sentence: <{sent}>')
+
+        # parse the text in to a hierarchical langauge data structure
+        doc: FeatureDocument = self.doc_parser(sent)
+        print('first three tokens:')
+        for tok in it.islice(doc.token_iter(), 3):
+            print(tok.norm)
+            tok.write_attributes(1, include_type=False)
+
+        # named entities are also stored contiguous tokens at the document
+        # level
+        self._boundary('named entities:')
+        for e in doc.entities:
+            print(f'{e}: cui={e[0].cui_}')
+
+        # generate a set of features from the document as a Pandas data frame
+        # and print it
+        feats = 'idx i norm is_concept cui_ pref_name_ ent_'.split()
+        fac = FeatureDataFrameFactory(set(feats), feats)
+        df: pd.DataFrame = fac(doc)
+        self._boundary('features as a Pandas data frame')
+        print(df)
+
+
+if (__name__ == '__main__'):
+    CliHarness(
+        app_config_resource='features.conf',
+        app_config_context=ProgramNameConfigurator(
+            None, default='features').create_section(),
+        proto_args=['dump', DEFAULT_SENT],
+        proto_factory_kwargs={'reload_pattern': '^features'},
+    ).run()