--- a +++ b/example/cui2vec/cui2vec.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +"""Demonstrates medical term feature generation from spaCy parsed natural +langauge. + +This example needs the ``zensols.deepnlp`` library, install with:: + + pip install zensols.deepnlp + + + +""" +__author__ = 'Paul Landes' + +from typing import Dict, List, Tuple +from dataclasses import dataclass, field +import logging +from gensim.models.keyedvectors import KeyedVectors +from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python') +from zensols.cli import ProgramNameConfigurator, ApplicationError +from zensols.mednlp import UTSClient +from zensols.mednlp.cui2vec import Cui2VecEmbedModel + +logger = logging.getLogger(__name__) + + +# the definition of the application class executed from the CLI glue code +@dataclass +class Application(object): + """Demonstrates access to UTS. + + """ + # tell the application not mistake the fields as an option when generating + # the online help with the -h option + CLI_META = {'option_excludes': {'uts_client', 'cui2vec_embedding'}} + + uts_client: UTSClient = field() + """Queries UMLS data.""" + + cui2vec_embedding: Cui2VecEmbedModel = field() + """The cui2vec embedding model.""" + + def __post_init__(self): + # suppress gensim API warnings + import warnings + warnings.filterwarnings( + 'ignore', message='invalid value encountered in true_divide') + + @property + def kv(self) -> KeyedVectors: + embedding: Cui2VecEmbedModel = self.cui2vec_embedding + return embedding.keyed_vectors + + def _search_cui(self, term: str) -> str: + kv: KeyedVectors = self.kv + res: List[Dict[str, str]] = self.uts_client.search_term(term) + cui: str = None + for rd in res: + cui = rd['ui'] + if cui in kv: + logger.info(f"found cui: '{term}' -> {cui}") + break + if cui is None: + raise ApplicationError(f'CUI {cui} not found in cui2vec') + return cui + + def similarity(self, term: str = 'heart disease', topn: int = 5): + """Get the cosine similarity between two CUIs. + + :param term: the medical term + + :param topn: the top N count similarities to return + + """ + kv: KeyedVectors = self.kv + cui: str = self._search_cui(term) + sims_by_word: List[Tuple[str, float]] = kv.similar_by_word(cui, topn) + for rel_cui, proba in sims_by_word: + rel_atom: Dict[str, str] = self.uts_client.get_atoms(rel_cui) + rel_name = rel_atom.get('name', 'Unknown') + logger.info(f'{rel_name} ({rel_cui}): {proba * 100:.2f}%') + + def distance(self, term_a: str, term_b: str) -> float: + """Get the cosine similarity between two CUIs. + + :param term: the medical term + + """ + kv: KeyedVectors = self.kv + cui_a: str = self._search_cui(term_a) + cui_b: str = self._search_cui(term_b) + cos_dis: float = kv.distance(cui_a, cui_b) + logger.info(f'similarity: {cui_a} <-> {cui_b}: {cos_dis}') + return cos_dis + + +if (__name__ == '__main__'): + CliHarness( + app_config_resource='cui2vec.conf', + app_config_context=ProgramNameConfigurator( + None, default='cui2vec').create_section(), + proto_args='', + proto_factory_kwargs={'reload_pattern': '^cui2vec'}, + ).run()