Switch to unified view

a b/example/cui2vec/cui2vec.py
1
#!/usr/bin/env python
2
3
"""Demonstrates medical term feature generation from spaCy parsed natural
4
language.
5
6
This example needs the ``zensols.deepnlp`` library, install with::
7
8
    pip install zensols.deepnlp
9
10
11
12
"""
13
__author__ = 'Paul Landes'
14
15
from typing import Dict, List, Tuple
16
from dataclasses import dataclass, field
17
import logging
18
from gensim.models.keyedvectors import KeyedVectors
19
from zensols.cli import CliHarness ; CliHarness.add_sys_path('src/python')
20
from zensols.cli import ProgramNameConfigurator, ApplicationError
21
from zensols.mednlp import UTSClient
22
from zensols.mednlp.cui2vec import Cui2VecEmbedModel
23
24
logger = logging.getLogger(__name__)
25
26
27
# the definition of the application class executed from the CLI glue code
28
@dataclass
class Application(object):
    """Demonstrates access to UTS together with the cui2vec embeddings.

    Medical terms are resolved to CUIs with :obj:`uts_client` and then looked
    up in the keyed vectors of :obj:`cui2vec_embedding`.

    """
    # tell the application not to mistake these fields for options when
    # generating the online help with the -h option
    CLI_META = {'option_excludes': {'uts_client', 'cui2vec_embedding'}}

    uts_client: UTSClient = field()
    """Queries UMLS data."""

    cui2vec_embedding: Cui2VecEmbedModel = field()
    """The cui2vec embedding model."""

    def __post_init__(self):
        # suppress gensim API warnings
        import warnings
        warnings.filterwarnings(
            'ignore', message='invalid value encountered in true_divide')

    @property
    def kv(self) -> KeyedVectors:
        """The keyed vectors of the cui2vec embedding model."""
        embedding: Cui2VecEmbedModel = self.cui2vec_embedding
        return embedding.keyed_vectors

    def _search_cui(self, term: str) -> str:
        """Return the first CUI found for ``term`` that has a cui2vec vector.

        :param term: the medical term to search for with the UTS client

        :return: the first CUI of the search results that is in :obj:`kv`

        :raises ApplicationError: if the search has no results, or none of
                                  the resulting CUIs are in :obj:`kv`

        """
        kv: KeyedVectors = self.kv
        res: List[Dict[str, str]] = self.uts_client.search_term(term)
        for rd in res:
            cui: str = rd['ui']
            if cui in kv:
                logger.info(f"found cui: '{term}' -> {cui}")
                # return only on a real match so that a trailing CUI that is
                # not in the embedding is never handed back to the caller
                return cui
        raise ApplicationError(
            f"No CUI for term '{term}' found in cui2vec")

    def similarity(self, term: str = 'heart disease', topn: int = 5):
        """Log the CUIs most similar to the CUI found for a medical term.

        :param term: the medical term

        :param topn: the top N count similarities to return

        """
        kv: KeyedVectors = self.kv
        cui: str = self._search_cui(term)
        sims_by_word: List[Tuple[str, float]] = kv.similar_by_word(cui, topn)
        for rel_cui, proba in sims_by_word:
            rel_atom: Dict[str, str] = self.uts_client.get_atoms(rel_cui)
            rel_name = rel_atom.get('name', 'Unknown')
            logger.info(f'{rel_name} ({rel_cui}): {proba * 100:.2f}%')

    def distance(self, term_a: str, term_b: str) -> float:
        """Get the cosine distance between the CUIs of two medical terms.

        :param term_a: the first medical term

        :param term_b: the second medical term

        :return: the value of the keyed vectors' ``distance`` for the two
                 CUIs

        """
        kv: KeyedVectors = self.kv
        cui_a: str = self._search_cui(term_a)
        cui_b: str = self._search_cui(term_b)
        cos_dis: float = kv.distance(cui_a, cui_b)
        logger.info(f'distance: {cui_a} <-> {cui_b}: {cos_dis}')
        return cos_dis
95
96
97
if __name__ == '__main__':
    # create the configuration section first, then hand it to the harness
    # along with the ``cui2vec.conf`` application configuration resource
    config_context = ProgramNameConfigurator(
        None, default='cui2vec').create_section()
    harness = CliHarness(
        app_config_resource='cui2vec.conf',
        app_config_context=config_context,
        proto_args='',
        proto_factory_kwargs={'reload_pattern': '^cui2vec'},
    )
    harness.run()