mednlp / Git / [ca4dac] /resources/cui2vec.yml

Models:
philipB/
mednlp
Downloads: 1
[ca4dac]: / resources / cui2vec.yml
History
Download this file
84 lines (73 with data), 2.8 kB

# description: UMLS CUI embeddings

## Installer
#
cui2vec_rm_update:
  class_name: 'zensols.install.RemoveUpdate'
  # not a dry run
  dry_run: false
  # NOTE(review): presumably the macOS metadata directory unpacked alongside
  # the install target -- confirm against the archive contents
  paths:
    - '{target}/../__MACOSX'

# The URL below might need to be replaced with a new one obtained by navigating
# to the embeddings link below
#
# paper: https://arxiv.org/pdf/1804.01486.pdf
# embeddings: https://figshare.com/s/00d69861786cd0156d81
cui2vec_resource:
  class_name: 'zensols.install.Resource'
  # direct download link for the archive
  url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81'
  # local file name of the downloaded archive
  name: 'cui2vec.zip'
  is_compressed: true
  # NOTE(review): presumably the file looked for to decide whether the resource
  # is already installed -- confirm
  check_path: 'cui2vec_pretrained.csv'
  # update entries applied to this resource (see cui2vec_rm_update)
  updates: 'instance: list: cui2vec_rm_update'
  # keep the download: if deleted, it is re-downloaded on each access of the
  # weights
  clean_up: false

cui2vec_installer:
  class_name: 'zensols.install.Installer'
  # NOTE(review): presumably the package name used to derive the install
  # location -- confirm
  package_resource: 'zensols.mednlp'
  # the resources this installer handles (only the cui2vec archive)
  resources: 'instance: list: cui2vec_resource'


## Embedding
#
cui2vec_500_embedding:
  class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel'
  # the installer and the resource, both defined in the installer section of
  # this file
  installer: 'instance: cui2vec_installer'
  resource: 'instance: cui2vec_resource'
  # NOTE(review): presumably keeps the embedding keys case sensitive -- confirm
  lowercase: false

# a vectorizer that turns tokens (TokensContainer) into indexes given to the
# embedding layer
cui2vec_500_feature_vectorizer:
  class_name: 'zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer'
  # identifier that connects instance data with the vectorizer that generates
  # the feature at run time; also referenced by cui2vec_feature_batch_mappings
  feature_id: 'wvcui2vec500'
  # encode at the (feature) document level
  fold_method: 'concat_tokens'
  # the word embedding model defined in this file
  embed_model: 'instance: cui2vec_500_embedding'
  # original intent: do not serialize (pickle) the decoded output to do the
  # work up front; the actual value is read from the mednlp_default section,
  # which is not defined in this file
  encode_transformed: '${mednlp_default:cui2vec_encode_transformed}'
  # the FeatureToken attribute used to index the embedding vectors
  token_feature_id: 'cui_'

# a torch.nn.Module implementation that uses an embedding model
cui2vec_500_embedding_layer:
  class_name: 'zensols.deepnlp.layer.WordVectorEmbeddingLayer'
  # the embedding model and vectorizer manager defined in this file
  embed_model: 'instance: cui2vec_500_embedding'
  feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager'
  # freezing the embedding trains faster; the actual value is read from the
  # mednlp_default section, which is not defined in this file
  trainable: '${mednlp_default:cui2vec_trainable}'


## Vectorizer
#
cui2vec_feature_vectorizer_manager:
  class_name: 'zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager'
  # torch_config is not defined in this file
  torch_config: 'instance: torch_config'
  # the parser section name is read from the mednlp_default section
  doc_parser: 'instance: ${mednlp_default:doc_parser}'
  # do not truncate tokens
  token_length: -1
  # the vectorizers this manager owns (only the cui2vec vectorizer)
  configured_vectorizers: 'list: cui2vec_500_feature_vectorizer'

# cui embeddings
cui2vec_feature_batch_mappings:
  manager_mappings:
    # fields produced by the vectorizer manager defined in this file
    - vectorizer_manager_name: 'cui2vec_feature_vectorizer_manager'
      fields:
        - attr: 'cui2vec_500_embedding'
          # reuse the feature id declared on the vectorizer so the two cannot
          # drift apart
          feature_id: '${cui2vec_500_feature_vectorizer:feature_id}'
          is_agg: true
          attr_access: 'doc'