# description: UMLS CUI embeddings
## Installer
#
# update step referenced by the `updates` entry of `cui2vec_resource`; the
# path is resolved relative to the install target (`{target}`)
# NOTE(review): `__MACOSX` is presumably Finder metadata shipped in the zip
# file -- confirm
cui2vec_rm_update:
  class_name: zensols.install.RemoveUpdate
  dry_run: false
  paths:
    - '{target}/../__MACOSX'
# The URL below may go stale; if it does, get the current download URL by
# navigating to the embeddings link below
#
# paper: https://arxiv.org/pdf/1804.01486.pdf
# embeddings: https://figshare.com/s/00d69861786cd0156d81
cui2vec_resource:
  class_name: zensols.install.Resource
  url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81'
  # local file name of the downloaded archive
  name: 'cui2vec.zip'
  is_compressed: true
  check_path: 'cui2vec_pretrained.csv'
  updates: 'instance: list: cui2vec_rm_update'
  # keep the downloaded file; if deleted, it will re-download on each access
  # of the weights
  clean_up: false
# installer for the resources listed in `resources`
cui2vec_installer:
  class_name: zensols.install.Installer
  package_resource: zensols.mednlp
  resources: 'instance: list: cui2vec_resource'
## Embedding
#
# the cui2vec embedding model, wired to the installer and resource sections
# defined in this file
cui2vec_500_embedding:
  class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel'
  installer: 'instance: cui2vec_installer'
  resource: 'instance: cui2vec_resource'
  lowercase: false
# a vectorizer that turns tokens (TokensContainer) into indexes given to the
# embedding layer
cui2vec_500_feature_vectorizer:
  class_name: zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer
  # the feature id is used to connect instance data with the vectorizer used to
  # generate the feature at run time
  feature_id: 'wvcui2vec500'
  # encode at the (feature) document level
  fold_method: 'concat_tokens'
  # the word embedding model
  embed_model: 'instance: cui2vec_500_embedding'
  # whether to serialize (pickle) the decoded output to do the work up front;
  # the value is taken from the `mednlp_default` section
  encode_transformed: '${mednlp_default:cui2vec_encode_transformed}'
  # the FeatureToken attribute used to index the embedding vectors
  token_feature_id: 'cui_'
# a torch.nn.Module implementation that uses an embedding model
cui2vec_500_embedding_layer:
  class_name: zensols.deepnlp.layer.WordVectorEmbeddingLayer
  embed_model: 'instance: cui2vec_500_embedding'
  feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager'
  # whether the embedding is trainable (a frozen embedding trains faster); the
  # value is taken from the `mednlp_default` section
  trainable: '${mednlp_default:cui2vec_trainable}'
## Vectorizer
#
# vectorizer manager that owns the cui2vec feature vectorizer
cui2vec_feature_vectorizer_manager:
  class_name: zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager
  torch_config: 'instance: torch_config'
  # the document parser section name is taken from `mednlp_default`
  doc_parser: 'instance: ${mednlp_default:doc_parser}'
  # do not truncate tokens
  token_length: -1
  configured_vectorizers: 'list: cui2vec_500_feature_vectorizer'
# cui embeddings: batch field mapping for the cui2vec vectorizer manager
cui2vec_feature_batch_mappings:
  manager_mappings:
    - vectorizer_manager_name: cui2vec_feature_vectorizer_manager
      fields:
        - attr: cui2vec_500_embedding
          # reuse the feature id declared by the vectorizer section
          feature_id: '${cui2vec_500_feature_vectorizer:feature_id}'
          is_agg: true
          attr_access: doc