--- a +++ b/resources/cui2vec.yml @@ -0,0 +1,83 @@ +# description: UMLS CUI embeddings + +## Installer +# +cui2vec_rm_update: + class_name: zensols.install.RemoveUpdate + dry_run: false + paths: + - '{target}/../__MACOSX' + +# The below URL might need to be updated from a new URL given by navigating to +# the embeddings link below +# +# paper: https://arxiv.org/pdf/1804.01486.pdf +# embeddings: https://figshare.com/s/00d69861786cd0156d81 +cui2vec_resource: + class_name: zensols.install.Resource + url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81' + name: 'cui2vec.zip' + is_compressed: true + check_path: 'cui2vec_pretrained.csv' + updates: 'instance: list: cui2vec_rm_update' + # if deleted, it will re-download on each access of the weights + clean_up: false + +cui2vec_installer: + class_name: zensols.install.Installer + package_resource: zensols.mednlp + resources: 'instance: list: cui2vec_resource' + + +## Embedding +# +cui2vec_500_embedding: + class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel' + installer: 'instance: cui2vec_installer' + resource: 'instance: cui2vec_resource' + lowercase: false + +# a vectorizer that turns tokens (TokensContainer) in to indexes given to the +# embedding layer +cui2vec_500_feature_vectorizer: + class_name: zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer + # the feature id is used to connect instance data with the vectorizer used to + # generate the feature at run time + feature_id: 'wvcui2vec500' + # encode at the (feature) document level + fold_method: 'concat_tokens' + # the word embedding model + embed_model: 'instance: cui2vec_500_embedding' + # do not serialize (pickle) the decoded output to do the work up front + encode_transformed: '${mednlp_default:cui2vec_encode_transformed}' + # the FeatureToken attribute used to index the embedding vectors + token_feature_id: 'cui_' + +# a torch.nn.Module implementation that uses the an embedding model +cui2vec_500_embedding_layer: + class_name: zensols.deepnlp.layer.WordVectorEmbeddingLayer + embed_model: 'instance: cui2vec_500_embedding' + feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager' + # freeze the embedding to train faster + trainable: '${mednlp_default:cui2vec_trainable}' + + +## Vectorizer +# +cui2vec_feature_vectorizer_manager: + class_name: zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager + torch_config: 'instance: torch_config' + doc_parser: 'instance: ${mednlp_default:doc_parser}' + # do not truncate tokens + token_length: -1 + configured_vectorizers: 'list: cui2vec_500_feature_vectorizer' + +# cui embeddings +cui2vec_feature_batch_mappings: + manager_mappings: + - vectorizer_manager_name: cui2vec_feature_vectorizer_manager + fields: + - attr: cui2vec_500_embedding + feature_id: ${cui2vec_500_feature_vectorizer:feature_id} + is_agg: true + attr_access: doc