[ca4dac]: / resources / cui2vec.yml

Download this file

84 lines (73 with data), 2.8 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# description: UMLS CUI embeddings
## Installer
#
# Update step run against the downloaded resource (referenced from the
# `updates` entry of `cui2vec_resource`).
# NOTE(review): presumably deletes the listed paths, i.e. the `__MACOSX`
# directory left beside the install target when the zip is unpacked -- confirm
# against zensols.install.RemoveUpdate.
cui2vec_rm_update:
  class_name: zensols.install.RemoveUpdate
  # apply the update for real rather than as a dry run
  dry_run: false
  paths:
    - '{target}/../__MACOSX'
# The URL below might need to be replaced with a new one obtained by
# navigating to the embeddings link that follows.
#
# paper: https://arxiv.org/pdf/1804.01486.pdf
# embeddings: https://figshare.com/s/00d69861786cd0156d81
cui2vec_resource:
  class_name: zensols.install.Resource
  url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81'
  # the download is a zip archive, so it is flagged as compressed
  name: 'cui2vec.zip'
  is_compressed: true
  # NOTE(review): presumably the file whose presence marks the resource as
  # already installed -- confirm against zensols.install.Resource
  check_path: 'cui2vec_pretrained.csv'
  # update steps associated with this resource
  updates: 'instance: list: cui2vec_rm_update'
  # if deleted, it will re-download on each access of the weights
  clean_up: false
# Installer for the cui2vec resources; the embedding model section refers to
# it by name.
cui2vec_installer:
  class_name: zensols.install.Installer
  # NOTE(review): presumably the package name used to derive the install
  # location -- confirm against zensols.install.Installer
  package_resource: zensols.mednlp
  # the resources this installer manages
  resources: 'instance: list: cui2vec_resource'
## Embedding
#
# The cui2vec embedding model, wired to the installer and resource sections of
# this file so the pretrained vectors can be located.
cui2vec_500_embedding:
  class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel'
  installer: 'instance: cui2vec_installer'
  resource: 'instance: cui2vec_resource'
  # NOTE(review): presumably keys are not lowercased before lookup -- confirm
  lowercase: false
# a vectorizer that turns tokens (TokensContainer) into indices given to the
# embedding layer
cui2vec_500_feature_vectorizer:
  class_name: zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer
  # the feature id is used to connect instance data with the vectorizer used to
  # generate the feature at run time
  feature_id: 'wvcui2vec500'
  # encode at the (feature) document level
  fold_method: 'concat_tokens'
  # the word embedding model
  embed_model: 'instance: cui2vec_500_embedding'
  # controls whether the decoded output is serialized (pickled) so the work is
  # done up front; the value is read from the `mednlp_default` section
  encode_transformed: '${mednlp_default:cui2vec_encode_transformed}'
  # the FeatureToken attribute used to index the embedding vectors
  token_feature_id: 'cui_'
# a torch.nn.Module implementation that uses an embedding model
cui2vec_500_embedding_layer:
  class_name: zensols.deepnlp.layer.WordVectorEmbeddingLayer
  embed_model: 'instance: cui2vec_500_embedding'
  feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager'
  # whether the embedding is trained; the value is read from the
  # `mednlp_default` section (freezing the embedding trains faster)
  trainable: '${mednlp_default:cui2vec_trainable}'
## Vectorizer
#
# The vectorizer manager that owns the cui2vec feature vectorizer; the
# embedding layer and the batch mappings refer to it by name.
cui2vec_feature_vectorizer_manager:
  class_name: zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager
  torch_config: 'instance: torch_config'
  # the document parser instance is named in the `mednlp_default` section
  doc_parser: 'instance: ${mednlp_default:doc_parser}'
  # do not truncate tokens
  token_length: -1
  # section names of the vectorizers this manager is configured with
  configured_vectorizers: 'list: cui2vec_500_feature_vectorizer'
# cui embeddings: batch mappings that tie the `cui2vec_500_embedding`
# attribute to the feature id of the cui2vec vectorizer under its manager
# NOTE(review): nesting reconstructed as one manager mapping holding one field
# entry -- confirm against the consuming batch mapping schema
cui2vec_feature_batch_mappings:
  manager_mappings:
    - vectorizer_manager_name: cui2vec_feature_vectorizer_manager
      fields:
        - attr: cui2vec_500_embedding
          # kept in sync with the vectorizer section by interpolation; quoted
          # so the leading `$`/`{` can never be read as YAML syntax
          feature_id: '${cui2vec_500_feature_vectorizer:feature_id}'
          is_agg: true
          attr_access: doc