# description: UMLS CUI embeddings

## Installer
#
# Update entry referenced by the `updates` option of `cui2vec_resource`.  Its
# only path is the `__MACOSX` directory one level above `{target}`.
# NOTE(review): presumably this removes the macOS metadata directory left
# behind when the archive is unpacked -- confirm against
# zensols.install.RemoveUpdate.
cui2vec_rm_update:
  class_name: zensols.install.RemoveUpdate
  dry_run: false
  paths:
    - '{target}/../__MACOSX'

# Resource entry for the pretrained cui2vec archive (`cui2vec.zip`).
#
# The below URL might need to be updated from a new URL given by navigating to
# the embeddings link below
#
# paper: https://arxiv.org/pdf/1804.01486.pdf
# embeddings: https://figshare.com/s/00d69861786cd0156d81
cui2vec_resource:
  class_name: zensols.install.Resource
  url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81'
  name: 'cui2vec.zip'
  is_compressed: true
  # NOTE(review): presumably the file whose presence marks the resource as
  # installed -- confirm against zensols.install.Resource
  check_path: 'cui2vec_pretrained.csv'
  updates: 'instance: list: cui2vec_rm_update'
  # if deleted, it will re-download on each access of the weights
  clean_up: false

# Installer entry whose resource list contains only `cui2vec_resource`; it is
# referenced by the `installer` option of `cui2vec_500_embedding`.
cui2vec_installer:
  class_name: zensols.install.Installer
  package_resource: zensols.mednlp
  resources: 'instance: list: cui2vec_resource'


## Embedding
#
# The cui2vec embedding model entry; it is wired to the installer and resource
# entries defined in this file and is referenced by both the feature
# vectorizer and the embedding layer.
cui2vec_500_embedding:
  class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel'
  installer: 'instance: cui2vec_installer'
  resource: 'instance: cui2vec_resource'
  lowercase: false

# a vectorizer that turns tokens (TokensContainer) into indexes given to the
# embedding layer
cui2vec_500_feature_vectorizer:
  class_name: zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer
  # the feature id is used to connect instance data with the vectorizer used to
  # generate the feature at run time
  feature_id: 'wvcui2vec500'
  # encode at the (feature) document level
  fold_method: 'concat_tokens'
  # the word embedding model
  embed_model: 'instance: cui2vec_500_embedding'
  # do not serialize (pickle) the decoded output to do the work up front; the
  # value is taken from the `mednlp_default` section
  encode_transformed: '${mednlp_default:cui2vec_encode_transformed}'
  # the FeatureToken attribute used to index the embedding vectors
  token_feature_id: 'cui_'

# a torch.nn.Module implementation that uses an embedding model
cui2vec_500_embedding_layer:
  class_name: zensols.deepnlp.layer.WordVectorEmbeddingLayer
  embed_model: 'instance: cui2vec_500_embedding'
  feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager'
  # freeze the embedding to train faster; the value is taken from the
  # `mednlp_default` section
  trainable: '${mednlp_default:cui2vec_trainable}'


## Vectorizer
#
# Vectorizer manager entry whose only configured vectorizer is
# `cui2vec_500_feature_vectorizer`; the document parser name is taken from the
# `mednlp_default` section.
cui2vec_feature_vectorizer_manager:
  class_name: zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager
  torch_config: 'instance: torch_config'
  doc_parser: 'instance: ${mednlp_default:doc_parser}'
  # do not truncate tokens
  token_length: -1
  configured_vectorizers: 'list: cui2vec_500_feature_vectorizer'

# cui embeddings: batch mapping that ties the `cui2vec_500_embedding` attribute
# to the feature id declared by `cui2vec_500_feature_vectorizer`
cui2vec_feature_batch_mappings:
  manager_mappings:
    - vectorizer_manager_name: cui2vec_feature_vectorizer_manager
      fields:
        - attr: cui2vec_500_embedding
          # interpolated so the id stays in sync with the vectorizer section
          feature_id: ${cui2vec_500_feature_vectorizer:feature_id}
          is_agg: true
          attr_access: doc