Diff of /resources/cui2vec.yml [000000] .. [ca4dac]

Switch to unified view

a b/resources/cui2vec.yml
1
# description: UMLS CUI embeddings
2
3
## Installer
4
#
5
# removal step referenced by the `updates` option of cui2vec_resource
cui2vec_rm_update:
6
  class_name: zensols.install.RemoveUpdate
7
  # NOTE(review): presumably `false` means the paths are really removed rather
  # than only reported -- confirm against zensols.install.RemoveUpdate
  dry_run: false
8
  # presumably the `__MACOSX` directory that some zip tools add next to the
  # extracted target; `{target}` is filled in by the consumer -- TODO confirm
  paths:
9
    - '{target}/../__MACOSX'
10
11
# The URL below may need to be replaced by a new one found by navigating to
12
# the embeddings link below
13
#
14
# paper: https://arxiv.org/pdf/1804.01486.pdf
15
# embeddings: https://figshare.com/s/00d69861786cd0156d81
16
# the downloadable cui2vec archive; referenced by cui2vec_installer
# (`resources`) and by cui2vec_500_embedding (`resource`)
cui2vec_resource:
17
  class_name: zensols.install.Resource
18
  # direct download link; its `private_link` token is the same token that
  # ends the figshare embeddings page URL
  url: 'https://figshare.com/ndownloader/files/10959626?private_link=00d69861786cd0156d81'
19
  # presumably the local file name given to the download -- TODO confirm
  name: 'cui2vec.zip'
20
  is_compressed: true
21
  # NOTE(review): presumably the extracted file whose presence marks the
  # resource as installed -- confirm against zensols.install.Resource
  check_path: 'cui2vec_pretrained.csv'
22
  # references the cui2vec_rm_update section of this file
  updates: 'instance: list: cui2vec_rm_update'
23
  # keep the download; if deleted, it will re-download on each access of the
  # weights
24
  clean_up: false
25
26
# installer for the cui2vec download; referenced by the `installer` option of
# cui2vec_500_embedding
cui2vec_installer:
27
  class_name: zensols.install.Installer
28
  # NOTE(review): presumably the package name used to derive the install
  # location -- confirm against zensols.install.Installer
  package_resource: zensols.mednlp
29
  # the only resource handled by this installer is cui2vec_resource
  resources: 'instance: list: cui2vec_resource'
30
31
32
## Embedding
33
#
34
# the cui2vec embedding model; referenced as `embed_model` by both
# cui2vec_500_feature_vectorizer and cui2vec_500_embedding_layer
cui2vec_500_embedding:
35
  class_name: 'zensols.mednlp.cui2vec.Cui2VecEmbedModel'
36
  # the installer and resource sections defined earlier in this file
  installer: 'instance: cui2vec_installer'
37
  resource: 'instance: cui2vec_resource'
38
  # NOTE(review): presumably disables lowercasing of the lookup keys (CUIs)
  # -- confirm against Cui2VecEmbedModel
  lowercase: false
39
40
# a vectorizer that turns tokens (TokensContainer) into indexes given to the
41
# embedding layer
42
# listed in `configured_vectorizers` of cui2vec_feature_vectorizer_manager
cui2vec_500_feature_vectorizer:
43
  class_name: zensols.deepnlp.vectorize.WordVectorEmbeddingFeatureVectorizer
44
  # the feature id is used to connect instance data with the vectorizer used to
45
  # generate the feature at run time; it is also interpolated by the `fields`
  # entry of cui2vec_feature_batch_mappings
46
  feature_id: 'wvcui2vec500'
47
  # encode at the (feature) document level
48
  fold_method: 'concat_tokens'
49
  # the word embedding model (the cui2vec_500_embedding section of this file)
50
  embed_model: 'instance: cui2vec_500_embedding'
51
  # do not serialize (pickle) the decoded output to do the work up front
52
  # (the actual value is interpolated from option `cui2vec_encode_transformed`
  # of section `mednlp_default`, which is not defined in this part of the file)
  encode_transformed: '${mednlp_default:cui2vec_encode_transformed}'
53
  # the FeatureToken attribute used to index the embedding vectors
54
  token_feature_id: 'cui_'
55
56
# a torch.nn.Module implementation that uses an embedding model
57
cui2vec_500_embedding_layer:
58
  class_name: zensols.deepnlp.layer.WordVectorEmbeddingLayer
59
  # same embedding model section used by cui2vec_500_feature_vectorizer
  embed_model: 'instance: cui2vec_500_embedding'
60
  # the vectorizer manager section defined later in this file
  feature_vectorizer_manager: 'instance: cui2vec_feature_vectorizer_manager'
61
  # freeze the embedding to train faster (the actual value is interpolated
  # from option `cui2vec_trainable` of section `mednlp_default`, which is not
  # defined in this part of the file)
62
  trainable: '${mednlp_default:cui2vec_trainable}'
63
64
65
## Vectorizer
66
#
67
# vectorizer manager referenced by cui2vec_500_embedding_layer and by
# cui2vec_feature_batch_mappings
cui2vec_feature_vectorizer_manager:
68
  class_name: zensols.deepnlp.vectorize.FeatureDocumentVectorizerManager
69
  # `torch_config` is not defined in this part of the file -- presumably
  # supplied by another configuration resource; verify
  torch_config: 'instance: torch_config'
70
  # the parser section name is itself interpolated from option `doc_parser` of
  # section `mednlp_default`
  doc_parser: 'instance: ${mednlp_default:doc_parser}'
71
  # do not truncate tokens
72
  token_length: -1
73
  # NOTE(review): uses `list:` without `instance:`, unlike the other list
  # references in this file -- presumably a list of section names rather than
  # instances; confirm
  configured_vectorizers: 'list: cui2vec_500_feature_vectorizer'
74
75
# cui embeddings
76
cui2vec_feature_batch_mappings:
77
  manager_mappings:
78
    # fields handled by the cui2vec vectorizer manager section of this file
    - vectorizer_manager_name: cui2vec_feature_vectorizer_manager
79
      fields:
80
        - attr: cui2vec_500_embedding
81
          # interpolates the `feature_id` option of
          # cui2vec_500_feature_vectorizer ('wvcui2vec500')
          # NOTE(review): unquoted, unlike the other `${...}` values in this
          # file -- consider quoting for consistency
          feature_id: ${cui2vec_500_feature_vectorizer:feature_id}
82
          # NOTE(review): semantics of `is_agg` and `attr_access` are defined
          # by the consumer of this mapping and are not visible here -- verify
          is_agg: true
83
          attr_access: doc