# tests/tuning/config.yml
# My useful comment
# 🤖 PIPELINE DEFINITION
# Pipeline made of four components, declared in this order:
# normalizer, sentencizer, ner and qualifier.
nlp:
  "@core": pipeline
  lang: eds

  components:
    normalizer:
      '@factory': eds.normalizer

    sentencizer:
      '@factory': eds.sentences

    ner:
      '@factory': eds.ner_crf
      mode: "joint"
      # Also read by `scorer.ner.span_getter` through interpolation.
      target_span_getter: "gold_spans"
      # Set spans both in `ents` and in separate `ent.label` groups
      span_setter: ["ents", "*"]
      infer_span_setter: true

      # Embedding stack: eds.text_cnn wrapping an eds.transformer embedding.
      embedding:
        '@factory': eds.text_cnn
        kernel_sizes: [3]

        embedding:
          '@factory': eds.transformer
          model: hf-internal-testing/tiny-bert
          window: 128
          stride: 96
          # Double-quoted, so `\\n` / `\\s` reach the consumer as the literal
          # two-character sequences `\n` / `\s`.
          # NOTE(review): presumably one [regex, token] pair - confirm against
          # the eds.transformer factory.
          new_tokens: [["(?:\\n\\s*)*\\n", "⏎"]]

    qualifier:
      '@factory': eds.span_classifier
      # `attributes` and `span_getter` are also read by `scorer.qual`
      # through interpolation.
      attributes:
        "_.negation": ["sosy"]
        "_.unit": ["measure"]
      span_getter: ["ents", "gold_spans"]

      embedding:
        '@factory': eds.span_pooler

        # Standalone copy of the ner embedding stack, written out instead of
        # the `${ nlp.components.ner.embedding }` reference. Unlike the ner
        # copy, the transformer below sets no `new_tokens`.
        # NOTE(review): confirm that this difference is intended.
        embedding:
          '@factory': eds.text_cnn
          kernel_sizes: [3]

          embedding:
            '@factory': eds.transformer
            model: hf-internal-testing/tiny-bert
            window: 128
            stride: 96

# 📈 SCORERS
# Span getters and qualifier attributes are interpolated from the `nlp`
# section instead of being repeated here, so each scorer reads the same
# values as the component it evaluates.
scorer:
  speed: true
  qual:
    '@metrics': eds.span_attributes
    span_getter: ${nlp.components.qualifier.span_getter}
    qualifiers: ${nlp.components.qualifier.attributes}
  ner:
    '@metrics': eds.ner_exact
    span_getter: ${nlp.components.ner.target_span_getter}

# 🎛️ OPTIMIZER
optimizer:
  "@core": optimizer
  optim: AdamW
  module: ${ nlp }
  # NOTE(review): keys look like regex patterns selecting parameter groups,
  # with `false` presumably excluding the matched group - confirm against the
  # optimizer factory.
  groups:
    "^transformer": false
    ".*":
      lr:
        "@schedules": linear
        # Mantissa written with an explicit dot: YAML 1.1 loaders such as
        # PyYAML resolve `1e-3` as a string, but `1.0e-3` as a float.
        start_value: 1.0e-3
        max_value: 2.0e-3
        warmup_rate: 0.5
  # Kept in sync with the training length through interpolation.
  total_steps: ${ train.max_steps }

# 📚 DATA
# Two training streams over the same standoff dataset: the first is batched
# in docs and lists the `ner` pipe, the second is batched in spans and lists
# the `qualifier` pipe.
train_data:
  - data:
      '@readers': standoff
      path: tests/training/dataset/
      converter:
        # Default standoff to doc converter
        - '@factory': eds.standoff_dict2doc
          span_setter: 'gold_spans'
          span_attributes: ['sosy', 'unit', 'negation']
          bool_attributes: ['negation']
        - '@factory': eds.sentences
          nlp: ${nlp}
        - '@factory': eds.split
          nlp: null
          max_length: 2000
          # Single-quoted, so the backslashes are passed through literally.
          regex: '\n\n+'
    shuffle: dataset
    batch_size: 8 docs
    pipe_names: ["ner"]
  - data:
      '@readers': standoff
      path: tests/training/dataset/
      converter:
        # Default standoff to doc converter
        - '@factory': eds.standoff_dict2doc
          span_setter: 'gold_spans'
          span_attributes: ['sosy', 'unit', 'negation']
          bool_attributes: ['negation']
    shuffle: dataset
    batch_size: 16 spans
    pipe_names: ["qualifier"]

# Validation data: same reader, path and standoff converter settings as the
# training streams, without shuffling or batching options.
val_data:
  '@readers': standoff
  path: tests/training/dataset/
  converter:
    # Default standoff to doc converter
    - '@factory': eds.standoff_dict2doc
      span_setter: 'gold_spans'
      span_attributes: ['sosy', 'unit', 'negation']
      bool_attributes: ['negation']

# 🚀 TRAIN SCRIPT OPTIONS
# Every `${ ... }` value below points at the top-level section of the same
# name in this file.
train:
  nlp: ${ nlp }
  train_data: ${ train_data }
  val_data: ${ val_data }
  # Also read by `optimizer.total_steps` through interpolation.
  max_steps: 5
  validation_interval: 2
  max_grad_norm: 1.0
  scorer: ${ scorer }
  num_workers: 0
  optimizer: ${ optimizer }
  cpu: true