# tests/training/qlf_config.yml
# Training configuration for an EDS-NLP span-qualifier pipeline.
# ${...} references are confit-style interpolations resolved at load time.

# 🤖 PIPELINE DEFINITION
nlp:
  "@core": pipeline

  lang: eds

  components:
    normalizer:
      '@factory': eds.normalizer

    sentencizer:
      '@factory': eds.sentences

    covid:
      '@factory': eds.covid

    qualifier:
      '@factory': eds.span_classifier
      attributes: { "_.negation": [ "sosy" ] }
      span_getter: ["ents", "gold_spans"]
      context_getter: { '@misc': eds.span_context_getter, "context_words": 30, "context_sents": 1 }

      # Nested embedding stack: transformer -> text CNN -> span pooler
      embedding:
        '@factory': eds.span_pooler

        embedding:
          '@factory': eds.text_cnn
          kernel_sizes: [ 3 ]

          embedding:
            '@factory': eds.transformer
            model: hf-internal-testing/tiny-bert
            window: 128
            stride: 96

# 📈 SCORERS
scorer:
  speed: true
  qual:
    '@metrics': "eds.span_attributes"
    span_getter: ${nlp.components.qualifier.span_getter}
    qualifiers: ${nlp.components.qualifier.attributes}

# 🎛️ OPTIMIZER
# (disabled to test the default optimizer)
# optimizer:
#   "@optimizers": adam
#   groups:
#     "*.transformer.*":
#       lr: 1e-3
#       schedules:
#         "@schedules": linear
#         "warmup_rate": 0.1
#         "start_value": 0
#     "*":
#       lr: 1e-3
#       schedules:
#         "@schedules": linear
#         "warmup_rate": 0.1
#         "start_value": 1e-3

# 📚 DATA
train_data:
  data:
    "@readers": json
    path: ./dataset.jsonl
    converter:
      - '@factory': 'myproject.custom_dict2doc'
        span_setter: 'gold_spans'
        span_attributes: ['negation']
        bool_attributes: ['negation']  # default json to doc converter
      - '@factory': eds.sentences
        nlp: ${nlp}
      # - '@factory': eds.split
      #   nlp: null
      #   max_length: 10
      #   randomize: 0.3
      #   # sentence regex:
      #   regex: '\\s*(?:\\n\\s*)+()[A-Z]|[.!?]\\s+()[A-Z]'
      #   regex: '\\n{2,}'
  shuffle: dataset
  # "<n> docs" / "<n> words" are batching expressions parsed by the consumer
  # (presumably edsnlp's batching DSL — they load as plain YAML strings).
  batch_size: 4 docs
  pipe_names: [ "qualifier" ]
  sub_batch_size: 10 words

val_data:
  "@readers": json
  path: ./dataset.jsonl
  converter:
    - '@factory': myproject.custom_dict2doc
      span_setter: 'gold_spans'
      span_attributes: ['negation']
      bool_attributes: ['negation']  # default standoff to doc converter

# 🚀 TRAIN SCRIPT OPTIONS
train:
  nlp: ${ nlp }
  train_data: ${ train_data }
  val_data: ${ val_data }
  max_steps: 40
  validation_interval: 10
  max_grad_norm: 1.0
  scorer: ${ scorer }
  num_workers: 1