# tests/training/qlf_config.yml
# 🤖 PIPELINE DEFINITION
# NOTE(review): the nesting in this section was rebuilt from a flattened view
# of the file (indentation was lost) — confirm against the repository copy.
nlp:
  '@core': pipeline

  lang: eds

  components:
    normalizer:
      '@factory': eds.normalizer

    sentencizer:
      '@factory': eds.sentences

    covid:
      '@factory': eds.covid

    qualifier:
      '@factory': eds.span_classifier
      # Referenced by scorer.qual.qualifiers.
      attributes:
        '_.negation': ['sosy']
      # Referenced by scorer.qual.span_getter.
      span_getter: ['ents', 'gold_spans']
      context_getter:
        '@misc': eds.span_context_getter
        context_words: 30
        context_sents: 1

      # Nested embeddings, outermost first:
      # span pooler -> text CNN -> transformer.
      embedding:
        '@factory': eds.span_pooler

        embedding:
          '@factory': eds.text_cnn
          kernel_sizes: [3]

          embedding:
            '@factory': eds.transformer
            model: hf-internal-testing/tiny-bert
            window: 128
            stride: 96

# 📈 SCORERS
scorer:
  speed: true
  qual:
    '@metrics': eds.span_attributes
    # Both values are interpolated from the qualifier component's config.
    span_getter: ${nlp.components.qualifier.span_getter}
    qualifiers: ${nlp.components.qualifier.attributes}

# 🎛️ OPTIMIZER
# (disabled to test the default optimizer)
# optimizer:
#   "@optimizers": adam
#   groups:
#     "*.transformer.*":
#       lr: 1e-3
#       schedules:
#         "@schedules": linear
#         "warmup_rate": 0.1
#         "start_value": 0
#     "*":
#       lr: 1e-3
#       schedules:
#         "@schedules": linear
#         "warmup_rate": 0.1
#         "start_value": 1e-3

# 📚 DATA
# NOTE(review): the nesting in this section was rebuilt from a flattened view
# of the file; in particular, `shuffle`, `batch_size`, `pipe_names` and
# `sub_batch_size` are assumed to be siblings of `data` — confirm.
train_data:
  data:
    '@readers': json
    path: ./dataset.jsonl
    # Converter steps, listed in order.
    converter:
      - '@factory': myproject.custom_dict2doc
        span_setter: 'gold_spans'
        span_attributes: ['negation']
        bool_attributes: ['negation']  # default json to doc converter
      - '@factory': eds.sentences
        nlp: ${nlp}
      # - '@factory': eds.split
      #   nlp: null
      #   max_length: 10
      #   randomize: 0.3
      #   # sentence regex:
      #   regex: '\\s*(?:\\n\\s*)+()[A-Z]|[.!?]\\s+()[A-Z]'
      #   regex: '\\n{2,}'
  shuffle: dataset
  batch_size: 4 docs
  pipe_names: ['qualifier']
  sub_batch_size: 10 words

# Validation data: same file, json reader and custom converter as
# train_data.data, without the eds.sentences converter step.
val_data:
  '@readers': json
  path: ./dataset.jsonl
  converter:
    - '@factory': myproject.custom_dict2doc
      span_setter: 'gold_spans'
      span_attributes: ['negation']
      bool_attributes: ['negation']  # default json to doc converter

# 🚀 TRAIN SCRIPT OPTIONS
# No `optimizer` key is set here: the optimizer section of this file is
# commented out in order to test the default optimizer.
train:
  # These entries reference the top-level sections of this file.
  nlp: ${ nlp }
  train_data: ${ train_data }
  val_data: ${ val_data }
  max_steps: 40
  validation_interval: 10
  max_grad_norm: 1.0
  scorer: ${ scorer }
  num_workers: 1