Diff of /base_config.json [000000] .. [79668b]

Switch to unified view

a b/base_config.json
1
{
2
  "adjacent_annotations_slack": "[\\. \\-]?[\\. ]?",
3
  "resolve_overlap_strategy": {
4
    "attributes": [
5
      "priority",
6
      "length"
7
    ],
8
    "ascending": [
9
      false,
10
      false
11
    ]
12
  },
13
  "redactor_open_char": "[",
14
  "redactor_close_char": "]",
15
  "annotators": {
16
    "prefix_with_initial": {
17
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
18
      "group": "names",
19
      "args": {
20
        "tag": "prefix+initiaal",
21
        "skip": ["."],
22
        "pattern": [
23
          {
24
            "lookup": "prefix"
25
          },
26
          {
27
            "or": [
28
              {
29
                "lookup": "initial"
30
              },
31
              {
32
                "is_initials": true
33
              }
34
            ]
35
          }
36
        ]
37
      }
38
    },
39
    "prefix_with_interfix": {
40
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
41
      "group": "names",
42
      "args": {
43
        "tag": "prefix+interfix+naam",
44
        "skip": ["."],
45
        "pattern": [
46
          {
47
            "lookup": "prefix"
48
          },
49
          {
50
            "lookup": "interfix"
51
          },
52
          {
53
            "like_name": true
54
          }
55
        ]
56
      }
57
    },
58
    "prefix_with_name": {
59
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
60
      "group": "names",
61
      "args": {
62
        "tag": "prefix+naam",
63
        "skip": ["."],
64
        "pattern": [
65
          {
66
            "lookup": "prefix"
67
          },
68
          {
69
            "and": [
70
              {
71
                "like_name": true
72
              },
73
              {
74
                "neg_lookup": "whitelist"
75
              }
76
            ]
77
          }
78
        ]
79
      }
80
    },
81
    "interfix_with_name": {
82
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
83
      "group": "names",
84
      "args": {
85
        "tag": "interfix+achternaam",
86
        "skip": [],
87
        "pattern": [
88
          {
89
            "lookup": "interfix"
90
          },
91
          {
92
            "and": [
93
              {
94
                "lookup": "interfix_surname"
95
              },
96
              {
97
                "neg_lookup": "whitelist"
98
              }
99
            ]
100
          }
101
        ]
102
      }
103
    },
104
    "initial_with_name": {
105
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
106
      "group": "names",
107
      "args": {
108
        "tag": "initiaal+naam",
109
        "skip": ["."],
110
        "pattern": [
111
          {
112
            "lookup": "initial"
113
          },
114
          {
115
            "and": [
116
              {
117
                "like_name": true
118
              },
119
              {
120
                "neg_lookup": "whitelist"
121
              },
122
              {
123
                "neg_lookup": "prefix"
124
              }
125
            ]
126
          }
127
        ]
128
      }
129
    },
130
    "initial_interfix": {
131
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
132
      "group": "names",
133
      "args": {
134
        "tag": "initiaal+interfix+naam",
135
        "skip": ["."],
136
        "pattern": [
137
          {
138
            "lookup": "initial"
139
          },
140
          {
141
            "lookup": "interfix"
142
          },
143
          {
144
            "like_name": true
145
          }
146
        ]
147
      }
148
    },
149
    "first_name_lookup": {
150
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
151
      "group": "names",
152
      "args": {
153
        "tag": "voornaam",
154
        "overlapping": true,
155
        "lookup_values": "first_name"
156
      }
157
    },
158
    "surname_lookup": {
159
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
160
      "group": "names",
161
      "args": {
162
        "tag": "achternaam",
163
        "overlapping": true,
164
        "lookup_values": "surname"
165
      }
166
    },
167
    "patient_name": {
168
      "annotator_type": "deduce.annotator.PatientNameAnnotator",
169
      "group": "names",
170
      "args": {
171
        "tag": "_"
172
      }
173
    },
174
    "name_context": {
175
      "annotator_type": "deduce.annotator.ContextAnnotator",
176
      "group": "names",
177
      "args": {
178
        "iterative": true,
179
        "pattern": [
180
          {
181
            "name": "interfix_right",
182
            "direction": "right",
183
            "pre_tag": [
184
              "initiaal",
185
              "naam",
186
              "voornaam",
187
              "achternaam",
188
              "voornaam_patient",
189
              "achternaam_patient"
190
            ],
191
            "tag": "{tag}+interfix+achternaam",
192
            "skip": [".", "-"],
193
            "pattern": [
194
              {
195
                "lookup": "interfix"
196
              },
197
              {
198
                "like_name": true
199
              }
200
            ]
201
          },
202
          {
203
            "name": "initial_left",
204
            "direction": "left",
205
            "pre_tag": [
206
              "initiaal",
207
              "naam",
208
              "voornaam",
209
              "achternaam",
210
              "voornaam_patient",
211
              "achternaam_patient",
212
              "interfix"
213
            ],
214
            "tag": "initiaal+{tag}",
215
            "skip": ["."],
216
            "pattern": [
217
              {
218
                "lookup": "initial"
219
              }
220
            ]
221
          },
222
          {
223
            "name": "naam_left",
224
            "direction": "left",
225
            "pre_tag": [
226
              "naam",
227
              "voornaam",
228
              "achternaam",
229
              "voornaam_patient",
230
              "achternaam_patient"
231
            ],
232
            "tag": "naam+{tag}",
233
            "skip": ["-"],
234
            "pattern": [
235
              {
236
                "and": [
237
                  {
238
                    "like_name": true
239
                  },
240
                  {
241
                    "neg_lookup": "whitelist"
242
                  },
243
                  {
244
                    "neg_lookup": "prefix"
245
                  }
246
                ]
247
              }
248
            ]
249
          },
250
          {
251
            "name": "naam_right",
252
            "direction": "right",
253
            "pre_tag": [
254
              "prefix",
255
              "initiaal",
256
              "naam",
257
              "voornaam",
258
              "achternaam",
259
              "voornaam_patient",
260
              "achternaam_patient",
261
              "interfix"
262
            ],
263
            "tag": "{tag}+naam",
264
            "skip": ["-"],
265
            "pattern": [
266
              {
267
                "and": [
268
                  {
269
                    "like_name": true
270
                  },
271
                  {
272
                    "neg_lookup": "whitelist"
273
                  },
274
                  {
275
                    "neg_lookup": "prefix"
276
                  }
277
                ]
278
              }
279
            ]
280
          },
281
          {
282
            "name": "prefix_left",
283
            "direction": "left",
284
            "pre_tag": [
285
              "prefix",
286
              "initiaal",
287
              "naam",
288
              "voornaam",
289
              "achternaam",
290
              "voornaam_patient",
291
              "achternaam_patient",
292
              "interfix"
293
            ],
294
            "tag": "prefix+{tag}",
295
            "skip": ["."],
296
            "pattern": [
297
              {
298
                "and": [
299
                  {
300
                    "lookup": "prefix"
301
                  }
302
                ]
303
              }
304
            ]
305
          }
306
        ]
307
      }
308
    },
309
    "eponymous_disease": {
310
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
311
      "group": "names",
312
      "args": {
313
        "lookup_values": "eponymous_disease",
314
        "tag": "pseudo_name",
315
        "overlapping": true
316
      }
317
    },
318
    "placename": {
319
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
320
      "group": "locations",
321
      "args": {
322
        "lookup_values": "placename",
323
        "overlapping": true,
324
        "tag": "locatie"
325
      }
326
    },
327
    "street_pattern": {
328
      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
329
      "group": "locations",
330
      "args": {
331
        "pattern": [
332
          {
333
            "re_match": "[A-Z][a-z]+(baan|bolwerk|dam|dijk|dreef|drf|dyk|gr|gracht|hf|hof|kade|laan|ln|markt|mrkt|pad|park|pd|plantsoen|plein|pln|plnts|prk|singel|sngl|st|steeg|stg|str|straat|weg|wg)$"
334
          }
335
        ],
336
        "tag": "straat",
337
        "priority": 1
338
      }
339
    },
340
    "street_lookup": {
341
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
342
      "group": "locations",
343
      "args": {
344
        "lookup_values": "street",
345
        "overlapping": true,
346
        "tag": "straat",
347
        "priority": 1
348
      }
349
    },
350
    "housenumber": {
351
      "annotator_type": "deduce.annotator.ContextAnnotator",
352
      "group": "locations",
353
      "args": {
354
        "iterative": true,
355
        "pattern": [
356
          {
357
            "name": "housenumber_right",
358
            "direction": "right",
359
            "pre_tag": [
360
              "straat"
361
            ],
362
            "tag": "{tag}+huisnummer",
363
            "skip": [],
364
            "pattern": [
365
              {
366
                "re_match": "\\d{1,4}$"
367
              }
368
            ]
369
          },
370
          {
371
            "name": "housenumber_housenumberletter_right",
372
            "direction": "right",
373
            "pre_tag": [
374
              "straat"
375
            ],
376
            "tag": "{tag}+huisnummer+huisnummerletter",
377
            "skip": [],
378
            "pattern": [
379
              {
380
                "re_match": "\\d{1,4}[a-zA-Z]$"
381
              }
382
            ]
383
          },
384
          {
385
            "name": "housenumberletter_right",
386
            "direction": "right",
387
            "pre_tag": [
388
              "huisnummer"
389
            ],
390
            "tag": "{tag}+huisnummerletter",
391
            "skip": [],
392
            "pattern": [
393
              {
394
                "re_match": "[a-zA-Z]$"
395
              }
396
            ]
397
          }
398
        ]
399
      }
400
    },
401
    "postal_code": {
402
      "annotator_type": "docdeid.process.RegexpAnnotator",
403
      "group": "locations",
404
      "args": {
405
        "regexp_pattern": "(\\d{4}([A-Za-z]{2}| [A-Z]{2}))(?<!mg|MG|gr|ie)(\\W|$)",
406
        "capturing_group": 1,
407
        "tag": "locatie"
408
      }
409
    },
410
    "postbus": {
411
      "annotator_type": "docdeid.process.RegexpAnnotator",
412
      "group": "locations",
413
      "args": {
414
        "regexp_pattern": "([Pp]ostbus\\s\\d{1,5}(\\.\\d{2,4})?)",
415
        "tag": "locatie",
416
        "pre_match_words": ["postbus"]
417
      }
418
    },
419
    "hospital": {
420
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
421
      "group": "institutions",
422
      "args": {
423
        "lookup_values": "hospital",
424
        "overlapping": true,
425
        "tag": "ziekenhuis"
426
      }
427
    },
428
    "institution": {
429
      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
430
      "group": "institutions",
431
      "args": {
432
        "lookup_values": "healthcare_institution",
433
        "overlapping": true,
434
        "tag": "zorginstelling"
435
      }
436
    },
437
    "date_dmy_1": {
438
      "annotator_type": "docdeid.process.RegexpAnnotator",
439
      "group": "dates",
440
      "args": {
441
        "regexp_pattern": "(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)((19|20|\\'|`)?\\d{2}))(?!\\d)",
442
        "tag": "datum",
443
        "capturing_group": 1
444
      }
445
    },
446
    "date_dmy_2": {
447
      "annotator_type": "docdeid.process.RegexpAnnotator",
448
      "group": "dates",
449
      "args": {
450
        "regexp_pattern": "(?i)(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]((19|20|\\'|`)?\\d{2}))(?!\\d)",
451
        "tag": "datum",
452
        "capturing_group": 1,
453
        "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"]
454
      }
455
    },
456
    "date_ymd_1": {
457
      "annotator_type": "docdeid.process.RegexpAnnotator",
458
      "group": "dates",
459
      "args": {
460
        "regexp_pattern": "(?<!\\d)(((19|20|\\'|`)\\d{2})(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)([1-9]|0[1-9]|[12][0-9]|3[01]))(\\D|$)",
461
        "tag": "datum",
462
        "capturing_group": 1
463
      }
464
    },
465
    "date_ymd_2": {
466
      "annotator_type": "docdeid.process.RegexpAnnotator",
467
      "group": "dates",
468
      "args": {
469
        "regexp_pattern": "(?i)(?<!\\d)(((19|20|\\'|`)\\d{2})[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]([1-9]|0[1-9]|[12][0-9]|3[01]))(?!\\d)",
470
        "tag": "datum",
471
        "capturing_group": 1,
472
        "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"]
473
      }
474
    },
475
    "age": {
476
      "annotator_type": "deduce.annotator.RegexpPseudoAnnotator",
477
      "group": "ages",
478
      "args": {
479
        "regexp_pattern": "(?i)(?<![\\d,\\.])((1?\\d?\\d)([\\.,]5)?(-(1?\\d?\\d)([\\.,]5)?)?)([ -](jaar|jarig|jarige|jr))(?!\\w)",
480
        "pre_pseudo": ["<", "al", "co", "controle", "de", "elke", "gedurende", "na", "nog", "ongeveer", "over", "policontrole", "sinds", "up", "vanaf"],
481
        "post_pseudo": ["aanwezig", "gebruikt", "geleden", "gerookt", "gestaakt", "gestopt", "getrouwd", "na", "naar", "nadien"],
482
        "pre_match_words": ["jaar", "jarig", "jarige", "jr"],
483
        "tag": "leeftijd",
484
        "capturing_group": 1
485
      }
486
    },
487
    "bsn": {
488
      "annotator_type": "deduce.annotator.BsnAnnotator",
489
      "group": "identifiers",
490
      "args": {
491
        "bsn_regexp": "(?<!\\d)(\\d{9})(?!\\d)",
492
        "capture_group": 1,
493
        "priority": 100,
494
        "tag": "bsn"
495
      }
496
    },
497
    "identifier": {
498
      "annotator_type": "docdeid.process.RegexpAnnotator",
499
      "group": "identifiers",
500
      "args": {
501
        "regexp_pattern": "\\d{7,}",
502
        "tag": "id"
503
      }
504
    },
505
    "phone": {
506
      "annotator_type": "deduce.annotator.PhoneNumberAnnotator",
507
      "group": "phone_numbers",
508
      "args": {
509
        "phone_regexp": "(?<!\\d)(\\(?(0031|\\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\\d{2})\\)?) ?-? ?((\\d{2,4}[ -]?)+\\d{2,4})",
510
        "min_digits": 9,
511
        "max_digits": 11,
512
        "tag": "telefoonnummer"
513
      }
514
    },
515
    "email": {
516
      "annotator_type": "docdeid.process.RegexpAnnotator",
517
      "group": "email_addresses",
518
      "args": {
519
        "regexp_pattern": "(([-a-zA-Z0-9:%._\\+~#=]{1,256})@([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu))",
520
        "tag": "emailadres",
521
        "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"]
522
      }
523
    },
524
    "url": {
525
      "annotator_type": "docdeid.process.RegexpAnnotator",
526
      "group": "urls",
527
      "args": {
528
        "regexp_pattern": "((https?:\\/\\/(?:www\\.)?)?([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu)(\\b)([():%_\\+.~,]*[-a-zA-Z0-9#?&/=]+)*)",
529
        "tag": "url",
530
        "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"]
531
      }
532
    }
533
  }
534
}