Diff of /base_config.json [000000] .. [79668b]

Switch to side-by-side view

--- a
+++ b/base_config.json
@@ -0,0 +1,534 @@
+{
+  "adjacent_annotations_slack": "[\\. \\-]?[\\. ]?",
+  "resolve_overlap_strategy": {
+    "attributes": [
+      "priority",
+      "length"
+    ],
+    "ascending": [
+      false,
+      false
+    ]
+  },
+  "redactor_open_char": "[",
+  "redactor_close_char": "]",
+  "annotators": {
+    "prefix_with_initial": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "prefix+initiaal",
+        "skip": ["."],
+        "pattern": [
+          {
+            "lookup": "prefix"
+          },
+          {
+            "or": [
+              {
+                "lookup": "initial"
+              },
+              {
+                "is_initials": true
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "prefix_with_interfix": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "prefix+interfix+naam",
+        "skip": ["."],
+        "pattern": [
+          {
+            "lookup": "prefix"
+          },
+          {
+            "lookup": "interfix"
+          },
+          {
+            "like_name": true
+          }
+        ]
+      }
+    },
+    "prefix_with_name": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "prefix+naam",
+        "skip": ["."],
+        "pattern": [
+          {
+            "lookup": "prefix"
+          },
+          {
+            "and": [
+              {
+                "like_name": true
+              },
+              {
+                "neg_lookup": "whitelist"
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "interfix_with_name": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "interfix+achternaam",
+        "skip": [],
+        "pattern": [
+          {
+            "lookup": "interfix"
+          },
+          {
+            "and": [
+              {
+                "lookup": "interfix_surname"
+              },
+              {
+                "neg_lookup": "whitelist"
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "initial_with_name": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "initiaal+naam",
+        "skip": ["."],
+        "pattern": [
+          {
+            "lookup": "initial"
+          },
+          {
+            "and": [
+              {
+                "like_name": true
+              },
+              {
+                "neg_lookup": "whitelist"
+              },
+              {
+                "neg_lookup": "prefix"
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "initial_interfix": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "initiaal+interfix+naam",
+        "skip": ["."],
+        "pattern": [
+          {
+            "lookup": "initial"
+          },
+          {
+            "lookup": "interfix"
+          },
+          {
+            "like_name": true
+          }
+        ]
+      }
+    },
+    "first_name_lookup": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "voornaam",
+        "overlapping": true,
+        "lookup_values": "first_name"
+      }
+    },
+    "surname_lookup": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "achternaam",
+        "overlapping": true,
+        "lookup_values": "surname"
+      }
+    },
+    "patient_name": {
+      "annotator_type": "deduce.annotator.PatientNameAnnotator",
+      "group": "names",
+      "args": {
+        "tag": "_"
+      }
+    },
+    "name_context": {
+      "annotator_type": "deduce.annotator.ContextAnnotator",
+      "group": "names",
+      "args": {
+        "iterative": true,
+        "pattern": [
+          {
+            "name": "interfix_right",
+            "direction": "right",
+            "pre_tag": [
+              "initiaal",
+              "naam",
+              "voornaam",
+              "achternaam",
+              "voornaam_patient",
+              "achternaam_patient"
+            ],
+            "tag": "{tag}+interfix+achternaam",
+            "skip": [".", "-"],
+            "pattern": [
+              {
+                "lookup": "interfix"
+              },
+              {
+                "like_name": true
+              }
+            ]
+          },
+          {
+            "name": "initial_left",
+            "direction": "left",
+            "pre_tag": [
+              "initiaal",
+              "naam",
+              "voornaam",
+              "achternaam",
+              "voornaam_patient",
+              "achternaam_patient",
+              "interfix"
+            ],
+            "tag": "initiaal+{tag}",
+            "skip": ["."],
+            "pattern": [
+              {
+                "lookup": "initial"
+              }
+            ]
+          },
+          {
+            "name": "naam_left",
+            "direction": "left",
+            "pre_tag": [
+              "naam",
+              "voornaam",
+              "achternaam",
+              "voornaam_patient",
+              "achternaam_patient"
+            ],
+            "tag": "naam+{tag}",
+            "skip": ["-"],
+            "pattern": [
+              {
+                "and": [
+                  {
+                    "like_name": true
+                  },
+                  {
+                    "neg_lookup": "whitelist"
+                  },
+                  {
+                    "neg_lookup": "prefix"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "name": "naam_right",
+            "direction": "right",
+            "pre_tag": [
+              "prefix",
+              "initiaal",
+              "naam",
+              "voornaam",
+              "achternaam",
+              "voornaam_patient",
+              "achternaam_patient",
+              "interfix"
+            ],
+            "tag": "{tag}+naam",
+            "skip": ["-"],
+            "pattern": [
+              {
+                "and": [
+                  {
+                    "like_name": true
+                  },
+                  {
+                    "neg_lookup": "whitelist"
+                  },
+                  {
+                    "neg_lookup": "prefix"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "name": "prefix_left",
+            "direction": "left",
+            "pre_tag": [
+              "prefix",
+              "initiaal",
+              "naam",
+              "voornaam",
+              "achternaam",
+              "voornaam_patient",
+              "achternaam_patient",
+              "interfix"
+            ],
+            "tag": "prefix+{tag}",
+            "skip": ["."],
+            "pattern": [
+              {
+                "and": [
+                  {
+                    "lookup": "prefix"
+                  }
+                ]
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "eponymous_disease": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "names",
+      "args": {
+        "lookup_values": "eponymous_disease",
+        "tag": "pseudo_name",
+        "overlapping": true
+      }
+    },
+    "placename": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "locations",
+      "args": {
+        "lookup_values": "placename",
+        "overlapping": true,
+        "tag": "locatie"
+      }
+    },
+    "street_pattern": {
+      "annotator_type": "deduce.annotator.TokenPatternAnnotator",
+      "group": "locations",
+      "args": {
+        "pattern": [
+          {
+            "re_match": "[A-Z][a-z]+(baan|bolwerk|dam|dijk|dreef|drf|dyk|gr|gracht|hf|hof|kade|laan|ln|markt|mrkt|pad|park|pd|plantsoen|plein|pln|plnts|prk|singel|sngl|st|steeg|stg|str|straat|weg|wg)$"
+          }
+        ],
+        "tag": "straat",
+        "priority": 1
+      }
+    },
+    "street_lookup": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "locations",
+      "args": {
+        "lookup_values": "street",
+        "overlapping": true,
+        "tag": "straat",
+        "priority": 1
+      }
+    },
+    "housenumber": {
+      "annotator_type": "deduce.annotator.ContextAnnotator",
+      "group": "locations",
+      "args": {
+        "iterative": true,
+        "pattern": [
+          {
+            "name": "housenumber_right",
+            "direction": "right",
+            "pre_tag": [
+              "straat"
+            ],
+            "tag": "{tag}+huisnummer",
+            "skip": [],
+            "pattern": [
+              {
+                "re_match": "\\d{1,4}$"
+              }
+            ]
+          },
+          {
+            "name": "housenumber_housenumberletter_right",
+            "direction": "right",
+            "pre_tag": [
+              "straat"
+            ],
+            "tag": "{tag}+huisnummer+huisnummerletter",
+            "skip": [],
+            "pattern": [
+              {
+                "re_match": "\\d{1,4}[a-zA-Z]$"
+              }
+            ]
+          },
+          {
+            "name": "housenumberletter_right",
+            "direction": "right",
+            "pre_tag": [
+              "huisnummer"
+            ],
+            "tag": "{tag}+huisnummerletter",
+            "skip": [],
+            "pattern": [
+              {
+                "re_match": "[a-zA-Z]$"
+              }
+            ]
+          }
+        ]
+      }
+    },
+    "postal_code": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "locations",
+      "args": {
+        "regexp_pattern": "(\\d{4}([A-Za-z]{2}| [A-Z]{2}))(?<!mg|MG|gr|ie)(\\W|$)",
+        "capturing_group": 1,
+        "tag": "locatie"
+      }
+    },
+    "postbus": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "locations",
+      "args": {
+        "regexp_pattern": "([Pp]ostbus\\s\\d{1,5}(\\.\\d{2,4})?)",
+        "tag": "locatie",
+        "pre_match_words": ["postbus"]
+      }
+    },
+    "hospital": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "institutions",
+      "args": {
+        "lookup_values": "hospital",
+        "overlapping": true,
+        "tag": "ziekenhuis"
+      }
+    },
+    "institution": {
+      "annotator_type": "docdeid.process.MultiTokenLookupAnnotator",
+      "group": "institutions",
+      "args": {
+        "lookup_values": "healthcare_institution",
+        "overlapping": true,
+        "tag": "zorginstelling"
+      }
+    },
+    "date_dmy_1": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "dates",
+      "args": {
+        "regexp_pattern": "(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)((19|20|\\'|`)?\\d{2}))(?!\\d)",
+        "tag": "datum",
+        "capturing_group": 1
+      }
+    },
+    "date_dmy_2": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "dates",
+      "args": {
+        "regexp_pattern": "(?i)(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]((19|20|\\'|`)?\\d{2}))(?!\\d)",
+        "tag": "datum",
+        "capturing_group": 1,
+        "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"]
+      }
+    },
+    "date_ymd_1": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "dates",
+      "args": {
+        "regexp_pattern": "(?<!\\d)(((19|20|\\'|`)\\d{2})(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)([1-9]|0[1-9]|[12][0-9]|3[01]))(?!\\d)",
+        "tag": "datum",
+        "capturing_group": 1
+      }
+    },
+    "date_ymd_2": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "dates",
+      "args": {
+        "regexp_pattern": "(?i)(?<!\\d)(((19|20|\\'|`)\\d{2})[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]([1-9]|0[1-9]|[12][0-9]|3[01]))(?!\\d)",
+        "tag": "datum",
+        "capturing_group": 1,
+        "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"]
+      }
+    },
+    "age": {
+      "annotator_type": "deduce.annotator.RegexpPseudoAnnotator",
+      "group": "ages",
+      "args": {
+        "regexp_pattern": "(?i)(?<![\\d,\\.])((1?\\d?\\d)([\\.,]5)?(-(1?\\d?\\d)([\\.,]5)?)?)([ -](jaar|jarig|jarige|jr))(?!\\w)",
+        "pre_pseudo": ["<", "al", "co", "controle", "de", "elke", "gedurende", "na", "nog", "ongeveer", "over", "policontrole", "sinds", "up", "vanaf"],
+        "post_pseudo": ["aanwezig", "gebruikt", "geleden", "gerookt", "gestaakt", "gestopt", "getrouwd", "na", "naar", "nadien"],
+        "pre_match_words": ["jaar", "jarig", "jarige", "jr"],
+        "tag": "leeftijd",
+        "capturing_group": 1
+      }
+    },
+    "bsn": {
+      "annotator_type": "deduce.annotator.BsnAnnotator",
+      "group": "identifiers",
+      "args": {
+        "bsn_regexp": "(?<!\\d)(\\d{9})(?!\\d)",
+        "capture_group": 1,
+        "priority": 100,
+        "tag": "bsn"
+      }
+    },
+    "identifier": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "identifiers",
+      "args": {
+        "regexp_pattern": "\\d{7,}",
+        "tag": "id"
+      }
+    },
+    "phone": {
+      "annotator_type": "deduce.annotator.PhoneNumberAnnotator",
+      "group": "phone_numbers",
+      "args": {
+        "phone_regexp": "(?<!\\d)(\\(?(0031|\\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\\d{2})\\)?) ?-? ?((\\d{2,4}[ -]?)+\\d{2,4})",
+        "min_digits": 9,
+        "max_digits": 11,
+        "tag": "telefoonnummer"
+      }
+    },
+    "email": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "email_addresses",
+      "args": {
+        "regexp_pattern": "(([-a-zA-Z0-9:%._\\+~#=]{1,256})@([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu))",
+        "tag": "emailadres",
+        "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"]
+      }
+    },
+    "url": {
+      "annotator_type": "docdeid.process.RegexpAnnotator",
+      "group": "urls",
+      "args": {
+        "regexp_pattern": "((https?:\\/\\/(?:www\\.)?)?([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu)(\\b)([():%_\\+.~,]*[-a-zA-Z0-9#?&/=]+)*)",
+        "tag": "url",
+        "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"]
+      }
+    }
+  }
+}
\ No newline at end of file