--- a +++ b/base_config.json @@ -0,0 +1,534 @@ +{ + "adjacent_annotations_slack": "[\\. \\-]?[\\. ]?", + "resolve_overlap_strategy": { + "attributes": [ + "priority", + "length" + ], + "ascending": [ + false, + false + ] + }, + "redactor_open_char": "[", + "redactor_close_char": "]", + "annotators": { + "prefix_with_initial": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "prefix+initiaal", + "skip": ["."], + "pattern": [ + { + "lookup": "prefix" + }, + { + "or": [ + { + "lookup": "initial" + }, + { + "is_initials": true + } + ] + } + ] + } + }, + "prefix_with_interfix": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "prefix+interfix+naam", + "skip": ["."], + "pattern": [ + { + "lookup": "prefix" + }, + { + "lookup": "interfix" + }, + { + "like_name": true + } + ] + } + }, + "prefix_with_name": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "prefix+naam", + "skip": ["."], + "pattern": [ + { + "lookup": "prefix" + }, + { + "and": [ + { + "like_name": true + }, + { + "neg_lookup": "whitelist" + } + ] + } + ] + } + }, + "interfix_with_name": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "interfix+achternaam", + "skip": [], + "pattern": [ + { + "lookup": "interfix" + }, + { + "and": [ + { + "lookup": "interfix_surname" + }, + { + "neg_lookup": "whitelist" + } + ] + } + ] + } + }, + "initial_with_name": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "initiaal+naam", + "skip": ["."], + "pattern": [ + { + "lookup": "initial" + }, + { + "and": [ + { + "like_name": true + }, + { + "neg_lookup": "whitelist" + }, + { + "neg_lookup": "prefix" + } + ] + } + ] + } + }, + "initial_interfix": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "names", + "args": { + "tag": "initiaal+interfix+naam", + "skip": ["."], + "pattern": [ + { + "lookup": "initial" + }, + { + "lookup": "interfix" + }, + { + "like_name": true + } + ] + } + }, + "first_name_lookup": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "names", + "args": { + "tag": "voornaam", + "overlapping": true, + "lookup_values": "first_name" + } + }, + "surname_lookup": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "names", + "args": { + "tag": "achternaam", + "overlapping": true, + "lookup_values": "surname" + } + }, + "patient_name": { + "annotator_type": "deduce.annotator.PatientNameAnnotator", + "group": "names", + "args": { + "tag": "_" + } + }, + "name_context": { + "annotator_type": "deduce.annotator.ContextAnnotator", + "group": "names", + "args": { + "iterative": true, + "pattern": [ + { + "name": "interfix_right", + "direction": "right", + "pre_tag": [ + "initiaal", + "naam", + "voornaam", + "achternaam", + "voornaam_patient", + "achternaam_patient" + ], + "tag": "{tag}+interfix+achternaam", + "skip": [".", "-"], + "pattern": [ + { + "lookup": "interfix" + }, + { + "like_name": true + } + ] + }, + { + "name": "initial_left", + "direction": "left", + "pre_tag": [ + "initiaal", + "naam", + "voornaam", + "achternaam", + "voornaam_patient", + "achternaam_patient", + "interfix" + ], + "tag": "initiaal+{tag}", + "skip": ["."], + "pattern": [ + { + "lookup": "initial" + } + ] + }, + { + "name": "naam_left", + "direction": "left", + "pre_tag": [ + "naam", + "voornaam", + "achternaam", + "voornaam_patient", + "achternaam_patient" + ], + "tag": "naam+{tag}", + "skip": ["-"], + "pattern": [ + { + "and": [ + { + "like_name": true + }, + { + "neg_lookup": "whitelist" + }, + { + "neg_lookup": "prefix" + } + ] + } + ] + }, + { + "name": "naam_right", + "direction": "right", + "pre_tag": [ + "prefix", + "initiaal", + "naam", + "voornaam", + "achternaam", + "voornaam_patient", + "achternaam_patient", + "interfix" + ], + "tag": "{tag}+naam", + "skip": ["-"], + "pattern": [ + { + "and": [ + { + "like_name": true + }, + { + "neg_lookup": "whitelist" + }, + { + "neg_lookup": "prefix" + } + ] + } + ] + }, + { + "name": "prefix_left", + "direction": "left", + "pre_tag": [ + "prefix", + "initiaal", + "naam", + "voornaam", + "achternaam", + "voornaam_patient", + "achternaam_patient", + "interfix" + ], + "tag": "prefix+{tag}", + "skip": ["."], + "pattern": [ + { + "and": [ + { + "lookup": "prefix" + } + ] + } + ] + } + ] + } + }, + "eponymous_disease": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "names", + "args": { + "lookup_values": "eponymous_disease", + "tag": "pseudo_name", + "overlapping": true + } + }, + "placename": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "locations", + "args": { + "lookup_values": "placename", + "overlapping": true, + "tag": "locatie" + } + }, + "street_pattern": { + "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "group": "locations", + "args": { + "pattern": [ + { + "re_match": "[A-Z][a-z]+(baan|bolwerk|dam|dijk|dreef|drf|dyk|gr|gracht|hf|hof|kade|laan|ln|markt|mrkt|pad|park|pd|plantsoen|plein|pln|plnts|prk|singel|sngl|st|steeg|stg|str|straat|weg|wg)$" + } + ], + "tag": "straat", + "priority": 1 + } + }, + "street_lookup": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "locations", + "args": { + "lookup_values": "street", + "overlapping": true, + "tag": "straat", + "priority": 1 + } + }, + "housenumber": { + "annotator_type": "deduce.annotator.ContextAnnotator", + "group": "locations", + "args": { + "iterative": true, + "pattern": [ + { + "name": "housenumber_right", + "direction": "right", + "pre_tag": [ + "straat" + ], + "tag": "{tag}+huisnummer", + "skip": [], + "pattern": [ + { + "re_match": "\\d{1,4}$" + } + ] + }, + { + "name": "housenumber_housenumberletter_right", + "direction": "right", + "pre_tag": [ + "straat" + ], + "tag": "{tag}+huisnummer+huisnummerletter", + "skip": [], + "pattern": [ + { + "re_match": "\\d{1,4}[a-zA-Z]$" + } + ] + }, + { + "name": "housenumberletter_right", + "direction": "right", + "pre_tag": [ + "huisnummer" + ], + "tag": "{tag}+huisnummerletter", + "skip": [], + "pattern": [ + { + "re_match": "[a-zA-Z]$" + } + ] + } + ] + } + }, + "postal_code": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "locations", + "args": { + "regexp_pattern": "(\\d{4}([A-Za-z]{2}| [A-Z]{2}))(?<!mg|MG|gr|ie)(\\W|$)", + "capturing_group": 1, + "tag": "locatie" + } + }, + "postbus": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "locations", + "args": { + "regexp_pattern": "([Pp]ostbus\\s\\d{1,5}(\\.\\d{2,4})?)", + "tag": "locatie", + "pre_match_words": ["postbus"] + } + }, + "hospital": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "institutions", + "args": { + "lookup_values": "hospital", + "overlapping": true, + "tag": "ziekenhuis" + } + }, + "institution": { + "annotator_type": "docdeid.process.MultiTokenLookupAnnotator", + "group": "institutions", + "args": { + "lookup_values": "healthcare_institution", + "overlapping": true, + "tag": "zorginstelling" + } + }, + "date_dmy_1": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "dates", + "args": { + "regexp_pattern": "(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)((19|20|\\'|`)?\\d{2}))(?!\\d)", + "tag": "datum", + "capturing_group": 1 + } + }, + "date_dmy_2": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "dates", + "args": { + "regexp_pattern": "(?i)(?<!\\d)(([1-9]|0[1-9]|[12][0-9]|3[01])[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]((19|20|\\'|`)?\\d{2}))(?!\\d)", + "tag": "datum", + "capturing_group": 1, + "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"] + } + }, + "date_ymd_1": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "dates", + "args": { + "regexp_pattern": "(?<!\\d)(((19|20|\\'|`)\\d{2})(?P<sep>[-/\\. ])([1-9]|0[1-9]|1[012])(?P=sep)([1-9]|0[1-9]|[12][0-9]|3[01]))(\\D|$)", + "tag": "datum", + "capturing_group": 1 + } + }, + "date_ymd_2": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "dates", + "args": { + "regexp_pattern": "(?i)(?<!\\d)(((19|20|\\'|`)\\d{2})[-/\\. ]{,2}(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)[-/\\. ]([1-9]|0[1-9]|[12][0-9]|3[01]))(?!\\d)", + "tag": "datum", + "capturing_group": 1, + "pre_match_words": ["januari", "jan", "februari", "feb", "maart", "mrt", "april", "apr", "mei", "juni", "jun", "juli", "jul", "augustus", "aug", "september", "sep", "sept", "oktober", "okt", "november", "nov", "december", "dec"] + } + }, + "age": { + "annotator_type": "deduce.annotator.RegexpPseudoAnnotator", + "group": "ages", + "args": { + "regexp_pattern": "(?i)(?<![\\d,\\.])((1?\\d?\\d)([\\.,]5)?(-(1?\\d?\\d)([\\.,]5)?)?)([ -](jaar|jarig|jarige|jr))(?!\\w)", + "pre_pseudo": ["<", "al", "co", "controle", "de", "elke", "gedurende", "na", "nog", "ongeveer", "over", "policontrole", "sinds", "up", "vanaf"], + "post_pseudo": ["aanwezig", "gebruikt", "geleden", "gerookt", "gestaakt", "gestopt", "getrouwd", "na", "naar", "nadien"], + "pre_match_words": ["jaar", "jarig", "jarige", "jr"], + "tag": "leeftijd", + "capturing_group": 1 + } + }, + "bsn": { + "annotator_type": "deduce.annotator.BsnAnnotator", + "group": "identifiers", + "args": { + "bsn_regexp": "(?<!\\d)(\\d{9})(?!\\d)", + "capture_group": 1, + "priority": 100, + "tag": "bsn" + } + }, + "identifier": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "identifiers", + "args": { + "regexp_pattern": "\\d{7,}", + "tag": "id" + } + }, + "phone": { + "annotator_type": "deduce.annotator.PhoneNumberAnnotator", + "group": "phone_numbers", + "args": { + "phone_regexp": "(?<!\\d)(\\(?(0031|\\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\\d{2})\\)?) ?-? ?((\\d{2,4}[ -]?)+\\d{2,4})", + "min_digits": 9, + "max_digits": 11, + "tag": "telefoonnummer" + } + }, + "email": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "email_addresses", + "args": { + "regexp_pattern": "(([-a-zA-Z0-9:%._\\+~#=]{1,256})@([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu))", + "tag": "emailadres", + "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"] + } + }, + "url": { + "annotator_type": "docdeid.process.RegexpAnnotator", + "group": "urls", + "args": { + "regexp_pattern": "((https?:\\/\\/(?:www\\.)?)?([-a-zA-Z0-9:%._\\+~#=]{1,256})(\\.)(com|net|org|co|us|uk|nl|be|fr|sp|gov|nu)(\\b)([():%_\\+.~,]*[-a-zA-Z-0-9#?&/=]+)*)", + "tag": "url", + "pre_match_words": ["com", "net", "org", "co", "us", "uk", "nl", "be", "fr", "sp", "gov", "nu"] + } + } + } +} \ No newline at end of file