--- a +++ b/src/constants/n2c2.py @@ -0,0 +1,111 @@ +# coding: utf-8 + +# Base Dependencies +# ----------------- +import re +from pathlib import Path +from os.path import join as pjoin + + +N2C2_VOCAB_PATH = Path(pjoin("data", "n2c2", "vocab", "vocab.txt")) + +N2C2_PATH = Path("data/n2c2") +N2C2_ENTITY_TYPES = [ + "Drug", + "Strength", + "Duration", + "Route", + "Form", + "ADE", + "Dosage", + "Reason", + "Frequency", +] +N2C2_ATTR_TYPES = [ + "Strength", + "Duration", + "Route", + "Form", + "ADE", + "Dosage", + "Reason", + "Frequency", +] + +N2C2_REL_TYPES = [ + "Strength-Drug", + "Duration-Drug", + "Route-Drug", + "Form-Drug", + "ADE-Drug", + "Dosage-Drug", + "Reason-Drug", + "Frequency-Drug", +] + +N2C2_REL_TEST_WEIGHTS = [ + 10255 / 41086, + 568 / 41086, + 6784 / 41086, + 5382 / 41086, + 981 / 41086, + 3563 / 41086, + 4335 / 41086, + 9218 / 41086, +] + +N2C2_ATTR_ENTITY_CANDIDATES = { + "Strength": ["Drug"], + "Duration": ["Drug"], + "Route": ["Drug"], + "Form": ["Drug"], + "ADE": ["Drug"], + "Dosage": ["Drug"], + "Reason": ["Drug"], + "Frequency": ["Drug"], +} + +N2C2_SPLITS_DIR = "data/n2c2/splits" +N2C2_HF_TRAIN_PATH = "data/n2c2/train.hf" +N2C2_HF_TEST_PATH = "data/n2c2/test.hf" + + +N2C2_ANNONYM_PATTERNS = { + "hour": re.compile(r"\[\*\*\d+-\d+\*\*\]\s*PM"), + "date": re.compile( + r"(\[\*\*(Date|Month|Year)[^\*]*\*\*\])|(\[\*\*\d+-\d+-?\d*\*\*\])" + ), + "hospital": re.compile(r"(\[\*\*[^\*]*Hospital[^\*]*\*\*\])"), + "name": re.compile(r"(\[\*\*[^\*]*(Name|name)[^\*]*\*\*\])"), + "telephone": re.compile(r"(\[\*\*[^\*]*(Telephone|telephone)[^\*]*\*\*\])"), + "location": re.compile(r"(\[\*\*[^\*]*(Location|location|\d+-/\d+)[^\*]*\*\*\])"), + "address": re.compile(r"(\[\*\*[^\*]*(Address|address|Country|State)[^\*]*\*\*\])"), + "age": re.compile(r"(\[\*\*[^\*]*(Age)[^\*]*\*\*\])"), + "number": re.compile( + r"(\[\*\*[^\*]*(Number|Numeric Identifier|number)[^\*]*\*\*\])|(\[\*\*\d+\*\*\])" + ), +} + +N2C2_IOB_TAGS = [ + "O", + "B-Drug", + "I-Drug", + "B-Strength", + "I-Strength", + "B-Duration", + "I-Duration", + "B-Route", + "I-Route", + "B-Form", + "I-Form", + "B-ADE", + "I-ADE", + "B-Dosage", + "I-Dosage", + "B-Reason", + "I-Reason", + "B-Frequency", + "I-Frequency", +] + +N2C2_RD_MAX = 30