# coding: utf-8
# Source path: src/constants/n2c2.py

# Base Dependencies
# -----------------
import re
from os.path import join as pjoin
from pathlib import Path


# Relative path of the n2c2 `vocab.txt` file (data/n2c2/vocab/vocab.txt),
# built with os.path.join and wrapped in a Path object.
N2C2_VOCAB_PATH = Path(pjoin("data", "n2c2", "vocab", "vocab.txt"))

# Relative path of the n2c2 data directory, as a Path object.
N2C2_PATH = Path("data/n2c2")
# Entity type labels: "Drug" followed by the eight labels that also appear in
# N2C2_ATTR_TYPES, in the same order.
N2C2_ENTITY_TYPES = [
    "Drug",
    "Strength",
    "Duration",
    "Route",
    "Form",
    "ADE",
    "Dosage",
    "Reason",
    "Frequency",
]
# Attribute type labels: the entity types without "Drug". Each label here is
# also the left-hand side of one "<label>-Drug" entry in N2C2_REL_TYPES.
N2C2_ATTR_TYPES = [
    "Strength",
    "Duration",
    "Route",
    "Form",
    "ADE",
    "Dosage",
    "Reason",
    "Frequency",
]

# Relation type labels, one "<attribute>-Drug" string per attribute label,
# listed in the same order as N2C2_ATTR_TYPES.
N2C2_REL_TYPES = [
    "Strength-Drug",
    "Duration-Drug",
    "Route-Drug",
    "Form-Drug",
    "ADE-Drug",
    "Dosage-Drug",
    "Reason-Drug",
    "Frequency-Drug",
]

# Relation-type weights: each entry is a count divided by 41086, which is the
# sum of the eight numerators, so the weights add up to 1.
# NOTE(review): presumably test-set relation counts listed in the same order
# as N2C2_REL_TYPES -- confirm against the code that consumes these weights.
N2C2_REL_TEST_WEIGHTS = [
    10255 / 41086,
    568 / 41086,
    6784 / 41086,
    5382 / 41086,
    981 / 41086,
    3563 / 41086,
    4335 / 41086,
    9218 / 41086,
]

# For each attribute label, the list of entity labels it may be paired with.
# Every attribute currently maps to its own single-element list ["Drug"]; the
# lists are separate objects, so mutating one does not affect the others.
N2C2_ATTR_ENTITY_CANDIDATES = {
    "Strength": ["Drug"],
    "Duration": ["Drug"],
    "Route": ["Drug"],
    "Form": ["Drug"],
    "ADE": ["Drug"],
    "Dosage": ["Drug"],
    "Reason": ["Drug"],
    "Frequency": ["Drug"],
}

# Locations under data/n2c2, kept as plain strings (not Path objects).
# NOTE(review): the ".hf" suffix presumably marks datasets saved in Hugging
# Face format -- confirm against the code that reads these paths.
N2C2_SPLITS_DIR = "data/n2c2/splits"
N2C2_HF_TRAIN_PATH = "data/n2c2/train.hf"
N2C2_HF_TEST_PATH = "data/n2c2/test.hf"


# Compiled regular expressions keyed by placeholder category. Every pattern
# matches a bracketed token of the form "[**...**]" (presumably the
# de-identification placeholders found in the clinical notes -- confirm).
#
# Several categories can match the same token (e.g. a name placeholder that
# also contains the word "Number"), so callers that apply these patterns in
# sequence may depend on the insertion order below; it is left unchanged.
#
# NOTE(review): "hour" only matches when the token is followed by "PM", and
# "location" contains the alternative `\d+-/\d+`; both look unusual but are
# reproduced exactly -- confirm against the data before changing them.
N2C2_ANNONYM_PATTERNS = {
    # "[**<digits>-<digits>**]" followed by optional whitespace and "PM".
    "hour": re.compile(r"\[\*\*\d+-\d+\*\*\]\s*PM"),
    # Tokens starting with Date/Month/Year, or purely numeric
    # "<digits>-<digits>" with an optional "-<digits>" tail.
    "date": re.compile(
        r"(\[\*\*(Date|Month|Year)[^\*]*\*\*\])|(\[\*\*\d+-\d+-?\d*\*\*\])"
    ),
    "hospital": re.compile(r"(\[\*\*[^\*]*Hospital[^\*]*\*\*\])"),
    "name": re.compile(r"(\[\*\*[^\*]*(Name|name)[^\*]*\*\*\])"),
    "telephone": re.compile(r"(\[\*\*[^\*]*(Telephone|telephone)[^\*]*\*\*\])"),
    "location": re.compile(r"(\[\*\*[^\*]*(Location|location|\d+-/\d+)[^\*]*\*\*\])"),
    "address": re.compile(r"(\[\*\*[^\*]*(Address|address|Country|State)[^\*]*\*\*\])"),
    "age": re.compile(r"(\[\*\*[^\*]*(Age)[^\*]*\*\*\])"),
    # Tokens mentioning a number/identifier, or consisting only of digits.
    "number": re.compile(
        r"(\[\*\*[^\*]*(Number|Numeric Identifier|number)[^\*]*\*\*\])|(\[\*\*\d+\*\*\])"
    ),
}

# Tag set for IOB sequence labelling: "O" first, then a "B-<type>" / "I-<type>"
# pair for every label in N2C2_ENTITY_TYPES, in that list's order.
# NOTE(review): list positions are presumably used as label ids elsewhere, so
# the order must not change -- confirm before reordering.
N2C2_IOB_TAGS = [
    "O",
    "B-Drug",
    "I-Drug",
    "B-Strength",
    "I-Strength",
    "B-Duration",
    "I-Duration",
    "B-Route",
    "I-Route",
    "B-Form",
    "I-Form",
    "B-ADE",
    "I-ADE",
    "B-Dosage",
    "I-Dosage",
    "B-Reason",
    "I-Reason",
    "B-Frequency",
    "I-Frequency",
]

# Integer upper bound of 30.
# NOTE(review): what "RD" stands for is not visible in this file (possibly a
# maximum relative distance between entities) -- confirm with the callers.
N2C2_RD_MAX = 30