generate_data.py
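"""
Generates input data files for the BioBERT NER and RE pipelines from a
directory of EHR annotation files (txt/ann) and, optionally, the ADE corpus.

Example invocation (illustrative; the directory layout is an assumption):
    python generate_data.py --task NER --input_dir data/ --target_dir dataset/ --tokenizer scispacy
"""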
import argparse

from utils import read_data, save_pickle, read_ade_data
from biobert_ner.utils_ner import generate_input_files
from biobert_re.utils_re import generate_re_input_files
from typing import List, Iterator, Dict
import warnings
import os
import re

labels = ['B-DRUG', 'I-DRUG', 'B-STR', 'I-STR', 'B-DUR', 'I-DUR',
          'B-ROU', 'I-ROU', 'B-FOR', 'I-FOR', 'B-ADE', 'I-ADE',
          'B-DOS', 'I-DOS', 'B-REA', 'I-REA', 'B-FRE', 'I-FRE', 'O']

def parse_arguments():
    """Parses program arguments"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str,
                        help="Task to be completed. 'NER', 'RE'. Default is 'NER'.",
                        default="NER")

    parser.add_argument("--input_dir", type=str,
                        help="Directory with txt and ann files. Default is 'data/'.",
                        default="data/")

    parser.add_argument("--ade_dir", type=str,
                        help="Directory with ADE corpus. Default is None.",
                        default=None)

    parser.add_argument("--target_dir", type=str,
                        help="Directory to save files. Default is 'dataset/'.",
                        default='dataset/')

    parser.add_argument("--max_seq_len", type=int,
                        help="Maximum sequence length. Default is 512.",
                        default=512)

    parser.add_argument("--dev_split", type=float,
                        help="Ratio of dev data. Default is 0.1",
                        default=0.1)

    parser.add_argument("--tokenizer", type=str,
                        help="The tokenizer to use. 'scispacy', 'scispacy_plus', 'biobert-base', 'biobert-large', 'default'.",
                        default="scispacy")

    parser.add_argument("--ext", type=str,
                        help="Extension of target file. Default is txt.",
                        default="txt")

    parser.add_argument("--sep", type=str,
                        help="Token-label separator. Default is a space.",
                        default=" ")

    arguments = parser.parse_args()
    return arguments

def default_tokenizer(sequence: str) -> List[str]:
    """A tokenizer that splits the sequence on whitespace characters."""
    words = re.split("\n| |\t", sequence)
    tokens = []
    for word in words:
        word = word.strip()

        if not word:
            continue

        tokens.append(word)

    return tokens

def scispacy_plus_tokenizer(sequence: str, scispacy_tok=None) -> Iterator[str]:
    """
    Runs the scispacy tokenizer and removes all tokens that are
    just whitespace characters.
    """
    if scispacy_tok is None:
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer

    scispacy_tokens = [str(token) for token in scispacy_tok(sequence)]
    tokens = filter(lambda t: not (' ' in t or '\n' in t or '\t' in t), scispacy_tokens)

    return tokens

def ner_generator(files: Dict[str, tuple], args) -> None:
    """Generates input files for NER."""
    # Generate train, dev, test files
    for filename, data in files.items():
        generate_input_files(ehr_records=data[0], ade_records=data[1],
                             filename=args.target_dir + filename + '.' + args.ext,
                             max_len=args.max_seq_len, sep=args.sep)
        save_pickle(args.target_dir + filename, {"EHR": data[0], "ADE": data[1]})

    # Generate labels file
    with open(args.target_dir + 'labels.txt', 'w') as file:
        output_labels = map(lambda x: x + '\n', labels)
        file.writelines(output_labels)

    filenames = [filename + suffix
                 for filename in files
                 for suffix in ('.' + args.ext, '.pkl')]

    print("\nFile generation successful. Files generated: ",
          ', '.join(filenames), ', labels.txt', sep='')

def re_generator(files: Dict[str, tuple], args) -> None:
    """Generates input files for RE."""
    for filename, data in files.items():
        generate_re_input_files(ehr_records=data[0], ade_records=data[1],
                                filename=args.target_dir + filename + '.' + args.ext,
                                max_len=args.max_seq_len, sep=args.sep,
                                is_test=data[2], is_label=data[3])

    save_pickle(args.target_dir + 'train', {"EHR": files['train'][0], "ADE": files['train'][1]})
    save_pickle(args.target_dir + 'test', {"EHR": files['test'][0], "ADE": files['test'][1]})

    print("\nFile generation successful. Files generated: "
          "train.tsv, dev.tsv, test.tsv, test_labels.tsv, "
          "train_rel.pkl, test_rel.pkl, test_labels_rel.pkl")

def main():
    args = parse_arguments()

    if args.target_dir[-1] != '/':
        args.target_dir += '/'

    if args.sep == "tab":
        args.sep = "\t"

    if not os.path.isdir(args.target_dir):
        os.mkdir(args.target_dir)

    if args.tokenizer == "default":
        tokenizer = default_tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == "scispacy":
        import en_ner_bc5cdr_md
        tokenizer = en_ner_bc5cdr_md.load().tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == 'scispacy_plus':
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
        scispacy_plus_tokenizer.__defaults__ = (scispacy_tok,)

        tokenizer = scispacy_plus_tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == 'biobert-large':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-large-cased-v1.1")

        # Reserve space for the special tokens the BERT tokenizer adds
        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize
        is_bert_tokenizer = True

    elif args.tokenizer == 'biobert-base':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-base-cased-v1.1")

        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize
        is_bert_tokenizer = True

    else:
        warnings.warn("Tokenizer named " + args.tokenizer + " not found. "
                      "Using default tokenizer instead. Acceptable values "
                      "include 'scispacy', 'scispacy_plus', 'biobert-base', "
                      "'biobert-large', and 'default'.")
        tokenizer = default_tokenizer
        is_bert_tokenizer = False

    print("\nReading data\n")
    train_dev, test = read_data(data_dir=args.input_dir,
                                tokenizer=tokenizer,
                                is_bert_tokenizer=is_bert_tokenizer,
                                verbose=1)

    if args.ade_dir is not None:
        ade_train_dev = read_ade_data(ade_data_dir=args.ade_dir, verbose=1)

        ade_dev_split_idx = int((1 - args.dev_split) * len(ade_train_dev))
        ade_train = ade_train_dev[:ade_dev_split_idx]
        ade_devel = ade_train_dev[ade_dev_split_idx:]

    else:
        ade_train_dev = None
        ade_train = None
        ade_devel = None

    print('\n')

    # Data is already shuffled, just split for dev set
    dev_split_idx = int((1 - args.dev_split) * len(train_dev))
    train = train_dev[:dev_split_idx]
    devel = train_dev[dev_split_idx:]

    # Data for NER
    if args.task.lower() == 'ner':
        files = {'train': (train, ade_train), 'train_dev': (train_dev, ade_train_dev),
                 'devel': (devel, ade_devel), 'test': (test, None)}

        ner_generator(files, args)

    # Data for RE
    elif args.task.lower() == 're':
        # {dataset_name: (ehr_data, ade_data, is_test, is_label)}
        files = {'train': (train, ade_train, False, True), 'dev': (devel, ade_devel, False, True),
                 'test': (test, None, True, False), 'test_labels': (test, None, True, True)}

        re_generator(files, args)


if __name__ == '__main__':
    main()