Diff of /generate_data.py [000000] .. [1de6ed]

import argparse

from utils import read_data, save_pickle, read_ade_data
from biobert_ner.utils_ner import generate_input_files
from biobert_re.utils_re import generate_re_input_files
from typing import List, Iterator, Dict
import warnings
import os
import re

labels = ['B-DRUG', 'I-DRUG', 'B-STR', 'I-STR', 'B-DUR', 'I-DUR',
          'B-ROU', 'I-ROU', 'B-FOR', 'I-FOR', 'B-ADE', 'I-ADE',
          'B-DOS', 'I-DOS', 'B-REA', 'I-REA', 'B-FRE', 'I-FRE', 'O']
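# BIO tagging scheme: each entity type has a B- (beginning) and an I- (inside) tag;
# 'O' marks tokens outside any entity.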


def parse_arguments():
    """Parses program arguments"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str,
                        help="Task to be completed. 'NER', 'RE'. Default is 'NER'.",
                        default="NER")

    parser.add_argument("--input_dir", type=str,
                        help="Directory with txt and ann files. Default is 'data/'.",
                        default="data/")

    parser.add_argument("--ade_dir", type=str,
                        help="Directory with ADE corpus. Default is None.",
                        default=None)

    parser.add_argument("--target_dir", type=str,
                        help="Directory to save files. Default is 'dataset/'.",
                        default='dataset/')

    parser.add_argument("--max_seq_len", type=int,
                        help="Maximum sequence length. Default is 512.",
                        default=512)

    parser.add_argument("--dev_split", type=float,
                        help="Ratio of dev data. Default is 0.1.",
                        default=0.1)

    parser.add_argument("--tokenizer", type=str,
                        help="The tokenizer to use. 'scispacy', 'scispacy_plus', 'biobert-base', 'biobert-large', 'default'.",
                        default="scispacy")

    parser.add_argument("--ext", type=str,
                        help="Extension of target file. Default is txt.",
                        default="txt")

    parser.add_argument("--sep", type=str,
                        help="Token-label separator. Default is a space.",
                        default=" ")

    arguments = parser.parse_args()
    return arguments
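
# Example invocation (paths are illustrative):
#   python generate_data.py --task NER --input_dir data/ --ade_dir ade_corpus/ --target_dir dataset/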


def default_tokenizer(sequence: str) -> List[str]:
    """A tokenizer that splits the sequence on whitespace (spaces, tabs and newlines)."""
    words = re.split("\n| |\t", sequence)
    tokens = []
    for word in words:
        word = word.strip()

        if not word:
            continue

        tokens.append(word)

    return tokens
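
# e.g. default_tokenizer("Aspirin  81 mg\tdaily") -> ['Aspirin', '81', 'mg', 'daily']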


def scispacy_plus_tokenizer(sequence: str, scispacy_tok=None) -> Iterator[str]:
    """
    Runs the scispacy tokenizer and removes all tokens that
    consist only of whitespace characters.
    """
    if scispacy_tok is None:
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer

    scispacy_tokens = list(map(lambda x: str(x), scispacy_tok(sequence)))
    tokens = filter(lambda t: not (' ' in t or '\n' in t or '\t' in t), scispacy_tokens)

    return tokens
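
# Example (assumes the en_ner_bc5cdr_md scispaCy model is installed):
#   tok = en_ner_bc5cdr_md.load().tokenizer
#   tokens = list(scispacy_plus_tokenizer("Aspirin 81 mg daily", tok))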


def ner_generator(files: Dict[str, tuple], args) -> None:
    """Generates files for NER"""
    # Generate train, dev, test files
    for filename, data in files.items():
        generate_input_files(ehr_records=data[0], ade_records=data[1],
                             filename=args.target_dir + filename + '.' + args.ext,
                             max_len=args.max_seq_len, sep=args.sep)
        save_pickle(args.target_dir + filename, {"EHR": data[0], "ADE": data[1]})

    # Generate labels file
    with open(args.target_dir + 'labels.txt', 'w') as file:
        output_labels = map(lambda x: x + '\n', labels)
        file.writelines(output_labels)

    # Collect the names of all generated files (<split>.<ext> and <split>.pkl) for the summary message
    filenames = [name for pair in map(
        lambda x: [x + '.' + args.ext, x + '.pkl'],
        list(files.keys()))
                 for name in pair]

    print("\nGenerating files successful. Files generated: ",
          ', '.join(filenames), ', labels.txt', sep='')
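
# Each split yields <name>.<ext> (NER input) plus <name>.pkl with the underlying records,
# along with a single labels.txt listing the label set.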


def re_generator(files: Dict[str, tuple], args):
    """Generates files for RE"""
    for filename, data in files.items():
        generate_re_input_files(ehr_records=data[0], ade_records=data[1],
                                filename=args.target_dir + filename + '.' + args.ext,
                                max_len=args.max_seq_len, sep=args.sep,
                                is_test=data[2], is_label=data[3])

    save_pickle(args.target_dir + 'train', {"EHR": files['train'][0], "ADE": files['train'][1]})
    save_pickle(args.target_dir + 'test', {"EHR": files['test'][0], "ADE": files['test'][1]})

    print("\nGenerating files successful. Files generated: ",
          'train.tsv,', 'dev.tsv,', 'test.tsv,',
          'test_labels.tsv,', 'train_rel.pkl,', 'test_rel.pkl,', 'test_labels_rel.pkl', sep=' ')
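
# Note: the file names printed above assume the RE task is run with --ext tsv.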


def main():
    args = parse_arguments()

    if args.target_dir[-1] != '/':
        args.target_dir += '/'

    # "tab" is accepted as a convenience alias for a literal tab character
    if args.sep == "tab":
        args.sep = "\t"

    if not os.path.isdir(args.target_dir):
        os.mkdir(args.target_dir)

    if args.tokenizer == "default":
        tokenizer = default_tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == "scispacy":
        import en_ner_bc5cdr_md
        tokenizer = en_ner_bc5cdr_md.load().tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == 'scispacy_plus':
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
        # Bind the loaded scispaCy tokenizer as the default argument so it is not reloaded on every call
        scispacy_plus_tokenizer.__defaults__ = (scispacy_tok,)

        tokenizer = scispacy_plus_tokenizer
        is_bert_tokenizer = False

    elif args.tokenizer == 'biobert-large':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-large-cased-v1.1")

        # Reserve room for the special tokens (e.g. [CLS] and [SEP]) that BERT adds to each sequence
        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize
        is_bert_tokenizer = True

    elif args.tokenizer == 'biobert-base':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-base-cased-v1.1")

        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize
        is_bert_tokenizer = True

    else:
        warnings.warn("Tokenizer named " + args.tokenizer + " not found. "
                      "Using default tokenizer instead. Acceptable values "
                      "include 'scispacy', 'scispacy_plus', 'biobert-base', "
                      "'biobert-large', and 'default'.")
        tokenizer = default_tokenizer
        is_bert_tokenizer = False

    print("\nReading data\n")
    train_dev, test = read_data(data_dir=args.input_dir,
                                tokenizer=tokenizer,
                                is_bert_tokenizer=is_bert_tokenizer,
                                verbose=1)

    if args.ade_dir is not None:
        ade_train_dev = read_ade_data(ade_data_dir=args.ade_dir, verbose=1)

        ade_dev_split_idx = int((1 - args.dev_split) * len(ade_train_dev))
        ade_train = ade_train_dev[:ade_dev_split_idx]
        ade_devel = ade_train_dev[ade_dev_split_idx:]

    else:
        ade_train_dev = None
        ade_train = None
        ade_devel = None

    print('\n')

    # Data is already shuffled, just split for dev set
    dev_split_idx = int((1 - args.dev_split) * len(train_dev))
    train = train_dev[:dev_split_idx]
    devel = train_dev[dev_split_idx:]
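
    # e.g. with 100 train_dev records and the default dev_split of 0.1,
    # the first 90 records become the train set and the last 10 the dev set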

    # Data for NER
    if args.task.lower() == 'ner':
        # {dataset_name: (ehr_data, ade_data)}
        files = {'train': (train, ade_train), 'train_dev': (train_dev, ade_train_dev),
                 'devel': (devel, ade_devel), 'test': (test, None)}

        ner_generator(files, args)

    # Data for RE
    elif args.task.lower() == 're':
        # {dataset_name: (ehr_data, ade_data, is_test, is_label)}
        files = {'train': (train, ade_train, False, True), 'dev': (devel, ade_devel, False, True),
                 'test': (test, None, True, False), 'test_labels': (test, None, True, True)}

        re_generator(files, args)


if __name__ == '__main__':
    main()
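
# Example RE invocation (paths and options are illustrative):
#   python generate_data.py --task RE --input_dir data/ --target_dir dataset/ --ext tsv --sep tab --tokenizer biobert-base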