[1de6ed]: / biobert_re / data_processor.py

Download this file

149 lines (115 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
from enum import Enum
from typing import List, Optional, Union
import logging
from transformers import (DataProcessor,
InputExample,
InputFeatures,
PreTrainedTokenizer)
logger = logging.getLogger(__name__)
def glue_convert_examples_to_features(
examples: List[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
label_list=None,
output_mode=None,
):
"""
Loads a data file into a list of ``InputFeatures``
Args:
examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length. Defaults to the tokenizer's max_len
task: GLUE task
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
output_mode: String indicating the output mode, classification
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
return _glue_convert_examples_to_features(
examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
)
def _glue_convert_examples_to_features(
examples: List[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
label_list=None,
output_mode=None,
):
if max_length is None:
max_length = tokenizer.max_len
if task is not None:
processor = glue_processors[task]()
if label_list is None:
label_list = processor.get_labels()
logger.info("Using label list %s for task %s" % (label_list, task))
if output_mode is None:
output_mode = glue_output_modes[task]
logger.info("Using output mode %s for task %s" % (output_mode, task))
label_map = {label: i for i, label in enumerate(label_list)}
def label_from_example(example: InputExample) -> Union[int, float, None]:
if example.label is None:
return None
return label_map[example.label]
labels = [label_from_example(example) for example in examples]
batch_encoding = tokenizer(
[[example.text_a, example.text_b] for example in examples],
max_length=max_length,
padding="max_length",
truncation=True,
)
features = []
for i in range(len(examples)):
inputs = {k: batch_encoding[k][i] for k in batch_encoding}
feature = InputFeatures(**inputs, label=labels[i])
features.append(feature)
for i, example in enumerate(examples[:5]):
logger.info("*** Example ***")
logger.info("guid: %s" % example.guid)
logger.info("features: %s" % features[i])
return features
class OutputMode(Enum):
classification = "classification"
class EHRProcessor(DataProcessor):
"""Processor for EHR data."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence"].numpy().decode("utf-8"),
None,
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
examples = []
text_index = 1 if set_type == "test" else 0
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[text_index]
label = None if set_type == "test" else line[1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
glue_tasks_num_labels = {"ehr-re": 2}
glue_processors = {"ehr-re": EHRProcessor}
glue_output_modes = {"ehr-re": "classification"}