[d129b2]: / medicalbert / datareader / chunked_data_reader.py

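"""Chunked data reader for BERT-style sequence classification.

Reads a CSV of labelled documents, tokenizes each text, splits the token
stream into sections of at most ``max_sequence_length - 2`` tokens, resizes
the list of sections to ``num_sections``, and packs the results into a
``torch.utils.data.TensorDataset``.
"""
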
import logging
import os
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from datareader.FeatureSetBuilder import FeatureSetBuilder
from datareader.abstract_data_reader import AbstractDataReader, InputExample


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class ChunkedDataReader(AbstractDataReader):

    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
        self.max_sequence_length = config['max_sequence_length']
        self.config = config
        self.train = None
        self.valid = None
        self.test = None
        self.num_sections = config['num_sections']

    @staticmethod
    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def _convert_rows_to_list_of_feature(self, df):
        input_features = []
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            text = row['text']
            lbl = row[self.config['target']]
            # The real label is passed separately below; the InputExample only
            # carries the text of the document.
            input_example = InputExample(None, text, None, self.config['target'])
            feature = self.convert_example_to_feature(input_example, lbl)
            input_features.append(feature)
        return input_features

    def build_fresh_dataset(self, dataset):
        df = pd.read_csv(os.path.join(self.config['data_dir'], dataset))
        return self.build_fresh_dataset_from_df(df)

    def build_fresh_dataset_from_df(self, df):
        logging.info("Building fresh dataset...")
        features = self._convert_rows_to_list_of_feature(df)
        # Stack the per-example features into parallel tensors.
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    def convert_example_to_feature(self, example, label):
        # Create a new feature set builder for this example.
        inputFeatureBuilder = FeatureSetBuilder(label)
        # Tokenize the text into a list of tokens.
        tokens_a = self.tokenizer.tokenize(example.text_a)
        # Chunk the token list into sections that fit the model input size.
        generator = self.chunks(tokens_a, self.max_sequence_length - 2)
        for section in generator:
            # Convert each section to a feature of its own.
            section_feature = self.convert_section_to_feature(section, label)
            inputFeatureBuilder.add(section_feature)
        # Resize the list of sections to exactly num_sections, supplying a
        # filler section for any padding that is needed.
        inputFeatureBuilder.resize(self.num_sections, self.convert_section_to_feature([0], label))
        assert len(inputFeatureBuilder.get()) == self.num_sections
        # Flatten the builder's per-section features into parallel lists.
        input_ids = [feature.input_ids for feature in inputFeatureBuilder.features]
        input_masks = [feature.input_mask for feature in inputFeatureBuilder.features]
        segment_ids = [feature.segment_ids for feature in inputFeatureBuilder.features]
        # Wrap them in a single InputFeatures covering all sections.
        return InputFeatures(input_ids, input_masks, segment_ids, label)

    def convert_section_to_feature(self, tokens_a, label):
        # Truncate the section if needed, keeping the trailing tokens.
        if len(tokens_a) > (self.max_sequence_length - 2):
            tokens_a = tokens_a[-(self.max_sequence_length - 2):]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_sequence_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_sequence_length
        assert len(input_mask) == self.max_sequence_length
        assert len(segment_ids) == self.max_sequence_length

        return InputFeatures(input_ids, input_mask, segment_ids, label)
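

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of this module's original
# entry point). It assumes a Hugging Face BertTokenizer; the model name, CSV
# file name, and config values below are hypothetical stand-ins chosen to
# match the keys this class reads.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import BertTokenizer  # assumed tokenizer implementation

    config = {
        'data_dir': './data',       # hypothetical directory containing the CSV
        'target': 'label',          # hypothetical label column in the CSV
        'max_sequence_length': 128,
        'num_sections': 4,
    }
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    reader = ChunkedDataReader(config, tokenizer)
    # "train.csv" is a hypothetical file with a 'text' column and the target column.
    dataset = reader.build_fresh_dataset("train.csv")
    print(f"Built {len(dataset)} examples of {config['num_sections']} sections each")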