"""
Vectorizer for medaCy PyTorch classes.
"""
import re
import string
import torch
import unicodedata
from gensim.models import KeyedVectors
class Vectorizer:
"""Vectorizer for medaCy PyTorch data. Contains encoding methods and tracking encoding values.
:ivar device: PyTorch device to use.
:ivar word_vectors: Gensim Word2VecKeyedVectors for word embeddings.
:ivar untrained_tokens: Out of vocabulary tokens.
:ivar other_features: Features other than word embeddings or ids.
:ivar window_size: Number of tokens to include on either side of current token.
:ivar tag_to_index: Dictionary of label to id mappings.
:ivar character_to_index: Dictionary of character to id mappings.
"""
def __init__(self, device):
"""Initialize Vectorizer.
:param device: PyTorch device to use.
"""
self.device = device
self.word_vectors = None
self.untrained_tokens = set()
self.other_features = {}
self.window_size = 0
self.tag_to_index = {}
self.character_to_index = {
character: index for index, character in enumerate(string.printable, 1)
}
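
    # Note: character ids start at 1 (enumerate(..., 1) above) so that index 0 is
    # free to act as the padding id; vectorize_tokens falls back to 0 when a token
    # has no printable characters.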

    def load_word_embeddings(self, embeddings_file):
        """Use gensim to load the given word embeddings file into memory.

        :param embeddings_file: Word embeddings file to use. Can be .bin or other common formats.
        """
        is_binary = embeddings_file.endswith('.bin')
        word_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=is_binary)
        self.word_vectors = word_vectors
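
    # Illustrative call (hypothetical file name): a path ending in '.bin' is read
    # as binary word2vec, anything else as the text format:
    #   vectorizer.load_word_embeddings('wikipedia_word2vec.bin')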

    def create_tag_dictionary(self, tags):
        """Set up self.tag_to_index.

        :param tags: List of lists of tag names. Usually all true labels for a dataset.
        """
        tag_to_index = {}
        for sequence in tags:
            for tag in sequence:
                if tag not in tag_to_index:
                    tag_to_index[tag] = len(tag_to_index)
        self.tag_to_index = tag_to_index
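
    # For example (illustrative tags, not from the original source):
    # create_tag_dictionary([['O', 'B-Drug'], ['I-Drug', 'O']]) sets
    # self.tag_to_index to {'O': 0, 'B-Drug': 1, 'I-Drug': 2}, assigning ids in
    # first-seen order.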

    def add_tag(self, tag):
        """Add a tag to self.tag_to_index.

        :param tag: Tag to add.
        """
        self.tag_to_index[tag] = len(self.tag_to_index)

    def create_feature_dictionary(self, feature_name, sentences):
        """Get a dictionary that maps all possible values of a specific feature to ids.

        :param feature_name: Name of the feature.
        :param sentences: Sentences to get the feature from.
        :return: Dictionary for the given feature.
        """
        feature_to_index = {}
        feature_name = '0:' + feature_name
        for sentence in sentences:
            for token in sentence:
                feature = token[feature_name]
                if feature not in feature_to_index:
                    feature_to_index[feature] = len(feature_to_index)
        return feature_to_index
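
    # For example, with sentences == [[{'0:pos': 'NOUN'}, {'0:pos': 'VERB'}]],
    # create_feature_dictionary('pos', sentences) returns {'NOUN': 0, 'VERB': 1}.
    # ('pos' is an illustrative feature name; any '0:'-prefixed token feature works.)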

    def find_other_features(self, example):
        """Get the names of the other word features being used.

        :param example: One set of features to search through for the names.
        """
        if '0:text' not in example:
            raise ValueError('BiLSTM-CRF requires the "0:text" spaCy feature.')

        # Find other feature names
        for key in example:
            if key.startswith('0:') and key != '0:text':
                feature = key[2:]
                self.other_features[feature] = {}

    def find_window_size(self, x_data):
        """Find and set the window size based on the input data. Only supports single-digit
        window sizes.

        :param x_data: Input data to use.
        """
        # Find the longest sequence and use the token in its center for analysis
        test_token = None
        longest_length = 0
        for sentence in x_data:
            if len(sentence) > longest_length:
                longest_length = len(sentence)
                test_token = sentence[int(longest_length / 2)]

        lowest = 0
        highest = 0

        # Loop through keys in the test token to find the highest and lowest window distances
        for key in test_token:
            if key[0] == '-':
                index = int(key[:2])
                if index < lowest:
                    lowest = index
            elif key[0].isnumeric():
                index = int(key[0])
                if index > highest:
                    highest = index

        assert -lowest == highest, 'Word feature window is asymmetrical'
        self.window_size = highest
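
    # For example, a test token whose keys include '-1:text', '0:text' and '1:text'
    # yields lowest == -1 and highest == 1, so self.window_size becomes 1.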

    def unicode_to_ascii(self, unicode_string):
        """Convert a unicode string to its closest ASCII equivalent. Based on code found at:
        https://stackoverflow.com/a/518232/2809427

        :param unicode_string: String to convert to ASCII.
        :return: String with every character converted to the most similar ASCII character.
        """
        unicode_string = re.sub(u"\u2013", "-", unicode_string)  # en dash
        return ''.join(
            character for character in unicodedata.normalize('NFD', unicode_string)
            if unicodedata.category(character) != 'Mn'
            and character in string.printable
        )
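
    # For example, unicode_to_ascii(u'caf\u00e9') returns 'cafe': NFD normalization
    # splits the accented letter into a base letter plus a combining mark (category
    # 'Mn'), and the mark is then filtered out.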

    def devectorize_tag(self, tag_indices):
        """Devectorize a list of tag indices using self.tag_to_index.

        :param tag_indices: List of tag indices.
        :return: List of tags.
        """
        to_tag = {y: x for x, y in self.tag_to_index.items()}
        tags = [to_tag[index] for index in tag_indices]
        return tags
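
    # For example, with self.tag_to_index == {'O': 0, 'B-Drug': 1} (illustrative),
    # devectorize_tag([1, 0, 0]) returns ['B-Drug', 'O', 'O'].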

    def find_window_indices(self, token):
        """Get the relative indices of window words. Avoids trying to access keys that don't exist.

        :param token: Token the indices are relative to.
        :return: List of indices.
        """
        window = []
        window_range = range(-self.window_size, self.window_size + 1)

        for i in window_range:
            test_key = '%d:text' % i
            if test_key in token:
                window.append(i)

        return window
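
    # For example, with window_size == 1, a sentence-initial token presumably has
    # no '-1:text' key, so the returned window is [0, 1] rather than [-1, 0, 1].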

    def one_hot(self, index_dictionary, value):
        """Create a one-hot vector representation for discrete features that appear in the x_data.

        :param index_dictionary: A dictionary mapping discrete features to unique integers (i.e.
            the order they appeared in the x_data; see self.create_feature_dictionary).
        :param value: The discrete feature.
        :return: A one-hot vector for that discrete feature.
        """
        vector = [0.0] * len(index_dictionary)
        if value in index_dictionary:
            index = index_dictionary[value]
            vector[index] = 1.0
        return vector
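
    # For example, one_hot({'NOUN': 0, 'VERB': 1}, 'VERB') returns [0.0, 1.0], and
    # a value missing from the dictionary, such as 'ADJ', returns [0.0, 0.0].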

    def vectorize_tokens(self, tokens):
        """Vectorize a list of tokens.

        :param tokens: Tokens to vectorize.
        :return: List of vectors.
        """
        tokens_vector = []

        for token in tokens:
            token_vector = []

            # Add text index for looking up the word embedding
            token_text = token['0:text']
            token_text = self.unicode_to_ascii(token_text)

            # Look up the word embedding index
            try:
                embedding_index = self.word_vectors.vocab[token_text].index
            except KeyError:
                embedding_index = len(self.word_vectors.vocab)
                # Only for logging untrained tokens
                self.untrained_tokens.add(token_text)
            token_vector.append(embedding_index)

            # Add list of character indices as the second item
            character_indices = []
            for character in token_text:
                index = self.character_to_index[character]
                character_indices.append(index)

            # If there were no indices (e.g. the token was special characters only)
            if not character_indices:
                # Append the padding index
                character_indices.append(0)
            token_vector.append(character_indices)

            # Find window indices
            window = self.find_window_indices(token)

            # Add features to the vector in order
            window_range = range(-self.window_size, self.window_size + 1)
            other_feature_names = sorted(self.other_features)
            for i in window_range:
                if i in window:
                    for feature_name in other_feature_names:
                        key = '%d:%s' % (i, feature_name)
                        feature = token[key]
                        vector = self.one_hot(self.other_features[feature_name], feature)
                        token_vector.extend(vector)
                else:
                    for feature_name in other_feature_names:
                        vector = [0.0] * len(self.other_features[feature_name])
                        token_vector.extend(vector)

            tokens_vector.append(token_vector)

        return tokens_vector
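
    # Each token vector therefore has the layout
    # [embedding_index, [character_indices...], one-hot window features...], where
    # every out-of-vocabulary token shares the sentinel index
    # len(self.word_vectors.vocab) and absent window positions contribute
    # all-zero feature blocks of the same width.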

    def vectorize_tags(self, tags):
        """Convert a list of tag names into their indices.

        :param tags: List of tags to convert.
        :return: Torch tensor of indices.
        """
        indices = [self.tag_to_index[tag] for tag in tags]
        return torch.tensor(indices, dtype=torch.long, device=self.device)
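
    # For example, with self.tag_to_index == {'O': 0, 'B-Drug': 1} (illustrative),
    # vectorize_tags(['O', 'B-Drug', 'O']) returns tensor([0, 1, 0]) on self.device.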

    def vectorize_dataset(self, x_data, y_data):
        """Vectorize an entire dataset.

        :param x_data: Sequences.
        :param y_data: True labels.
        :return: Vectorized data.
        """
        self.create_tag_dictionary(y_data)

        # Find other feature names
        self.find_other_features(x_data[0][0])

        # Calculate window size
        self.find_window_size(x_data)

        # Create feature dictionaries
        for feature in self.other_features:
            self.other_features[feature] = self.create_feature_dictionary(feature, x_data)

        # Vectorize data
        sentences = []
        correct_tags = []
        for sentence, sentence_tags in zip(x_data, y_data):
            tokens_vector = self.vectorize_tokens(sentence)
            correct_tags_vector = self.vectorize_tags(sentence_tags)
            sentences.append(tokens_vector)
            correct_tags.append(correct_tags_vector)

        data = list(zip(sentences, correct_tags))
        return data
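
    # Typical flow (illustrative): after load_word_embeddings(...), calling
    # vectorize_dataset(x_data, y_data) returns a list of
    # (tokens_vector, tag_tensor) pairs, one per sentence, ready for batching.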

    def get_values(self):
        """Get Vectorizer values so they can be saved or migrated.

        :return: Dictionary of values.
        """
        values = {
            'tag_to_index': self.tag_to_index,
            'character_to_index': self.character_to_index,
            'untrained_tokens': self.untrained_tokens,
            'window_size': self.window_size,
            'other_features': self.other_features
        }
        return values

    def load_values(self, values):
        """Load saved Vectorizer values into this object.

        :param values: Values to load.
        """
        self.tag_to_index = values['tag_to_index']
        self.untrained_tokens = values['untrained_tokens']
        self.character_to_index = values['character_to_index']
        self.window_size = values['window_size']
        try:
            self.other_features = values['other_features']
        except KeyError:
            raise Exception('Tried to load a deprecated medaCy model')
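

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the original module).
    # It exercises the methods that need no word-embeddings file; the tag names
    # and feature values are hypothetical.
    vectorizer = Vectorizer(torch.device('cpu'))
    vectorizer.create_tag_dictionary([['O', 'B-Drug'], ['I-Drug', 'O']])
    print(vectorizer.tag_to_index)  # {'O': 0, 'B-Drug': 1, 'I-Drug': 2}

    tag_tensor = vectorizer.vectorize_tags(['O', 'B-Drug'])
    print(vectorizer.devectorize_tag(tag_tensor.tolist()))  # ['O', 'B-Drug']

    print(vectorizer.unicode_to_ascii(u'na\u00efve caf\u00e9'))  # naive cafe
    print(vectorizer.one_hot({'NOUN': 0, 'VERB': 1}, 'VERB'))  # [0.0, 1.0]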