[349d16]: / code / dnc_code / tasks / babi_task_GPU.py


# bAbI question answering task for DNC
# Note: some of the data-processing functions in this task are adapted from GitHub user bgavran's DNC implementation
import os
import re
import torch
import pickle
from torch import nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import random
from DNC_GPU.dnc import DNC_Module # Importing DNC Implementation

class task_babi():

    def __init__(self):
        self.name = "bAbI_task_GPU"
        self.controller_size = 128
        self.controller_layers = 1
        self.num_read_heads = 1
        self.num_write_heads = 1
        self.sequence_width = -1  # Length of each one-hot word vector (vocabulary size), set in read_data()
        self.sequence_len = -1  # Word length of the longest story seen so far
        self.memory_N = 128
        self.memory_M = 128
        self.num_batches = -1
        self.num_epoch = 1
        self.batch_size = 10
        self.adam_lr = 1e-4
        self.adam_betas = (0.9, 0.999)
        self.adam_eps = 1e-8
        self.machine = None
        self.loss = None
        self.optimizer = None
        self.ind_to_word = None
        self.data_dir = "Data/bAbI/en-10k"  # Data directory

    def get_task_name(self):
        return self.name

    def init_dnc(self):
        if not os.path.isfile("Data/sequence_width.txt"):
            self.read_data()  # To set the sequence width
        else:
            self.sequence_width = pickle.load(open("Data/sequence_width.txt", 'rb'))  # To set the sequence width
        self.machine = DNC_Module(self.sequence_width, self.sequence_width, self.controller_size, self.controller_layers, self.num_read_heads, self.num_write_heads, self.memory_N, self.memory_M)
        self.machine.cuda()  # Enabling GPU
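
    # "Data/sequence_width.txt" holds a pickled integer (the vocabulary size written by read_data), hence the
    # binary mode despite the .txt extension. DNC_Module is given sequence_width for both its input and output
    # size, since input stories and answers share the same one-hot vocabulary.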

    def init_loss(self):
        self.loss = nn.CrossEntropyLoss(reduction='none').cuda()  # Cross-entropy loss (log-softmax + NLL), kept per-element so it can be masked later

    def init_optimizer(self):
        self.optimizer = optim.Adam(self.machine.parameters(), lr=self.adam_lr, betas=self.adam_betas, eps=self.adam_eps)

    def calc_loss(self, Y_pred, Y, mask):
        # Y: dim -> (sequence_len x batch_size)
        # Y_pred: dim -> (sequence_len x batch_size x sequence_width)
        # mask: dim -> (sequence_len x batch_size)
        loss_vec = torch.empty(Y.shape, dtype=torch.float32).cuda()
        for i in range(Y_pred.shape[0]):
            loss_vec[i, :] = self.loss(Y_pred[i], Y[i])
        return torch.sum(loss_vec*mask)/torch.sum(mask)
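
    # calc_loss applies the cross-entropy loss independently at every time step (Y_pred[i] is
    # batch_size x sequence_width, Y[i] is batch_size) and then averages the per-token losses over the
    # positions where mask is 1, i.e. only over the time steps at which an answer word is expected.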

    def calc_cost(self, Y_pred, Y, mask):  # Returns (correctly predicted answer tokens, total answer tokens)
        # Y: dim -> (sequence_len x batch_size)
        # Y_pred: dim -> (sequence_len x batch_size x sequence_width)
        # mask: dim -> (sequence_len x batch_size)
        Y_pred, Y, mask = Y_pred.cpu(), Y.cpu(), mask.cpu()
        return torch.sum(((F.softmax(Y_pred, dim=2).max(2)[1]) == Y).type(torch.long)*mask.type(torch.long)).item(), torch.sum(mask).item()

    def print_word(self, word_vec):  # Prints the word corresponding to a one-hot word vector
        # "word_vec" dimension: (1 x sequence_width)
        idx = int(np.argmax(word_vec))  # Converted to a plain int so it can be used as a dictionary key
        word = self.ind_to_word[idx]
        print(word + "\n")

    def to_one_hot(self, story):  # Converts a story (list of word indices) into one-hot form
        out_token = []
        I = np.eye(self.sequence_width)
        for idx in story:
            out_token.append(I[int(idx)])
        if len(out_token) > self.sequence_len:
            self.sequence_len = len(out_token)  # Tracks the longest story seen so far
        return out_token

    def padding_labels(self, stories):  # Separate function to pad labels, since labels stay in index form (not one-hot) as required by PyTorch's CrossEntropyLoss
        padded_stories = []
        for story in stories:
            if len(story) < self.sequence_len:
                li = [1 for i in range(self.sequence_len - len(story))]  # Index 1 is the '*' padding symbol
                story.extend(li)
            padded_stories.append(np.asarray(story, dtype=np.int64))
        return padded_stories

    def padding(self, stories):  # Pads each story with the padding element so that all stories have equal length
        padded_stories = []
        for story in stories:
            if len(story) < self.sequence_len:
                li = self.to_one_hot(np.ones(self.sequence_len - len(story)))  # One-hot vectors for index 1, the '*' padding symbol
                story.extend(li)
            padded_stories.append(np.asarray(story, dtype=np.float32))
        return padded_stories
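
    # Illustration (values are hypothetical): with sequence_len = 4 and I = np.eye(sequence_width),
    # padding turns a 2-word one-hot story [I[5], I[7]] into [I[5], I[7], I[1], I[1]], while
    # padding_labels turns the matching label list [5, 7] into [5, 7, 1, 1].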

    def flatten_if_list(self, l):  # Merges any nested lists into the outer list. Example: ['you', '?', ['-']] -> ['you', '?', '-']
        newl = []
        for elem in l:
            if isinstance(elem, list):  # Checking whether the element is a list or not
                newl.extend(elem)  # list.extend() appends all the elements of the given list
            else:
                newl.append(elem)
        return newl

    def structure_data(self, x, y):  # Prepares batched data for the bAbI task
        # Preparing data
        keys = list(x.keys())
        random.shuffle(keys)  # Randomly shuffling the key (file) list
        inp_story = []
        out_story = []
        for key in keys:
            inp_story.extend(x[key])
            out_story.extend(y[key])
        story_idx = list(range(0, len(inp_story)))
        random.shuffle(story_idx)
        # Stories are grouped into batches and yielded one batch at a time, since a single list cannot hold all of them at once
        num_batch = int(len(story_idx)/self.batch_size)
        self.num_batches = num_batch
        counter = 1
        # Output data
        x_out = []
        y_out = []
        mask_inp = []  # Will be used to build the mask that makes the "non-answer" output words from the DNC irrelevant
        for i in story_idx:
            if num_batch <= 0:
                break
            x_out.append(self.to_one_hot(inp_story[i]))
            y_out.append(out_story[i])
            mask_inp.append(inp_story[i])  # Appending the input story (as indices) for building the mask
            if counter % self.batch_size == 0:
                # Resetting counter
                counter = 0
                # Padding
                x_out_array = torch.tensor(np.asarray(self.padding(x_out)).swapaxes(0, 1))  # Converting from (batch_size x story_length x word_size) to (story_length x batch_size x word_size)
                y_out_array = torch.tensor(np.asarray(self.padding_labels(y_out)).swapaxes(0, 1), dtype=torch.long)  # Converting from (batch_size x story_length) to (story_length x batch_size)
                m_inp_array = torch.tensor(np.asarray(self.padding_labels(mask_inp)).swapaxes(0, 1), dtype=torch.long)  # Converting from (batch_size x story_length) to (story_length x batch_size)
                # Renewing lists and updating the batch number
                x_out = []
                y_out = []
                mask_inp = []
                num_batch -= 1
                yield (self.num_batches - num_batch), x_out_array, y_out_array, (m_inp_array == 0).float()
            counter += 1
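
    # Each yielded batch has X of shape (story_length x batch_size x sequence_width) and Y/mask of shape
    # (story_length x batch_size); the mask is 1.0 exactly where the input token is the '-' output symbol
    # (index 0), i.e. where the network is supposed to produce an answer.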

    def read_data(self):  # Reading and cleaning data from the files
        storage_file = "Data/cleaned_data_bAbI_" + self.data_dir.split('/')[2] + ".txt"
        if not os.path.isfile(storage_file):
            output_symbol = "-"  # Indicates to the DNC that an output is expected
            newstory_delimiter = " NEWSTORY "  # To separate stories
            pad_symbol = "*"  # Padding symbol
            file_paths = []
            word_to_ind = {output_symbol: 0, pad_symbol: 1}  # Dictionary storing the index of every word in the bAbI dataset; the predefined symbols are stored first
            all_input_stories = {}
            all_output_stories = {}
            # Making a list of all the files in the data directory
            for f in os.listdir(self.data_dir):
                f_path = os.path.join(self.data_dir, f)
                if os.path.isfile(f_path):
                    file_paths.append(f_path)
            # Processing the text files
            for file_path in file_paths:
                # print(file_path)
                # Cleaning the text
                file = open(file_path).read().lower()
                file = re.sub("\n1 ", newstory_delimiter, file)  # Adding a delimiter between two stories
                file = re.sub(r"\d+|\n|\t", " ", file)  # Removing all numbers, newlines and tabs
                file = re.sub("([?.])", r" \1", file)  # Adding a space before all punctuation
                stories = file.split(newstory_delimiter)  # Splitting the whole text into stories
                input_stories = []  # Stores the stories in index form, where each word has a unique index
                output_stories = []
                # Tokenizing the text
                for i, story in enumerate(stories):
                    input_tokens = story.split()  # Input stories are fed to the DNC
                    output_tokens = story.split()  # Output stories work as labels
                    for i, token in enumerate(input_tokens):  # Whenever a "?" is encountered, the answer that follows is replaced with the "-" symbol in the DNC input
                        if token == "?":
                            output_tokens[i + 1] = output_tokens[i + 1].split(",")
                            input_tokens[i + 1] = [output_symbol for _ in range(len(output_tokens[i + 1]))]
                    input_tokens = self.flatten_if_list(input_tokens)
                    output_tokens = self.flatten_if_list(output_tokens)
                    # Calculating the index of every word
                    for token in output_tokens:
                        if token not in word_to_ind:
                            word_to_ind[token] = len(word_to_ind)
                    input_stories.append([word_to_ind[elem] for elem in input_tokens])  # Storing each story as a list of word indices
                    output_stories.append([word_to_ind[elem] for elem in output_tokens])
                all_input_stories[file_path] = input_stories  # Storing all the stories for each file
                all_output_stories[file_path] = output_stories
            # Dumping all the cleaned data into a file
            pickle.dump((word_to_ind, all_input_stories, all_output_stories), open(storage_file, 'wb'))
            pickle.dump(len(word_to_ind), open("Data/sequence_width.txt", 'wb'))
            self.sequence_width = len(word_to_ind)  # Vector length of the one-hot word vectors
        else:
            word_to_ind, all_input_stories, all_output_stories = pickle.load(open(storage_file, 'rb'))
        return word_to_ind, all_input_stories, all_output_stories
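
    # read_data caches its result: on the first run it tokenizes every file under data_dir, builds the
    # word-to-index dictionary and pickles everything into the cleaned-data file plus "Data/sequence_width.txt";
    # later runs just unpickle the cached tuple (in that case self.sequence_width is restored separately in init_dnc).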

    def get_training_data(self):
        word_to_ind, all_input_stories, all_output_stories = self.read_data()
        self.ind_to_word = {ind: word for word, ind in word_to_ind.items()}  # Reverse index-to-word dictionary, used to show the final output
        # Separating the training data
        x_train_stories = {k: v for k, v in all_input_stories.items() if k[-9:] == "train.txt"}
        y_train_stories = {k: v for k, v in all_output_stories.items() if k[-9:] == "train.txt"}
        return self.structure_data(x_train_stories, y_train_stories)  # dim: x_train, y_train -> a list of (sequence_len x sequence_width) sized stories

    def get_test_data(self):  # Sample data for testing
        _, all_input_stories, all_output_stories = self.read_data()
        # Separating the test data
        x_test_stories = {k: v for k, v in all_input_stories.items() if k[-8:] == "test.txt"}
        y_test_stories = {k: v for k, v in all_output_stories.items() if k[-8:] == "test.txt"}
        return self.structure_data(x_test_stories, y_test_stories)  # dim: x_test, y_test -> a list of (sequence_len x sequence_width) sized stories

    def test_model(self):  # Testing the model
        correct = 0
        total = 0
        print("\n")
        for batch_num, X, Y, mask in self.get_test_data():
            self.machine.initialization(self.batch_size)  # Initializing states
            Y_out = torch.zeros(X.shape).cuda()  # Kept on the GPU, as in train_model
            # The whole story is fed to the DNC time step by time step; at answer positions the input is only
            # the '-' symbol, so the network must emit the answer words itself (outputs at the other positions
            # are ignored via the mask)
            X, Y, mask = X.cuda(), Y.cuda(), mask.cuda()  # Sending to the CUDA device
            embeddings = self.machine.backward_prediction(X)  # Creating embeddings from the data for backward calculation (used in reverse order below)
            temp_size = X.shape[0]
            for i in range(temp_size):
                Y_out[i, :, :], _ = self.machine(X[i], embeddings[temp_size-i-1])
            corr, tot = self.calc_cost(Y_out, Y, mask)
            correct += corr
            total += tot
            print("Test Example " + str(batch_num) + "/" + str(self.num_batches) + " processed, Batch Accuracy: " + str((float(corr)/float(tot))*100.0) + " %")
        accuracy = (float(correct)/float(total))*100.0
        print("\nOverall Accuracy: " + str(accuracy) + " %")
        return accuracy  # in %

    def clip_grads(self):  # Clipping gradients for stability
        """Clips gradients to the range [-10, 10]."""
        parameters = list(filter(lambda p: p.grad is not None, self.machine.parameters()))
        for p in parameters:
            p.grad.data.clamp_(-10, 10)

    def train_model(self):
        # The model is optimized with the masked cross-entropy loss; it is evaluated by counting prediction
        # errors on the answer tokens (the cost computed in calc_cost)
        loss_list = []
        seq_length = []
        save_batch = 500  # A checkpoint is saved every 'save_batch' batches
        last_batch = 0
        for j in range(self.num_epoch):
            for batch_num, X, Y, mask in self.get_training_data():
                self.optimizer.zero_grad()  # Clearing old gradients before computing fresh ones
                self.machine.initialization(self.batch_size)  # Initializing states
                Y_out = torch.zeros(X.shape).cuda()
                # The whole story is fed to the DNC time step by time step; at answer positions the input is
                # only the '-' symbol, so the network must emit the answer words itself (outputs at the other
                # positions are ignored via the mask)
                X, Y, mask = X.cuda(), Y.cuda(), mask.cuda()  # Sending to the CUDA device
                embeddings = self.machine.backward_prediction(X)  # Creating embeddings from the data for backward calculation
                temp_size = X.shape[0]
                for i in range(temp_size):
                    Y_out[i, :, :], _ = self.machine(X[i], embeddings[temp_size-i-1])  # Passing the embeddings in reverse order
                loss = self.calc_loss(Y_out, Y, mask)
                loss.backward()
                self.clip_grads()
                self.optimizer.step()
                loss_list += [loss.item()]
                seq_length += [Y.shape[0]]
                if (batch_num % save_batch) == 0:
                    self.save_model(j, batch_num)
                last_batch = batch_num
                print("Epoch: " + str(j) + "/" + str(self.num_epoch) + ", Batch: " + str(batch_num) + "/" + str(self.num_batches) + ", Loss: " + str(loss.item()))
            self.save_model(j, last_batch)
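
    # Each training batch re-initializes the DNC state, obtains the backward embeddings for the story, runs the
    # forward pass step by step, and then optimizes the masked cross-entropy with Adam and gradient clipping;
    # checkpoints are written every 'save_batch' batches and once more at the end of each epoch.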

    def save_model(self, curr_epoch, curr_batch):
        # 'start_epoch' and 'start_batch' below are the epoch and batch numbers from which training should resume after the model is next loaded
        # Note: it is recommended to resume from 'start_epoch' only, and not from 'start_epoch' + 'start_batch', because batches are formed randomly
        if not os.path.exists("Saved_Models/" + self.name):
            os.mkdir("Saved_Models/" + self.name)
        state_dic = {'task_name': self.name, 'start_epoch': curr_epoch + 1, 'start_batch': curr_batch + 1, 'state_dict': self.machine.state_dict(), 'optimizer_dic': self.optimizer.state_dict()}
        filename = "Saved_Models/" + self.name + "/" + self.name + "_" + str(curr_epoch) + "_" + str(curr_batch) + "_saved_model.pth.tar"
        torch.save(state_dic, filename)

    def load_model(self, option, epoch, batch):
        path = "Saved_Models/" + self.name + "/" + self.name + "_" + str(epoch) + "_" + str(batch) + "_saved_model.pth.tar"
        if option == 1:  # Loading for training
            checkpoint = torch.load(path)
            self.machine.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_dic'])
        else:  # Loading for testing
            checkpoint = torch.load(path)
            self.machine.load_state_dict(checkpoint['state_dict'])
            self.machine.eval()
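

# Minimal usage sketch: one plausible way to drive this task (the original repository presumably runs it from a
# separate script). Assumes the bAbI text files are available under Data/bAbI/en-10k and that a "Saved_Models"
# directory already exists for the checkpoints written by save_model().
if __name__ == "__main__":
    task = task_babi()
    task.init_dnc()        # Builds the DNC (reads the data once, if needed, to learn the vocabulary size)
    task.init_loss()       # Per-time-step cross-entropy loss
    task.init_optimizer()  # Adam with the hyperparameters set in __init__
    task.train_model()     # Trains for num_epoch epoch(s), checkpointing periodically
    task.test_model()      # Reports the overall answer-token accuracy on the test stories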