--- a
+++ b/utils.py
@@ -0,0 +1,208 @@
+from __future__ import print_function
+from functools import reduce
+import array
+import io
+import tarfile
+import numpy as np
+import re
+import pdb
+import pickle
+import h5py
+from keras.preprocessing.sequence import pad_sequences
+from glove import Glove
+
+
+def check_repeated(name, repeated_list):
+    name = name.lower().strip()
+    return name if name not in repeated_list else repeated_list[name]
+
+
+def process_title(word):
+    return re.sub(r'\W+', ' ', word).strip().lower()
+
+
+def tokenize(sent):
+    '''Return the tokens of a sentence including punctuation.
+
+    >>> tokenize('Bob dropped the apple. Where is the apple?')
+    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
+    '''
+    return [x.strip().lower() for x in re.split(r'(\W+)', sent) if x.strip()]
+
+
+def parse_stories(lines, only_supporting=False, repeated_list=None):
+    '''Parse stories.
+
+    If only_supporting is true, only the sentences that support the answer are kept.
+    '''
+    data = []
+    story = []
+    for line in lines:
+        line = line.decode('utf-8').strip()
+        spl = line.split(' ', 1)
+        if len(spl) > 1:
+            nid, line = spl
+        else:
+            continue
+        try:
+            nid = int(nid)
+        except ValueError:
+            pdb.set_trace()
+        if nid == 0:
+            story = []
+        if '\t' in line:
+            supporting, a = line.split('\t')
+            a = list(map(process_title, a.split(',')))
+            options = [] if len(a) == 1 else list(set(a[1:]))
+            a = a[0]
+            substory = None
+            # Provide all the substories
+            if supporting:
+                story.append([tokenize(supporting) + [u'.']])
+            substory = [x for x in story if x]
+            # TODO: lowercasing should have been done in the previous processing steps
+            if not substory:
+                continue
+            data.append((substory, a.lower(), [x.lower() for x in options]))
+        else:
+            sent = tokenize(line)
+            story.append([sent + [u'.']])
+    return data
+
+
+def get_stories(f, only_supporting=False, max_length=None, repeated_list=None, min_length=None):
+    '''Given a file handle, read the file, retrieve the stories, and then convert
+    the sentences into a single story.
+
+    If max_length is supplied, any stories longer than max_length tokens will be discarded.
+    '''
+    data = parse_stories(f.readlines(), only_supporting=only_supporting, repeated_list=repeated_list)
+    flatten = lambda data: reduce(lambda x, y: x + y, data)
+    data = [[flatten(reversed(story)), answer, options]
+            for story, answer, options in data
+            if not max_length or len(flatten(story)) < max_length]
+    print(len(data))
+    # Keep only stories with at least min_length facts
+    if min_length:
+        data = [x for x in data if len(x[0]) > min_length]
+    print(len(data))
+    return data
+
+
+def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
+    X = []
+    Xq = []
+    Y = []
+    for story, query, answer in data:
+        x = [word_idx[w] for w in story]
+        xq = [word_idx[w] for w in query]
+        y = np.zeros(len(word_idx))  # let's not forget that index 0 is reserved
+        y[word_idx[answer]] = 1
+        X.append(x)
+        Xq.append(xq)
+        Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen),
+            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))
+
+
+def get_spacy_vectors(data, answer_dict, story_maxlen, model):
+    X = []
+    Y = []
+    for story, answer in data:
+        story = story[:story_maxlen] if len(story) > story_maxlen else story
+        x = [model(w).vector for w in story]
+        X.append(x)
+        if answer_dict is not None:
+            y = np.zeros(len(answer_dict))
+            y[answer_dict[answer]] = 1
+            Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
+            np.array(Y))
+
+
+def get_word_vectors(data, answer_dict, story_maxlen, model):
+    X = []
+    Y = []
+    for story, answer in data:
+        story = story[:story_maxlen] if len(story) > story_maxlen else story
+        x = [model.word_vectors[model.dictionary[w]] for w in story
+             if model.dictionary.get(w) is not None]
+        X.append(x)
+        if answer_dict is not None:
+            y = np.zeros(len(answer_dict))
+            y[answer_dict[answer]] = 1
+            Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
+            np.array(Y))
+
+
+def create_vectors_dataset(input_files, vector_files, max_len=500):
+    print('Creating word vectors file')
+
+    training_set_file, test_set_file = input_files
+    train_word_file, test_word_file = vector_files
+
+    with open(training_set_file, 'rb') as train_pkl:
+        train_stories = pickle.load(train_pkl)
+    with open(test_set_file, 'rb') as test_pkl:
+        test_stories = pickle.load(test_pkl)
+
+    train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in train_stories]
+    test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in test_stories]
+
+    vocab = sorted(reduce(lambda x, y: x | y,
+                          (set(story + [answer]) for story, answer in train_stories + test_stories)))
+
+    # Reserve 0 for masking via pad_sequences
+    vocab_size = len(vocab) + 1
+    story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))
+
+    print('-')
+    print('Vocab size:', vocab_size, 'unique words')
+    print('Story max length:', story_maxlen, 'words')
+    print('Number of training stories:', len(train_stories))
+    print('Number of test stories:', len(test_stories))
+    print('-')
+    print('Here\'s what a "story" tuple looks like (input, answer):')
+    print(train_stories[0])
+    print('-')
+    print('Vectorizing the word sequences...')
+
+    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
+
+    answer_vocab = sorted(reduce(lambda x, y: x | y,
+                                 (set([answer]) for _, answer in train_stories + test_stories)))
+    # Reserve 0 for masking via pad_sequences
+    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
+    print('Answers dict len: {0}'.format(len(answer_dict)))
+
+    # TODO: check that this file exists before loading it
+    word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
+    word_vectors_model = Glove.load_stanford(word_vectors_dir)
+
+    inputs_train, answers_train = get_word_vectors(train_stories, answer_dict,
+                                                   max_len, word_vectors_model)
+    inputs_test, answers_test = get_word_vectors(test_stories, answer_dict,
+                                                 max_len, word_vectors_model)
+
+    with h5py.File(train_word_file, 'w') as train_f:
+        _ = train_f.create_dataset('inputs', data=inputs_train)
+        _ = train_f.create_dataset('answers', data=answers_train)
+    with h5py.File(test_word_file, 'w') as test_f:
+        _ = test_f.create_dataset('inputs', data=inputs_test)
+        _ = test_f.create_dataset('answers', data=answers_test)
+
+    return (inputs_train, answers_train), (inputs_test, answers_test)
+
+
+def save_vectors_dict(input_files):
+    # TODO: check that this file exists before reading it
+    filename = 'word_vectors/glove.42B.300d.txt'
+    word_vectors_dict = 'word_vectors/glove_dict.hdf5'
+    dct = {}
+    vectors = array.array('d')
+
+    # Read in the data.
+    with io.open(filename, 'r', encoding='utf-8') as savefile:
+        for i, line in enumerate(savefile):
+            tokens = line.split(' ')
+
+            word = tokens[0]
+            entries = tokens[1:]
+
+            dct[word] = i
+            vectors.extend(float(x) for x in entries)
+
+    print('Saving to hdf5 file')
+    # h5py cannot store a Python dict directly, so save the vector matrix here
+    # (row i is the embedding of the word with dct[word] == i) and pickle the
+    # word -> row-index mapping in a companion file.
+    vector_matrix = np.asarray(vectors).reshape(len(dct), -1)
+    with h5py.File(word_vectors_dict, 'w') as vector_f:
+        _ = vector_f.create_dataset('vectors', data=vector_matrix)
+    with open(word_vectors_dict.replace('.hdf5', '.pkl'), 'wb') as dict_f:
+        pickle.dump(dct, dict_f)
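+
+
+if __name__ == '__main__':
+    # Illustrative usage sketch only: the pickle and output paths below are
+    # hypothetical placeholders, and the input pickles are assumed to contain
+    # (facts, answer) pairs in the format expected by create_vectors_dataset.
+    input_files = ('data/train_stories.pkl', 'data/test_stories.pkl')
+    vector_files = ('word_vectors/train_vectors.hdf5',
+                    'word_vectors/test_vectors.hdf5')
+    (inputs_train, answers_train), (inputs_test, answers_test) = \
+        create_vectors_dataset(input_files, vector_files, max_len=500)
+    print('Train inputs shape:', inputs_train.shape)
+    print('Test inputs shape:', inputs_test.shape)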