Diff of /utils.py [000000] .. [5c3b8b]

--- a
+++ b/utils.py
@@ -0,0 +1,208 @@
+from __future__ import print_function
+from functools import reduce
+import array
+import io
+import tarfile
+import numpy as np
+import re
+import pdb
+import pickle
+import h5py
+from keras.preprocessing.sequence import pad_sequences
+from glove import Glove
+
+
+def check_repeated(name, repeated_list):
+    # Map a name to its canonical form if it appears in the repeated_list lookup
+    name = name.lower().strip()
+    return name if name not in repeated_list else repeated_list[name]
+
+def process_title(word):
+    # Collapse non-word characters to spaces and normalise case
+    return re.sub(r'\W+', ' ', word).strip().lower()
+
+def tokenize(sent):
+    '''Return the lower-cased tokens of a sentence, including punctuation.
+
+    >>> tokenize('Bob dropped the apple. Where is the apple?')
+    ['bob', 'dropped', 'the', 'apple', '.', 'where', 'is', 'the', 'apple', '?']
+    '''
+    return [x.strip().lower() for x in re.split(r'(\W+)', sent) if x.strip()]
+
+
+def parse_stories(lines, only_supporting=False, repeated_list=None):
+    '''Parse stories from the raw lines of a data file.
+
+    If only_supporting is true, only the sentences that support the answer are kept
+    (note: neither only_supporting nor repeated_list is used in the body yet).
+    '''
+    data = []
+    story = []
+    for line in lines:
+        line = line.decode('utf-8').strip()
+        spl = line.split(' ', 1)
+        if len(spl) > 1:
+            nid, line = spl
+        else:
+            continue
+        try:
+            nid = int(nid)
+        except ValueError:
+            # Debugging aid: drop into pdb when the sentence id is not an integer
+            pdb.set_trace()
+        if nid == 0:
+            story = []
+        if '\t' in line:
+            # A tab separates the supporting text from the answer(s); the first
+            # comma-separated value is the answer, the rest are distractor options
+            supporting, a = line.split('\t')
+            a = list(map(process_title, a.split(',')))
+            options = [] if len(a) == 1 else list(set(a[1:]))
+            a = a[0]
+            # Provide all the substories
+            if supporting:
+                story.append([tokenize(supporting) + [u'.']])
+            substory = [x for x in story if x]
+            # TODO: I should have done the lower in previous processing steps
+            if not substory:
+                continue
+            data.append((substory, a.lower(), [x.lower() for x in options]))
+        else:
+            sent = tokenize(line)
+            story.append([sent + [u'.']])
+    return data
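+
+# Illustrative sketch of the line format parse_stories expects (the example
+# values below are assumptions, not taken from a real data file):
+#
+#   0 mary moved to the bathroom .
+#   1 john went to the hallway .
+#   some supporting sentence<TAB>answer title,option two,option three
+#
+# A line whose sentence id is 0 starts a new story. A line containing a tab is a
+# question/answer point: the text before the tab is appended as a final supporting
+# sentence, and the comma-separated values after it are the answer followed by
+# its distractor options.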
+
+
+def get_stories(f, only_supporting=False, max_length=None, repeated_list=None, min_length=None):
+    '''Given a file handle, read the file, retrieve the stories, and then convert the sentences into a single story.
+
+    If max_length is supplied, any stories longer than max_length tokens will be discarded.
+    '''
+    data = parse_stories(f.readlines(), only_supporting=only_supporting, repeated_list=repeated_list)
+    flatten = lambda data: reduce(lambda x, y: x + y, data)
+    data = [[flatten(reversed(story)), answer, options] for story, answer, options in data
+            if not max_length or len(flatten(story)) < max_length]
+    print('Stories after max_length filter:', len(data))
+    # Keep only sufficiently long stories (e.g. at least two facts)
+    if min_length:
+        data = [x for x in data if len(x[0]) > min_length]
+    print('Stories after min_length filter:', len(data))
+    return data
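+
+# Example usage (a minimal sketch; the archive and member paths are placeholders,
+# not files shipped with this module). tar.extractfile yields bytes lines, which
+# parse_stories decodes; the min_length value here is illustrative:
+#
+#   with tarfile.open('data/stories.tar.gz') as tar:
+#       train_file = tar.extractfile('stories/train.txt')
+#       train_stories = get_stories(train_file, max_length=500, min_length=2)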
+
+
+def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
+    X = []
+    Xq = []
+    Y = []
+    for story, query, answer in data:
+        x = [word_idx[w] for w in story]
+        xq = [word_idx[w] for w in query]
+        # Index 0 is reserved for padding, so the one-hot vector needs len + 1 slots
+        y = np.zeros(len(word_idx) + 1)
+        y[word_idx[answer]] = 1
+        X.append(x)
+        Xq.append(xq)
+        Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen),
+            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))
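+
+# Minimal usage sketch (the variable names are illustrative; data is assumed to
+# hold (story, query, answer) token-list triples):
+#
+#   vocab = sorted(set(w for s, q, a in data for w in s + q + [a]))
+#   word_idx = dict((w, i + 1) for i, w in enumerate(vocab))  # 0 is the padding index
+#   story_maxlen = max(len(s) for s, _, _ in data)
+#   query_maxlen = max(len(q) for _, q, _ in data)
+#   X, Xq, Y = vectorize_stories(data, word_idx, story_maxlen, query_maxlen)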
+
+def get_spacy_vectors(data, answer_dict, story_maxlen, model):
+    X = []
+    Y = []
+    for story, answer in data:
+        # Truncate overlong stories before embedding them
+        story = story[:story_maxlen] if len(story) > story_maxlen else story
+        # Tokens coming from parse_stories are already unicode, so they can be
+        # passed to the spaCy model directly
+        x = [model(w).vector for w in story]
+        X.append(x)
+        if answer_dict is not None:
+            y = np.zeros(len(answer_dict))
+            y[answer_dict[answer]] = 1
+            Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
+            np.array(Y))
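+
+# Example usage (illustrative sketch; assumes a spaCy English model such as
+# 'en_core_web_sm' is installed):
+#
+#   import spacy
+#   nlp = spacy.load('en_core_web_sm')
+#   inputs, answers = get_spacy_vectors(stories, answer_dict, story_maxlen=500, model=nlp)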
+
+def get_word_vectors(data, answer_dict, story_maxlen, model):
+    X = []
+    Y = []
+    for story, answer in data:
+        # Truncate overlong stories and drop tokens missing from the GloVe vocabulary
+        story = story[:story_maxlen] if len(story) > story_maxlen else story
+        x = [model.word_vectors[model.dictionary[w]] for w in story
+             if w in model.dictionary]
+        X.append(x)
+        if answer_dict is not None:
+            y = np.zeros(len(answer_dict))
+            y[answer_dict[answer]] = 1
+            Y.append(y)
+    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
+            np.array(Y))
+
+def create_vectors_dataset(input_files, vector_files, max_len=500):
+    print('Creating word vectors file')
+
+    training_set_file, test_set_file = input_files
+    train_word_file, test_word_file = vector_files
+    
+    # Load the pickled (facts, answer) pairs; binary mode is required for pickle files
+    with open(training_set_file, 'rb') as f:
+        train_stories = pickle.load(f)
+    with open(test_set_file, 'rb') as f:
+        test_stories = pickle.load(f)
+
+    # Flatten each story's list of fact token lists into a single token list
+    train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in train_stories]
+    test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in test_stories]
+
+    vocab = sorted(reduce(lambda x, y: x | y, (set(story + [answer]) for story, answer in train_stories + test_stories)))
+
+    # Reserve 0 for masking via pad_sequences
+    vocab_size = len(vocab) + 1
+    story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))
+
+
+    print('-')
+    print('Vocab size:', vocab_size, 'unique words')
+    print('Story max length:', story_maxlen, 'words')
+    print('Number of training stories:', len(train_stories))
+    print('Number of test stories:', len(test_stories))
+    print('-')
+    print('Here\'s what a "story" tuple looks like (story tokens, answer):')
+    print(train_stories[0])
+    print('-')
+    print('Vectorizing the word sequences...')
+
+    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
+
+    answer_vocab = sorted(reduce(lambda x, y: x | y, (set([answer]) for _, answer in train_stories + test_stories)))
+    # Reserve 0 for masking via pad_sequences
+    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
+    print('Answers dict len: {0}'.format(len(answer_dict)))
+
+    # TODO: check that this file exists before loading it
+    word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
+    word_vectors_model = Glove.load_stanford(word_vectors_dir)
+
+    inputs_train, answers_train = get_word_vectors(train_stories, answer_dict, 
+                                                   max_len, word_vectors_model)
+    inputs_test, answers_test = get_word_vectors(test_stories, answer_dict, max_len,
+                                                 word_vectors_model)
+
+    with h5py.File(train_word_file,'w') as train_f:
+        _ = train_f.create_dataset('inputs',data=inputs_train)
+        _ = train_f.create_dataset('answers',data=answers_train)
+    with h5py.File(test_word_file,'w') as test_f:
+        _ = test_f.create_dataset('inputs',data=inputs_test)
+        _ = test_f.create_dataset('answers',data=answers_test)
+        
+    return (inputs_train, answers_train),(inputs_test, answers_test)
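+
+# Example usage (a sketch; the pickle and hdf5 paths below are placeholders):
+#
+#   (x_train, y_train), (x_test, y_test) = create_vectors_dataset(
+#       input_files=('data/train_stories.pkl', 'data/test_stories.pkl'),
+#       vector_files=('data/train_vectors.hdf5', 'data/test_vectors.hdf5'),
+#       max_len=500)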
+
+def save_vectors_dict(input_files):
+
+    # TODO: check that this file exists before loading it
+    filename = 'word_vectors/glove.42B.300d.txt'
+    word_vectors_dict = 'word_vectors/glove_dict.hdf5'
+    dct = {}
+    vectors = array.array('d')
+
+    # Read in the data.
+    with io.open(filename, 'r', encoding='utf-8') as savefile:
+        for i, line in enumerate(savefile):
+            tokens = line.split(' ')
+
+            word = tokens[0]
+            entries = tokens[1:]
+
+            dct[word] = i
+            vectors.extend(float(x) for x in entries)
+
+    print('Saving to hdf5 file')
+    # h5py cannot store a Python dict directly, so save the embedding matrix as a
+    # (vocab_size, dim) array together with the words in index order
+    dim = len(vectors) // len(dct)
+    words = sorted(dct, key=dct.get)
+    with h5py.File(word_vectors_dict, 'w') as vector_f:
+        _ = vector_f.create_dataset('vectors',
+                                    data=np.asarray(vectors).reshape(len(dct), dim))
+        _ = vector_f.create_dataset('words',
+                                    data=np.array([w.encode('utf-8') for w in words]))
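+
+# Read-back sketch matching the layout written above (names are assumptions):
+#
+#   with h5py.File('word_vectors/glove_dict.hdf5', 'r') as f:
+#       vectors = f['vectors'][:]
+#       words = [w.decode('utf-8') for w in f['words'][:]]
+#       glove_dict = dict(zip(words, vectors))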
\ No newline at end of file