utils.py
from __future__ import print_function
from functools import reduce
import tarfile
import numpy as np
import re
import pdb
import pickle
import h5py
# io and array are used by save_vectors_dict below
import io
import array
from keras.preprocessing.sequence import pad_sequences
from glove import Glove


def check_repeated(name, repeated_list):
    name = name.lower().strip()
    return name if name not in repeated_list else repeated_list[name]


def process_title(word):
    return re.sub(r'\W+', ' ', word).strip().lower()


def tokenize(sent):
    '''Return the lower-cased tokens of a sentence, including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['bob', 'dropped', 'the', 'apple', '.', 'where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip().lower() for x in re.split(r'(\W+)', sent) if x.strip()]


def parse_stories(lines, only_supporting=False, repeated_list=None):
    '''Parse stories.

    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        spl = line.split(' ', 1)
        if len(spl) > 1:
            nid, line = spl
        else:
            continue
        try:
            nid = int(nid)
        except ValueError:
            # Drop into the debugger if the sentence id is not an integer
            pdb.set_trace()
        if nid == 0:
            story = []
        if '\t' in line:
            supporting, a = line.split('\t')
            a = list(map(process_title, a.split(',')))
            options = [] if len(a) == 1 else list(set(a[1:]))
            a = a[0]
            substory = None
            # Provide all the substories
            if supporting:
                story.append([tokenize(supporting) + [u'.']])
            substory = [x for x in story if x]
            # TODO: the lowercasing should have been done in earlier processing steps
            if not substory:
                continue
            data.append((substory, a.lower(), [x.lower() for x in options]))
        else:
            sent = tokenize(line)
            story.append([sent + [u'.']])
    return data
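
# A sketch of the input line format parse_stories() appears to expect
# (inferred from the parsing above, not from a dataset spec, so treat the
# concrete example as an assumption):
#
#     0 some plot sentence about the movie
#     1 another plot sentence\tcorrect title,distractor title 1,distractor title 2
#
# i.e. "<numeric id> <text>", where answer lines carry a tab followed by a
# comma-separated list whose first entry is the answer and the rest are options.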


def get_stories(f, only_supporting=False, max_length=None, repeated_list=None, min_length=None):
    '''Given a file handle, read the file, retrieve the stories, and then convert the sentences into a single story.

    If max_length is supplied, any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting, repeated_list=repeated_list)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [[flatten(reversed(story)), answer, options] for story, answer, options in data
            if not max_length or len(flatten(story)) < max_length]
    print(len(data))
    # At least two facts
    if min_length:
        data = [x for x in data if len(x[0]) > min_length]
    print(len(data))
    return data
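
# Usage sketch (assumption: the stories live in a UTF-8 text file; the path and
# the length cut-offs are only illustrative). The file must be opened in binary
# mode because parse_stories() decodes each line itself:
#
#     with open('data/train_stories.txt', 'rb') as f:
#         train_data = get_stories(f, max_length=500, min_length=2)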


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # index 0 is reserved for masking, so the one-hot answer vector
        # needs len(word_idx) + 1 entries
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))
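
# Minimal sketch of how vectorize_stories() is typically driven (the variable
# names below are illustrative, not part of this module):
#
#     vocab = sorted(set(w for story, query, answer in data
#                        for w in story + query + [answer]))
#     word_idx = dict((w, i + 1) for i, w in enumerate(vocab))  # 0 is the pad index
#     X, Xq, Y = vectorize_stories(data, word_idx, story_maxlen, query_maxlen)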


def get_spacy_vectors(data, answer_dict, story_maxlen, model):
    X = []
    Y = []
    for story, answer in data:
        story = story[:story_maxlen] if len(story) > story_maxlen else story
        x = [model(str(w)).vector for w in story]
        X.append(x)
        if answer_dict is not None:
            y = np.zeros(len(answer_dict))
            y[answer_dict[answer]] = 1
            Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
            np.array(Y))
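
# Usage sketch (assumption: a spaCy model that ships word vectors, e.g. the
# 'en_core_web_md' package, is installed):
#
#     import spacy
#     nlp = spacy.load('en_core_web_md')
#     X, Y = get_spacy_vectors(stories, answer_dict, story_maxlen=500, model=nlp)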


def get_word_vectors(data, answer_dict, story_maxlen, model):
    X = []
    Y = []
    for story, answer in data:
        story = story[:story_maxlen] if len(story) > story_maxlen else story
        # Words missing from the GloVe dictionary are silently dropped
        x = [model.word_vectors[model.dictionary[w]] for w in story
             if model.dictionary.get(w) is not None]
        X.append(x)
        if answer_dict is not None:
            y = np.zeros(len(answer_dict))
            y[answer_dict[answer]] = 1
            Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
            np.array(Y))


def create_vectors_dataset(input_files, vector_files, max_len=500):
    print('Creating word vectors file')

    training_set_file, test_set_file = input_files
    train_word_file, test_word_file = vector_files

    # Pickled story files must be read in binary mode
    train_stories = pickle.load(open(training_set_file, 'rb'))
    test_stories = pickle.load(open(test_set_file, 'rb'))

    train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in train_stories]
    test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in test_stories]

    vocab = sorted(reduce(lambda x, y: x | y, (set(story + [answer]) for story, answer in train_stories + test_stories)))

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Story max length:', story_maxlen, 'words')
    print('Number of training stories:', len(train_stories))
    print('Number of test stories:', len(test_stories))
    print('-')
    print('Here\'s what a "story" tuple looks like (input, answer):')
    print(train_stories[0])
    print('-')
    print('Vectorizing the word sequences...')

    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    answer_vocab = sorted(reduce(lambda x, y: x | y, (set([answer]) for _, answer in train_stories + test_stories)))
    # Reserve 0 for masking via pad_sequences
    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
    print('Answers dict len: {0}'.format(len(answer_dict)))

    # TODO: check that this file exists before loading it
    word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
    word_vectors_model = Glove.load_stanford(word_vectors_dir)

    inputs_train, answers_train = get_word_vectors(train_stories, answer_dict,
                                                   max_len, word_vectors_model)
    inputs_test, answers_test = get_word_vectors(test_stories, answer_dict, max_len,
                                                 word_vectors_model)

    with h5py.File(train_word_file, 'w') as train_f:
        _ = train_f.create_dataset('inputs', data=inputs_train)
        _ = train_f.create_dataset('answers', data=answers_train)
    with h5py.File(test_word_file, 'w') as test_f:
        _ = test_f.create_dataset('inputs', data=inputs_test)
        _ = test_f.create_dataset('answers', data=answers_test)

    return (inputs_train, answers_train), (inputs_test, answers_test)
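
# Usage sketch (the file names are illustrative; the pickles are the story
# files produced earlier in the pipeline):
#
#     (x_tr, y_tr), (x_te, y_te) = create_vectors_dataset(
#         ['data/train_stories.pickle', 'data/test_stories.pickle'],
#         ['data/train_vectors.hdf5', 'data/test_vectors.hdf5'],
#         max_len=500)
#
# and the saved HDF5 files can be reloaded later with:
#
#     with h5py.File('data/train_vectors.hdf5', 'r') as f:
#         inputs, answers = f['inputs'][:], f['answers'][:]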


def save_vectors_dict(input_files):
    # TODO: check that this file exists before reading it
    filename = 'word_vectors/glove.42B.300d.txt'
    word_vectors_dict = 'word_vectors/glove_dict.hdf5'
    dct = {}
    vectors = array.array('d')

    # Read in the data.
    with io.open(filename, 'r', encoding='utf-8') as savefile:
        for i, line in enumerate(savefile):
            tokens = line.split(' ')

            word = tokens[0]
            entries = tokens[1:]

            dct[word] = i
            vectors.extend(float(x) for x in entries)

    print('Saving to hdf5 file')
    # h5py cannot store a Python dict directly, so write the vectors as a
    # (n_words, dim) float array plus a parallel 'words' dataset (assumption:
    # every line has the same dimensionality and downstream code rebuilds the
    # word -> row mapping from these two datasets)
    dim = len(vectors) // len(dct)
    words = sorted(dct, key=dct.get)
    str_dt = h5py.special_dtype(vlen=str)
    with h5py.File(word_vectors_dict, 'w') as vector_f:
        _ = vector_f.create_dataset(
            'vectors', data=np.asarray(vectors, dtype='float64').reshape(len(dct), dim))
        _ = vector_f.create_dataset('words', data=words, dtype=str_dt)
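
# Sketch of reading the saved GloVe vectors back (assumes the 'vectors'/'words'
# layout written above; depending on the h5py version the words may come back
# as bytes and need decoding):
#
#     with h5py.File('word_vectors/glove_dict.hdf5', 'r') as f:
#         vectors = f['vectors'][:]
#         dct = dict((w, i) for i, w in enumerate(f['words'][:]))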