Diff of /finetune.py [000000] .. [2b4aea]

""" Code for the pretrain/fine-tune baseline networks (RNN, CNN) and their architectures. """
import numpy as np
import sklearn.metrics
import tensorflow as tf
import os, time, shutil, collections

from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers
from tensorflow.contrib.rnn import RNNCell

from tensorflow.python.platform import flags


FLAGS = flags.FLAGS

PADDING_ID = 1016
WORDS_NUM = 1017
MASK_ARRAY = [[1.]] * PADDING_ID + [[0.]] + [[1.]] * (WORDS_NUM - PADDING_ID - 1)

SUMMARY_INTERVAL = 100
SAVE_INTERVAL = 1000
PRINT_INTERVAL = 100
TEST_PRINT_INTERVAL = PRINT_INTERVAL * 5

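# Illustrative note (not in the original training scripts): MASK_ARRAY has one
# row per vocabulary entry, all ones except the PADDING_ID row, so looking it up
# alongside the embedding table and multiplying zeroes out padded codes, e.g.
#
#     mask = tf.constant(MASK_ARRAY)                    # shape (WORDS_NUM, 1)
#     m = tf.nn.embedding_lookup(mask, [3, PADDING_ID])
#     # m evaluates to [[1.], [0.]] -- the padding position contributes nothing.

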
class BaseModel(object):
    """
    Base Model for basic networks with sequential data, i.e., RNN, CNN.
    """
    def __init__(self):
        self.regularizers = []
        self.regularization = 0.01
        self.isReg = True


    def evaluate(self, data, labels, sess=None, prefix="metatest_"):
        """
        Runs one evaluation against the full epoch of data.
        Returns the AUC, accuracy, loss and predictions.
        Batch evaluation saves memory and enables this to run on smaller GPUs.
        sess: the session in which the model has been trained.
        """
        t_process, t_wall = time.process_time(), time.time()
        predictions, loss = self.predict(data, labels, sess)

        fpr, tpr, _ = sklearn.metrics.roc_curve(labels, predictions)
        auc = 100 * sklearn.metrics.auc(fpr, tpr)
        ncorrects = int(np.sum(predictions == labels))
        accuracy = 100 * sklearn.metrics.accuracy_score(labels, predictions)
        string = 'auc: {:.2f}, accuracy: {:.2f} ({:d} / {:d}), loss: {:.2e}'.format(auc, accuracy, ncorrects, len(labels), loss)

        if sess is None:
            string += '\ntime: {:.0f}s (wall {:.0f}s)'.format(time.process_time()-t_process, time.time()-t_wall)
        # return string, auc, loss, predictions
        return string, auc, accuracy, loss, predictions

    def fit(self, X_tr, y_tr, X_vl, y_vl):

        t_process, t_wall = time.process_time(), time.time()
        sess = tf.Session(graph=self.graph)
        shutil.rmtree(self._get_path('summaries'), ignore_errors=True)
        writer = tf.summary.FileWriter(self._get_path('summaries'), self.graph)
        shutil.rmtree(self._get_path('checkpoints'), ignore_errors=True)
        os.makedirs(self._get_path('checkpoints'))
        path = os.path.join(self._get_path('checkpoints'), 'model')
        sess.run(self.op_init)

        # Training.
        count = 0
        bad_counter = 0
        accuracies = []
        aucs = []
        losses = []
        indices = collections.deque()
        num_steps = int(self.num_epochs * X_tr.shape[0] / self.batch_size)
        estop = False  # early stop
        if type(X_vl) is not np.ndarray:
            X_vl = X_vl.toarray()

        for step in range(1, num_steps+1):

            # Be sure to have used all the samples before using one a second time.
            if len(indices) < self.batch_size:
                indices.extend(np.random.permutation(X_tr.shape[0]))
            idx = [indices.popleft() for i in range(self.batch_size)]
            count += len(idx)

            batch_data, batch_labels = X_tr[idx, :, :], y_tr[idx]
            if type(batch_data) is not np.ndarray:
                batch_data = batch_data.toarray()  # convert sparse matrices
            feed_dict = {self.ph_data: batch_data, self.ph_labels: batch_labels, self.ph_dropout: self.dropout, self.ph_training: True}

            learning_rate, loss_average = sess.run([self.op_train, self.op_loss_average], feed_dict)

            # Periodical evaluation of the model.
            if step % self.eval_frequency == 0 or step == num_steps:
                print('Seen samples: %d' % count)
                epoch = step * self.batch_size / X_tr.shape[0]
                print('step {} / {} (epoch {:.2f} / {}):'.format(step, num_steps, epoch, self.num_epochs))
                print('  learning_rate = {:.2e}, loss_average = {:.2e}'.format(learning_rate, loss_average))
                string, auc, accuracy, loss, predictions = self.evaluate(X_vl, y_vl, sess)
                aucs.append(auc)
                accuracies.append(accuracy)
                losses.append(loss)
                print('  validation {}'.format(string))
                print('  time: {:.0f}s (wall {:.0f}s)'.format(time.process_time()-t_process, time.time()-t_wall))

                # Summaries for TensorBoard.
                summary = tf.Summary()
                summary.ParseFromString(sess.run(self.op_summary, feed_dict))
                summary.value.add(tag='validation/auc', simple_value=auc)
                summary.value.add(tag='validation/loss', simple_value=loss)
                writer.add_summary(summary, step)

                # Save model parameters (for evaluation).
                self.op_saver.save(sess, path, global_step=step)

                # Early stopping on the validation AUC.
                if len(aucs) > (self.patience+5) and auc >= np.array(aucs).max():
                    bad_counter = 0

                if len(aucs) > (self.patience+5) and auc <= np.array(aucs)[:-self.patience].max():
                    bad_counter += 1
                    if bad_counter > self.patience:
                        print('Early Stop!')
                        estop = True
                        break
            if estop:
                break
        print('validation accuracy: peak = {:.2f}, mean = {:.2f}'.format(max(accuracies), np.mean(accuracies[-10:])))
        print('validation auc: peak = {:.2f}, mean = {:.2f}'.format(max(aucs), np.mean(aucs[-10:])))

        # Store weight values for fine-tuning.
        if not self.is_finetune:
            feed_dict = {}
            for k in self.op_weights:
                self.weights_for_init[k] = sess.run([self.op_weights[k]], feed_dict)[0]
                self.weights_for_finetune[k] = sess.run([self.op_weights[k]], feed_dict)[0]

        writer.close()
        sess.close()
        t_step = (time.time() - t_wall) / num_steps
        return sess, aucs, accuracies

    def loss(self, logits):
        # Define loss and optimizer
        with tf.name_scope('cross_entropy'):
            labels = tf.to_int64(self.ph_labels)
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
            cross_entropy = tf.reduce_mean(cross_entropy)
        if self.is_finetune and self.freeze_opt == 'mlp':
            loss = cross_entropy
            # Summaries for TensorBoard.
            tf.summary.scalar('loss/cross_entropy', cross_entropy)
            tf.summary.scalar('loss/total', loss)
            with tf.name_scope('averages'):
                averages = tf.train.ExponentialMovingAverage(0.9)
                op_averages = averages.apply([cross_entropy, loss])
                tf.summary.scalar('loss/avg/cross_entropy', averages.average(cross_entropy))
                tf.summary.scalar('loss/avg/total', averages.average(loss))
                with tf.control_dependencies([op_averages]):
                    loss_average = tf.identity(averages.average(loss), name='control')
        else:
            with tf.name_scope('regularization'):
                regularization = self.regularization
                regularization *= tf.add_n(self.regularizers)
            loss = cross_entropy + regularization

            # Summaries for TensorBoard.
            tf.summary.scalar('loss/cross_entropy', cross_entropy)
            tf.summary.scalar('loss/regularization', regularization)
            tf.summary.scalar('loss/total', loss)

            with tf.name_scope('averages'):
                averages = tf.train.ExponentialMovingAverage(0.9)
                op_averages = averages.apply([cross_entropy, regularization, loss])
                tf.summary.scalar('loss/avg/cross_entropy', averages.average(cross_entropy))
                tf.summary.scalar('loss/avg/regularization', averages.average(regularization))
                tf.summary.scalar('loss/avg/total', averages.average(loss))
                with tf.control_dependencies([op_averages]):
                    loss_average = tf.identity(averages.average(loss), name='control')
        return loss, loss_average

    def predict(self, data, labels=None, sess=None):
        loss = 0
        size = data.shape[0]
        predictions = np.empty(size)
        sess = self._get_session(sess)
        for begin in range(0, size, self.batch_size):
            end = begin + self.batch_size
            end = min([end, size])

            batch_data = np.zeros((self.batch_size, data.shape[1], data.shape[2]))
            tmp_data = data[begin:end, :, :]

            if type(tmp_data) is not np.ndarray:
                tmp_data = tmp_data.toarray()  # convert sparse matrices
            batch_data[:end-begin] = tmp_data
            feed_dict = {self.ph_data: batch_data, self.ph_dropout: 1, self.ph_training: False}

            # Compute loss if labels are given.
            if labels is not None:
                batch_labels = np.zeros(self.batch_size)
                batch_labels[:end-begin] = labels[begin:end]
                feed_dict[self.ph_labels] = batch_labels
                batch_pred, batch_loss = sess.run([self.op_prediction, self.op_loss], feed_dict)
                loss += batch_loss
            else:
                batch_pred = sess.run(self.op_prediction, feed_dict)

            predictions[begin:end] = batch_pred[:end-begin]

        if labels is not None:
            return predictions, loss * self.batch_size / size
        else:
            return predictions

    def training(self, loss, learning_rate, decay_steps, decay_rate=0.95, momentum=0.9):
        """Adds to the loss model the Ops required to generate and apply gradients."""
        with tf.name_scope('training'):
            # Learning rate.
            global_step = tf.Variable(0, name='global_step', trainable=False)
            if decay_rate != 1:
                learning_rate = tf.train.exponential_decay(
                        learning_rate, global_step, decay_steps, decay_rate, staircase=True)
            tf.summary.scalar('learning_rate', learning_rate)
            # Optimizer.
            if momentum == 0:
                optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            else:
                optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
            grads = optimizer.compute_gradients(loss)
            op_gradients = optimizer.apply_gradients(grads, global_step=global_step)
            # Histograms.
            for grad, var in grads:
                if grad is None:
                    print('warning: {} has no gradient'.format(var.op.name))
                else:
                    tf.summary.histogram(var.op.name + '/gradients', grad)
            # The op returns the learning rate.
            with tf.control_dependencies([op_gradients]):
                op_train = tf.identity(learning_rate, name='control')
        return op_train

    # Helper methods.
    def _get_path(self, folder):
        path = '../../models/'
        return os.path.join(path, folder, self.dir_name)

    def _get_session(self, sess=None):
        """Restore parameters if no session given."""
        if sess is None:
            sess = tf.Session(graph=self.graph)
            filename = tf.train.latest_checkpoint(self._get_path('checkpoints'))
            self.op_saver.restore(sess, filename)
        return sess

    def _get_prediction(self, logits):
        """Return the predicted classes."""
        with tf.name_scope('prediction'):
            prediction = tf.argmax(logits, axis=1)
            return prediction

    def weight_variable(self, shape, name='weights'):
        initial = tf.truncated_normal_initializer(0, 0.1)
        var = tf.get_variable(name, shape, tf.float32, initializer=initial)

        if self.isReg:
            self.regularizers.append(tf.nn.l2_loss(var))
        tf.summary.histogram(var.op.name, var)
        return var

    def bias_variable(self, shape, name='bias'):
        initial = tf.constant_initializer(0.1)
        var = tf.get_variable(name, shape, tf.float32, initializer=initial)

        if self.isReg:
            self.regularizers.append(tf.nn.l2_loss(var))
        tf.summary.histogram(var.op.name, var)
        return var

    def build_fc_weights(self, dim_in, weights):
        for i, dim in enumerate(self.dim_hidden):
            dim_out = dim
            weights["fc_W"+str(i)] = self.weight_variable([int(dim_in), dim_out], name="fc_W"+str(i))
            weights["fc_b"+str(i)] = self.bias_variable([dim_out], name="fc_b"+str(i))
            dim_in = dim_out
        return weights

    def fc(self, x, W, b, relu=True):
        """Fully connected layer, optionally followed by a ReLU."""
        x = tf.matmul(x, W) + b
        return tf.nn.relu(x) if relu else x

    def normalize(self, inputs, epsilon=1e-8, scope="ln", reuse=None):
        '''Applies layer normalization.

        Args:
          inputs: A tensor with 2 or more dimensions, where the first dimension has
            `batch_size`.
          epsilon: A small float added to the variance to avoid division by zero.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns:
          A tensor with the same shape and dtype as `inputs`.
        '''
        with tf.variable_scope(scope, reuse=reuse):
            inputs_shape = inputs.get_shape()
            params_shape = inputs_shape[-1:]

            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            beta = tf.Variable(tf.zeros(params_shape))
            gamma = tf.Variable(tf.ones(params_shape))
            normalized = (inputs - mean) / ((variance + epsilon) ** 0.5)
            outputs = gamma * normalized + beta
        return outputs


class RNN(BaseModel):
    """
    Build a vanilla recurrent neural network.
    """
    def __init__(self, data_loader, weights_for_finetune, init_std=0.05, freeze_opt=None, is_finetune=False):
        super().__init__()
        self.is_finetune = is_finetune
        self.freeze_opt = freeze_opt
        print("freeze_opt: ", self.freeze_opt)
        if self.is_finetune:
            self.finetune_weights = weights_for_finetune
            self.learning_rate = 0.00001
            self.batch_size = 128
            self.num_epochs = 30
        else:
            self.learning_rate = 0.5
            self.batch_size = 128
            self.num_epochs = 200

        # training parameters
        self.dir_name = "rnn"
        self.dropout = 1
        self.decay_rate = 0.9
        self.decay_steps = 10000 / self.batch_size
        self.momentum = 0.95
        self.patience = 5
        self.eval_frequency = self.num_epochs

        # Network Parameters
        self.init_std = init_std
        self.n_hidden = 256  # hidden dimension of the embedding
        self.n_hidden_1 = 128
        self.n_hidden_2 = 128
        self.n_words = data_loader.n_words
        self.num_input = data_loader.dim_input
        self.n_classes = FLAGS.n_classes
        self.timesteps = data_loader.timesteps
        self.code_size = data_loader.code_size
        self.dim_hidden = [self.n_hidden_1, self.n_hidden_2, FLAGS.n_classes]

        self.weights_for_init = dict()  # to store the values of learned params
        self.weights_for_finetune = dict()

        self.build_model()

        print('method', self.dir_name, 'data shape:', self.num_input, 'batch size:', self.batch_size, 'learning rate:', self.learning_rate, \
              'momentum:', self.momentum, 'patience:', self.patience)

    # Methods to construct the computational graph
    def build_model(self):
        """Build the computational graph of the model."""
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Inputs.
            with tf.name_scope('inputs'):
                # tf Graph input
                self.ph_data = tf.placeholder(tf.int32, (self.batch_size, self.timesteps, self.code_size), 'data')
                self.ph_labels = tf.placeholder(tf.int32, (self.batch_size), 'labels')
                self.ph_dropout = tf.placeholder(tf.float32, (), 'dropout')
                self.ph_training = tf.placeholder(tf.bool, name='trainingFlag')

            # Construct model
            op_logits = self._inference(self.ph_data, self.ph_dropout, self.ph_training)
            self.op_loss, self.op_loss_average = self.loss(op_logits)
            self.op_train = self.training(self.op_loss, self.learning_rate,
                    self.decay_steps, self.decay_rate, self.momentum)
            self.op_prediction = self._get_prediction(op_logits)

            # Initialize variables, i.e. weights and biases.
            self.op_init = tf.global_variables_initializer()
            if not self.is_finetune:
                self.op_weights = self.get_op_variables()
            else:
                print(tf.trainable_variables())

            # Summaries for TensorBoard and Saver for model parameters.
            self.op_summary = tf.summary.merge_all()
            self.op_saver = tf.train.Saver(max_to_keep=5)
        self.graph.finalize()

    def get_op_variables(self):
        op_weights = dict()
        op_var = tf.trainable_variables()

        # embedding
        op_weights["emb_W"] = [v for v in op_var if "emb_W" in v.name][0]
        # lstm
        op_weights["lstm_W_xh"] = [v for v in op_var if "lstm_W_xh" in v.name][0]
        op_weights["lstm_W_hh"] = [v for v in op_var if "lstm_W_hh" in v.name][0]
        op_weights["lstm_b"] = [v for v in op_var if "lstm_b" in v.name][0]
        # fully connected
        for i, dim in enumerate(self.dim_hidden):
            op_weights["fc_W"+str(i)] = [v for v in op_var if "fc_W"+str(i) in v.name][0]
            op_weights["fc_b"+str(i)] = [v for v in op_var if "fc_b"+str(i) in v.name][0]
        print('show variable')
        print(op_var)
        return op_weights

    def build_emb_weights(self, weights):
        weights["emb_W"] = tf.Variable(tf.random_normal([self.n_words, self.n_hidden], stddev=self.init_std), name="emb_W")
        weights["emb_mask_W"] = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False)
        return weights

    def embedding(self, x, Wemb, Wemb_mask):
        _x = tf.nn.embedding_lookup(Wemb, x)  # shape (batch_size, timesteps, code_size, n_hidden)
        _x_mask = tf.nn.embedding_lookup(Wemb_mask, x)  # 1 for real codes, 0 for PADDING_ID
        emb_vecs = tf.multiply(_x, _x_mask)  # broadcast the mask over the embedding dimension
        emb_vecs = tf.reduce_sum(emb_vecs, 2)  # sum the code embeddings within each visit
        return emb_vecs

    def lstm_identity_initializer(self, scale):
        def _initializer(shape, dtype=tf.float32, partition_info=None):
            """The four LSTM gate blocks are packed into one matrix, so initialize block by block."""
            size = shape[0]
            # candidate block (j) is a scaled identity; the other blocks are orthogonal
            t = np.zeros(shape)
            t[:, size:size * 2] = np.identity(size) * scale
            t[:, :size] = self.orthogonal([size, size])
            t[:, size * 2:size * 3] = self.orthogonal([size, size])
            t[:, size * 3:] = self.orthogonal([size, size])
            return tf.constant(t, dtype=dtype)
        return _initializer

    def orthogonal_initializer(self):
        def _initializer(shape, dtype=tf.float32, partition_info=None):
            return tf.constant(self.orthogonal(shape), dtype)
        return _initializer

    def orthogonal(self, shape):
        flat_shape = (shape[0], np.prod(shape[1:]))
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v
        return q.reshape(shape)

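    # Illustrative check (added for clarity, not in the original file): for a square
    # shape, `orthogonal` returns a matrix with orthonormal columns, which is what
    # makes it useful for initializing the LSTM weight blocks, e.g.
    #     q = self.orthogonal([256, 256])
    #     np.allclose(q.T.dot(q), np.eye(256))  # -> True (up to float tolerance)
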
    def build_lstm_weights(self, weights):
        #
        # # Keep W_xh and W_hh separate here as well to reuse initialization methods
        # with tf.variable_scope(scope or type(self).__name__):
        weights["lstm_W_xh"] = tf.get_variable('lstm_W_xh', [self.n_hidden, 4 * self.n_hidden],
                               initializer=self.orthogonal_initializer())
        weights["lstm_W_hh"] = tf.get_variable('lstm_W_hh', [self.n_hidden, 4 * self.n_hidden],
                               initializer=self.lstm_identity_initializer(0.95),)
        weights["lstm_b"] = tf.get_variable('lstm_b', [4 * self.n_hidden])
        return weights

    # Create model
    def _inference(self, x, dropout, is_training=True):
        with tf.variable_scope('pretrain_model', reuse=None) as training_scope:
            if self.freeze_opt is None:
                weights = {}
                weights = self.build_emb_weights(weights)
                weights = self.build_lstm_weights(weights)
                weights = self.build_fc_weights(self.n_hidden, weights)

                # embedding
                with tf.variable_scope("embedding"):
                    xemb = self.embedding(x, weights["emb_W"], weights["emb_mask_W"])

                # recurrent neural network
                with tf.variable_scope("rnn"):
                    lstm_cell = LSTMCell(self.n_hidden, weights["lstm_W_xh"], weights["lstm_W_hh"], weights["lstm_b"])
                    # lstm_cell = LSTMCell(self.n_hidden)
                    xemb = tf.unstack(xemb, self.timesteps, 1)

                    # initial state (c, h)
                    W_state_c = tf.random_normal([self.batch_size, self.n_hidden], stddev=0.1)
                    W_state_h = tf.random_normal([self.batch_size, self.n_hidden], stddev=0.1)
                    outputs, state = tf.nn.static_rnn(lstm_cell, xemb, initial_state=(W_state_c, W_state_h), dtype=tf.float32)
                    _, hout = state

                with tf.variable_scope("dropout"):
                    h_ = layers.dropout(hout, keep_prob=dropout)

                for i, dim in enumerate(self.dim_hidden[:-1]):
                    h_ = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)])
                    h_ = tf.nn.dropout(h_, dropout)

                # Logits linear layer, i.e. softmax without normalization.
                N, Min = h_.get_shape()
                i = len(self.dim_hidden)-1
                logits = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)], relu=False)

            else:
                with tf.variable_scope("embedding"):
                    Wemb = self.finetune_weights["emb_W"]
                    Wemb_mask = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False)
                    xemb = self.embedding(x, Wemb, Wemb_mask)

                # recurrent neural network, built from the pretrained weights
                with tf.variable_scope("rnn"):
                    lstm_cell = LSTMCell(self.n_hidden, self.finetune_weights["lstm_W_xh"], self.finetune_weights["lstm_W_hh"], self.finetune_weights["lstm_b"])
                    xemb = tf.unstack(xemb, self.timesteps, 1)
                    W_state_c = tf.random_normal([self.batch_size, self.n_hidden], stddev=0.1)
                    W_state_h = tf.random_normal([self.batch_size, self.n_hidden], stddev=0.1)
                    outputs, state = tf.nn.static_rnn(lstm_cell, xemb, initial_state=(W_state_c, W_state_h), dtype=tf.float32)
                    _, hout = state

                with tf.variable_scope("dropout"):
                    h_ = layers.dropout(hout, keep_prob=dropout)

                for i, dim in enumerate(self.dim_hidden[:-1]):
                    Wfc = self.finetune_weights["fc_W"+str(i)]
                    bfc = self.finetune_weights["fc_b"+str(i)]
                    h_ = self.fc(h_, Wfc, bfc)
                    h_ = tf.nn.dropout(h_, dropout)

                # Fine-tune only the last layer: build a fresh output layer.
                i = len(self.dim_hidden)-1
                weights = {}
                dim_in = self.n_hidden_2
                weights["fc_W"+str(i)] = self.weight_variable([int(dim_in), FLAGS.n_classes], name="fc_W"+str(i))
                weights["fc_b"+str(i)] = self.bias_variable([FLAGS.n_classes], name="fc_b"+str(i))

                # Logits linear layer, i.e. softmax without normalization.
                N, Min = h_.get_shape()
                i = len(self.dim_hidden)-1
                logits = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)], relu=False)
        return logits


class LSTMCell(RNNCell):
    '''Vanilla LSTM implemented with the same initializations as BN-LSTM.'''
    def __init__(self, num_units, W_xh, W_hh, bias):
        self.num_units = num_units
        self.W_xh = W_xh
        self.W_hh = W_hh
        self.bias = bias

    @property
    def state_size(self):
        return (self.num_units, self.num_units)

    @property
    def output_size(self):
        return self.num_units

    def __call__(self, x, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE):
            c, h = state

            # hidden = tf.matmul(x, W_xh) + tf.matmul(h, W_hh) + bias
            # improve speed by concat.
            concat = tf.concat([x, h], 1)
            W_both = tf.concat([self.W_xh, self.W_hh], 0)
            hidden = tf.matmul(concat, W_both) + self.bias

            i, j, f, o = tf.split(hidden, 4, axis=1)

            new_c = c * tf.sigmoid(f) + tf.sigmoid(i) * tf.tanh(j)
            new_h = tf.tanh(new_c) * tf.sigmoid(o)

            return new_h, (new_c, new_h)

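# Note (added for clarity, not in the original file): the split in LSTMCell.__call__
# yields the input (i), candidate (j), forget (f) and output (o) blocks in that
# order, which is the same column layout assumed by RNN.lstm_identity_initializer,
# where the second block (j) is initialized to a scaled identity.

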
class CNN(BaseModel):
    """
    Build a convolutional neural network.
    """
    def __init__(self, data_loader, weights_for_finetune, init_std=0.05, freeze_opt=None, is_finetune=False):
        super().__init__()
        self.is_finetune = is_finetune
        self.freeze_opt = freeze_opt
        print("freeze_opt: ", self.freeze_opt)
        if self.is_finetune:
            self.finetune_weights = weights_for_finetune
            self.learning_rate = 0.00001
            self.batch_size = 64
            self.num_epochs = 30
        else:
            self.learning_rate = 0.1
            self.batch_size = 128
            self.num_epochs = 200

        # training parameters
        self.dir_name = "cnn"

        self.dropout = 0.6
        self.decay_rate = 0.9
        self.decay_steps = 10000 / self.batch_size
        self.momentum = 0.95
        self.patience = 10
        self.eval_frequency = self.num_epochs

        # Network Parameters
        self.init_std = init_std
        self.n_hidden = 256  # hidden dimension of the embedding
        self.n_hidden_1 = 128
        self.n_hidden_2 = 128
        self.n_words = data_loader.n_words
        self.n_classes = FLAGS.n_classes
        self.n_filters = 128
        self.num_input = data_loader.dim_input
        self.timesteps = data_loader.timesteps
        self.code_size = data_loader.code_size
        self.dim_hidden = [self.n_hidden_1, self.n_hidden_2, FLAGS.n_classes]
        self.filter_sizes = [3, 4, 5]

        self.weights_for_init = dict()  # to store the values of learned params
        self.weights_for_finetune = dict()

        print('method', self.dir_name, 'data shape:', self.num_input, 'batch size:', self.batch_size, 'learning rate:', self.learning_rate, \
              'momentum:', self.momentum, 'patience:', self.patience)
        self.build_model()

    # Methods to construct the computational graph
    def build_model(self):
        """Build the computational graph of the model."""
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Inputs.
            with tf.name_scope('inputs'):
                # tf Graph input
                self.ph_data = tf.placeholder(tf.int32, (self.batch_size, self.timesteps, self.code_size), 'data')
                self.ph_labels = tf.placeholder(tf.int32, (self.batch_size), 'labels')
                self.ph_dropout = tf.placeholder(tf.float32, (), 'dropout')
                self.ph_training = tf.placeholder(tf.bool, name='trainingFlag')

            # Construct model
            op_logits = self._inference(self.ph_data, self.ph_dropout, self.ph_training)
            self.op_loss, self.op_loss_average = self.loss(op_logits)
            self.op_train = self.training(self.op_loss, self.learning_rate,
                    self.decay_steps, self.decay_rate, self.momentum)
            self.op_prediction = self._get_prediction(op_logits)

            # Initialize variables, i.e. weights and biases.
            self.op_init = tf.global_variables_initializer()
            if not self.is_finetune:
                self.op_weights = self.get_op_variables()
            else:
                print(tf.trainable_variables())

            # Summaries for TensorBoard and Saver for model parameters.
            self.op_summary = tf.summary.merge_all()
            self.op_saver = tf.train.Saver(max_to_keep=5)
        self.graph.finalize()

    def get_op_variables(self):
        op_weights = dict()
        op_var = tf.trainable_variables()
        # embedding
        op_weights["emb_W"] = [v for v in op_var if "emb_W" in v.name][0]
        # cnn
        for i, filter_size in enumerate(self.filter_sizes):
            op_weights["conv_W"+str(filter_size)] = [v for v in op_var if "conv_W"+str(filter_size) in v.name][0]
            op_weights["conv_b"+str(filter_size)] = [v for v in op_var if "conv_b"+str(filter_size) in v.name][0]
        # fully connected
        for i, dim in enumerate(self.dim_hidden):
            op_weights["fc_W"+str(i)] = [v for v in op_var if "fc_W"+str(i) in v.name][0]
            op_weights["fc_b"+str(i)] = [v for v in op_var if "fc_b"+str(i) in v.name][0]
        return op_weights

    def build_emb_weights(self, weights):
        weights["emb_W"] = tf.Variable(tf.random_normal([self.n_words, self.n_hidden], stddev=self.init_std), name="emb_W")
        weights["emb_mask_W"] = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False)
        return weights

    def embedding(self, x, Wemb, Wemb_mask):
        _x = tf.nn.embedding_lookup(Wemb, x)  # shape (batch_size, timesteps, code_size, n_hidden)
        _x_mask = tf.nn.embedding_lookup(Wemb_mask, x)  # 1 for real codes, 0 for PADDING_ID
        emb_vecs = tf.multiply(_x, _x_mask)  # broadcast the mask over the embedding dimension
        emb_vecs = tf.reduce_sum(emb_vecs, 2)  # sum the code embeddings within each visit
        self.emb_expanded = tf.expand_dims(emb_vecs, -1)  # add a channel dimension for conv2d
        return emb_vecs

    def build_conv_weights(self, weights):
        for i, filter_size in enumerate(self.filter_sizes):
            filter_shape = [filter_size, self.n_hidden, 1, self.n_filters]
            weights["conv_W"+str(filter_size)] = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="conv_W"+str(filter_size))
            weights["conv_b"+str(filter_size)] = tf.Variable(tf.constant(0.1, shape=[self.n_filters]), name="conv_b"+str(filter_size))
        return weights

    def conv(self, weights, is_training):
        '''Create a convolution + maxpool layer for each filter size.'''
        pooled_outputs = []
        for i, filter_size in enumerate(self.filter_sizes):
            W = weights["conv_W"+str(filter_size)]
            b = weights["conv_b"+str(filter_size)]
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                conv_ = tf.nn.conv2d(
                    self.emb_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.leaky_relu(tf.nn.bias_add(conv_, b), name="relu")
                # h = layers.batch_norm(h, updates_collections=None,
                #                          decay=0.99,
                #                          scale=True, center=True,
                #                          is_training=is_training)
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.timesteps - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = self.n_filters * len(self.filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
        return h_pool_flat

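    # Shape note (added for clarity, not in the original file): with n_filters = 128
    # and filter_sizes = [3, 4, 5], each branch max-pools over time to a single
    # 128-dim vector, so `conv` returns a (batch_size, 384) tensor; this matches the
    # first fully connected layer, which is built with
    # dim_in = n_filters * len(filter_sizes) in `_inference`.
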
    # Create model
    def _inference(self, x, dropout, is_training=True):
        with tf.variable_scope('pretrain_model', reuse=None) as training_scope:
            weights = {}
            if self.freeze_opt is None:
                weights = self.build_emb_weights(weights)
                weights = self.build_conv_weights(weights)
                weights = self.build_fc_weights(self.n_filters * len(self.filter_sizes), weights)

                with tf.variable_scope("embedding"):
                    self.embedding(x, weights["emb_W"], weights["emb_mask_W"])

                # convolutional network
                with tf.variable_scope("conv"):
                    hout = self.conv(weights, is_training)

                with tf.variable_scope("dropout"):
                    h_ = layers.dropout(hout, keep_prob=dropout)

                for i, dim in enumerate(self.dim_hidden[:-1]):
                    h_ = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)])
                    h_ = tf.nn.dropout(h_, dropout)

                # Logits linear layer, i.e. softmax without normalization.
                N, Min = h_.get_shape()
                i = len(self.dim_hidden)-1
                logits = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)], relu=False)

            else:
                with tf.variable_scope("embedding"):
                    Wemb = self.finetune_weights["emb_W"]
                    Wemb_mask = tf.get_variable("mask_padding", initializer=MASK_ARRAY, dtype="float32", trainable=False)
                    self.embedding(x, Wemb, Wemb_mask)

                # convolutional network, built from the pretrained weights
                with tf.variable_scope("conv"):
                    # w = {}
                    # for i, filter_size in enumerate(self.filter_sizes):
                    #     w["conv_W"+str(filter_size)] = self.finetune_weights["conv_W"+str(filter_size)]
                    #     w["conv_b"+str(filter_size)] = self.finetune_weights["conv_b"+str(filter_size)]
                    hout = self.conv(self.finetune_weights, is_training)

                with tf.variable_scope("dropout"):
                    h_ = layers.dropout(hout, keep_prob=dropout)

                for i, dim in enumerate(self.dim_hidden[:-1]):
                    Wfc = self.finetune_weights["fc_W"+str(i)]
                    bfc = self.finetune_weights["fc_b"+str(i)]
                    h_ = self.fc(h_, Wfc, bfc)
                    h_ = tf.nn.dropout(h_, dropout)

                # Fine-tune only the last layer: build a fresh output layer.
                i = len(self.dim_hidden)-1
                weights = {}
                dim_in = self.n_hidden_2
                weights["fc_W"+str(i)] = self.weight_variable([int(dim_in), FLAGS.n_classes], name="fc_W"+str(i))
                weights["fc_b"+str(i)] = self.bias_variable([FLAGS.n_classes], name="fc_b"+str(i))

                # Logits linear layer, i.e. softmax without normalization.
                N, Min = h_.get_shape()
                i = len(self.dim_hidden)-1
                logits = self.fc(h_, weights["fc_W"+str(i)], weights["fc_b"+str(i)], relu=False)

        return logits
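

# ---------------------------------------------------------------------------
# Illustrative usage sketch (added for documentation; not part of the original
# training pipeline). It assumes a data loader exposing n_words, dim_input,
# timesteps and code_size, and uses a tiny random dataset in place of the real
# visit sequences; the real scripts define the `n_classes` flag and the loader.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    flags.DEFINE_integer('n_classes', 2, 'number of output classes (normally set by the training script)')

    Loader = collections.namedtuple('Loader', ['n_words', 'dim_input', 'timesteps', 'code_size'])
    loader = Loader(n_words=WORDS_NUM, dim_input=WORDS_NUM, timesteps=21, code_size=30)

    # Random integer-coded visits and binary labels, just to exercise the graph.
    X = np.random.randint(0, WORDS_NUM, size=(256, loader.timesteps, loader.code_size))
    y = np.random.randint(0, 2, size=256)
    X_tr, y_tr, X_vl, y_vl = X[:192], y[:192], X[192:], y[192:]

    # 1) Pretrain from scratch; `fit` caches the learned weights in
    #    `weights_for_finetune` because is_finetune is False.
    pretrain_net = RNN(loader, weights_for_finetune=None, is_finetune=False)
    pretrain_net.fit(X_tr, y_tr, X_vl, y_vl)

    # 2) Fine-tune: reuse the pretrained weights and retrain only the last
    #    fully connected layer (freeze_opt='mlp').
    finetune_net = RNN(loader, pretrain_net.weights_for_finetune,
                       freeze_opt='mlp', is_finetune=True)
    finetune_net.fit(X_tr, y_tr, X_vl, y_vl)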