[d2c46b]: / diff_sex / yellowfin.py

Download this file

472 lines (409 with data), 19.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
"""
YellowFin optimizer.
YellowFin and the Art of Momentum Tuning
https://arxiv.org/abs/1706.03471
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
# EPS for numerical stability: it is added to denominators and to the
# arguments of logarithms below so they never see an exact zero.
EPS = 1e-6
# Stand-in for "no limit": the initial value of the adaptive clipping
# variables and the clip threshold used when clipping must not take effect.
LARGE_FLOAT_VAL = 1e15
class YFOptimizer(object):
"""
Optimizer that implements the YellowFin algorithm.
Implemented as a wrapper around tf.train.MomentumOptimizer: the learning
rate and momentum given to the wrapped optimizer are TensorFlow variables,
and the op returned by `apply_gradients` re-assigns them from running
gradient statistics before the momentum update is applied.
"""
# Available gate_gradients values, re-exported from tf.train.Optimizer and
# used as defaults by `compute_gradients` and `minimize`.
GATE_NONE = tf.train.Optimizer.GATE_NONE
GATE_OP = tf.train.Optimizer.GATE_OP
GATE_GRAPH = tf.train.Optimizer.GATE_GRAPH
def __init__(self, learning_rate=0.0001, momentum=0.0, clip_thresh=None,
             beta=0.999, curv_win_width=20, zero_debias=True, delta_mu=0.0,
             sparsity_debias=False, use_locking=False, name="YellowFin",
             use_nesterov=False, use_unsmoothed_lr_mu=True,
             h_max_log_smooth=True, h_min_log_smooth=True,
             use_adapt_grad_clip=True, stat_protect_fac=100.0):
    """
    Create the tuner state and the wrapped MomentumOptimizer.

    Args:
      learning_rate: Python scalar. Initial value of the learning rate
        variable (the original notes say 1.0 was used in the paper).
      momentum: Python scalar. Initial value of the momentum variable
        (the original notes say 0.0 was used in the paper).
      clip_thresh: Python scalar or None. When given, `apply_gradients`
        clips the gradients with `tf.clip_by_global_norm` at this
        threshold; None disables this manual clipping.
      beta: Python scalar. Decay of the exponential moving averages that
        smooth every running statistic (and lr/mu when smoothing is on).
      curv_win_width: Python int. Length of the sliding window of recent
        squared gradient norms scanned by `curvature_range`.
      zero_debias: Python boolean, forwarded to
        `tf.train.ExponentialMovingAverage`.
      delta_mu: placeholder/variable/Python scalar added to the momentum
        variable before it is handed to the MomentumOptimizer. The
        original notes mention asynchronous-parallel training as a use;
        the default 0.0 is for basic usage.
      sparsity_debias: Python boolean. When True, the curvature range,
        gradient variance and distance estimates are rescaled with the
        running fraction of non-zero gradient entries. The original notes
        recommend it for very sparse models and say switching it off can
        be slightly faster for dense ones.
      use_locking: forwarded to the MomentumOptimizer.
      name: forwarded to the MomentumOptimizer as its name.
      use_nesterov: forwarded to the MomentumOptimizer.
      use_unsmoothed_lr_mu: If True, freshly computed lr and mu are written
        to their variables as is; if False they are first blended with the
        previous values using `beta`.
      h_max_log_smooth: If True, the window maximum is averaged in log
        space and exponentiated afterwards.
      h_min_log_smooth: Same as `h_max_log_smooth` for the window minimum.
      use_adapt_grad_clip: If True, `apply_gradients` also clips gradients
        with thresholds derived from the curvature maximum.
      stat_protect_fac: Python scalar. Factor `f` in the pre-statistics
        clip threshold `sqrt(f * thresh**2)`.

    Other features:
      `self.lr_factor` is a public variable multiplied into the internal
      learning rate, so callers can layer manual tuning or a decay scheme
      on top. An example is referenced by the original authors at
      https://github.com/JianGoForIt/YellowFin/blob/master/char-rnn-tensorflow/train_YF.py#L140
    """
    def _frozen_float(initial, var_name=None):
        # Non-trainable float32 variable; var_name=None lets TF pick a name.
        return tf.Variable(
            initial, dtype=tf.float32, name=var_name, trainable=False)

    # Plain Python configuration, no graph objects involved.
    self._beta = beta
    self._zero_debias = zero_debias
    self._sparsity_debias = sparsity_debias
    self._curv_win_width = curv_win_width
    self._use_unsmoothed_lr_mu = use_unsmoothed_lr_mu
    self._h_max_log_smooth = h_max_log_smooth
    self._h_min_log_smooth = h_min_log_smooth
    self._use_adapt_grad_clip = use_adapt_grad_clip
    # Guards the statistics against exploding gradients.
    self._stat_protect_fac = stat_protect_fac

    # Filled in later by apply_gradients / before_apply / curvature_range.
    self._moving_averager = None
    self._tvars = None
    self._curv_win = None

    # Graph objects follow, created in a fixed order.
    self._lr = learning_rate
    self._mu = momentum
    self._lr_var = _frozen_float(learning_rate, "YF_lr")
    self._mu_var = _frozen_float(momentum, "YF_mu")
    # Hook for step or decay schemes applied on top of the tuned rate.
    self.lr_factor = _frozen_float(1.0, "YF_lr_factor")
    self._clip_thresh_var = (
        None if clip_thresh is None
        else _frozen_float(clip_thresh, "YF_clip_thresh"))

    # The wrapped optimizer reads lr and mu through these tensors, so
    # assigning to the variables changes its behaviour on the next step.
    effective_lr = self._lr_var * self.lr_factor
    effective_mu = self._mu_var + delta_mu
    self._optimizer = tf.train.MomentumOptimizer(
        effective_lr, effective_mu, use_locking, name, use_nesterov)

    # Internal step counter; tuning is active once it is above zero.
    self._global_step = tf.Variable(0, trainable=False)
    self._do_tune = tf.greater(self._global_step, tf.constant(0))

    # Adaptive clipping state starts out as "no limit".
    self._adapt_grad_clip_thresh = _frozen_float(LARGE_FLOAT_VAL)
    self._adapt_grad_clip_target_val = _frozen_float(LARGE_FLOAT_VAL)
def curvature_range(self):
    """
    Track the smoothed minimum and maximum of recent squared gradient norms.

    Writes the current squared gradient norm into a circular window, takes
    the min and max over the filled part of that window, and smooths both
    with the shared moving averager (in log space when the corresponding
    `h_*_log_smooth` flag is set). The results are stored in `self._h_min`
    and `self._h_max`.

    Returns:
      A one-element list holding the moving-average update op.
    """
    width = self._curv_win_width
    # Circular buffer of the last `width` squared gradient norms.
    self._curv_win = tf.Variable(
        np.zeros([width, ]), dtype=tf.float32,
        name="curv_win", trainable=False)
    slot = self._global_step % width
    newest = self._grad_norm_squared + EPS
    self._curv_win = tf.scatter_update(self._curv_win, slot, newest)

    # Steps are counted from 0, so min(width, step + 1) slots are filled.
    begin = tf.constant([0, ])
    filled = tf.minimum(tf.constant(width), self._global_step + 1)
    valid_window = tf.slice(
        self._curv_win, begin, tf.expand_dims(filled, dim=0))

    window_min = tf.reduce_min(valid_window)
    self._h_min_t = (tf.log(window_min + EPS)
                     if self._h_min_log_smooth else window_min)
    window_max = tf.reduce_max(valid_window)
    self._h_max_t = (tf.log(window_max + EPS)
                     if self._h_max_log_smooth else window_max)

    extremes = [self._h_min_t, self._h_max_t]
    with tf.control_dependencies(extremes):
        avg_op = self._moving_averager.apply(extremes)
        with tf.control_dependencies([avg_op]):
            smoothed_min = tf.identity(
                self._moving_averager.average(self._h_min_t))
            # Undo the log when the average was taken in log space.
            self._h_min = (tf.exp(smoothed_min)
                           if self._h_min_log_smooth else smoothed_min)
            smoothed_max = tf.identity(
                self._moving_averager.average(self._h_max_t))
            self._h_max = (tf.exp(smoothed_max)
                           if self._h_max_log_smooth else smoothed_max)
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
    return [avg_op]
def grad_variance(self):
    """
    Estimate the gradient variance from running first and second moments.

    Keeps a moving average of every gradient (IndexedSlices are first
    summed into a tensor shaped like their variable) and stores in
    `self._grad_var` the averaged squared norm minus the squared norm of
    the averaged gradient, floored at EPS.

    Returns:
      A one-element list holding the moving-average update op.
    """
    dense_grads = []
    for var, grad in zip(self._tvars, self._grads):
        if not isinstance(grad, ops.IndexedSlices):
            dense_grads.append(grad)
            continue
        # Sum rows that share an index, then match the variable's shape.
        summed = tf.unsorted_segment_sum(
            grad.values, grad.indices, grad.dense_shape[0])
        dense_grads.append(tf.reshape(summed, shape=var.get_shape()))

    avg_op = self._moving_averager.apply(dense_grads)
    with tf.control_dependencies([avg_op]):
        self._grad_avg = [
            self._moving_averager.average(grad) for grad in dense_grads]
        self._grad_avg_squared = [tf.square(avg) for avg in self._grad_avg]

    floor = tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype)
    mean_norm_squared = tf.add_n(
        [tf.reduce_sum(sq) for sq in self._grad_avg_squared])
    self._grad_var = tf.maximum(
        floor, self._grad_norm_squared_avg - mean_norm_squared)
    if self._sparsity_debias:
        self._grad_var *= self._sparsity_avg
    return [avg_op]
def dist_to_opt(self):
    """
    Estimate the distance to the optimum from gradient norm statistics.

    Smooths the gradient norm, divides it by the smoothed squared norm to
    get a per-step distance estimate, and smooths that estimate again into
    `self._dist_to_opt_avg`.

    Returns:
      A list with the two moving-average update ops, in creation order.
    """
    averager = self._moving_averager

    # Running average of the gradient norm.
    self._grad_norm = tf.sqrt(self._grad_norm_squared)
    norm_avg_op = averager.apply([self._grad_norm, ])
    with tf.control_dependencies([norm_avg_op]):
        self._grad_norm_avg = averager.average(self._grad_norm)
        # Single-iteration distance estimate.
        self._dist_to_opt = (
            self._grad_norm_avg / (self._grad_norm_squared_avg + EPS))

    # Running average of the distance estimate itself.
    dist_avg_op = averager.apply([self._dist_to_opt])
    with tf.control_dependencies([dist_avg_op]):
        self._dist_to_opt_avg = tf.identity(
            averager.average(self._dist_to_opt))
        if self._sparsity_debias:
            self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
    return [norm_avg_op, dist_avg_op]
def grad_sparsity(self):
    """
    Track the running fraction of non-zero gradient entries.

    A gradient with 10 percent of its entries non-zero has sparsity 0.1.
    The fraction is computed over all gradients together, stored in
    `self._sparsity`, and its moving average in `self._sparsity_avg`.
    The original authors note that the dense full-dataset gradient norm is
    roughly the sparse minibatch norm times sqrt(sparsity), and that an
    extension might correct only the sparse blobs.

    Returns:
      The moving-average update op.
    """
    grads = self._grads
    non_zero_cnt = tf.add_n([tf.count_nonzero(grad) for grad in grads])
    all_entry_cnt = tf.add_n([tf.size(grad) for grad in grads])
    # Both counts are cast to the dtype of the first gradient.
    grad_dtype = grads[0].dtype
    self._sparsity = (tf.cast(non_zero_cnt, grad_dtype)
                      / tf.cast(all_entry_cnt, grad_dtype))
    avg_op = self._moving_averager.apply([self._sparsity, ])
    with tf.control_dependencies([avg_op]):
        self._sparsity_avg = self._moving_averager.average(self._sparsity)
    return avg_op
def before_apply(self):
    """
    Build every statistic the tuner needs before lr and mu are updated.

    Creates the shared moving averager, computes per-variable squared
    gradient norms and their running averages, then chains the curvature
    range, gradient variance and distance-to-optimum estimators behind
    that average.

    Returns:
      A single op grouping all moving-average updates created here.
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0

    # Per-variable g**2, computed next to the variable it belongs to.
    self._grad_squared = []
    for var, grad in zip(self._tvars, self._grads):
        if grad is not None:
            with ops.colocate_with(var):
                self._grad_squared.append(tf.square(grad))
    self._grad_norm_squared = [
        tf.reduce_sum(squared) for squared in self._grad_squared]

    stat_ops = []
    if self._sparsity_debias:
        stat_ops.append(self.grad_sparsity())

    # This running average of the squared gradient norm is shared by
    # `grad_variance` and `dist_to_opt`.
    norm_avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([norm_avg_op]):
        per_var_avgs = [self._moving_averager.average(norm)
                        for norm in self._grad_norm_squared]
        # Collapse the per-variable values into global scalars.
        self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
        self._grad_norm_squared_avg = tf.add_n(per_var_avgs)
    stat_ops.append(norm_avg_op)

    with tf.control_dependencies([norm_avg_op]):
        stat_ops.extend(self.curvature_range())
        stat_ops.extend(self.grad_variance())
        stat_ops.extend(self.dist_to_opt())
    return tf.group(*stat_ops)
def get_lr_tensor(self):
    """
    Return the tuned learning rate tensor.

    The base value is (1 - sqrt(mu))^2 / h_min. It is capped by the same
    value scaled with (step + 1) / (10 * curv_win_width), which limits the
    rate during the first steps.
    """
    base_lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
    ramped_lr = base_lr * (tf.to_float(self._global_step) + 1.0) / 10.0
    ramped_lr = ramped_lr / tf.to_float(tf.constant(self._curv_win_width))
    return tf.minimum(base_lr, ramped_lr)
def get_cubic_root(self):
    """
    Return x = sqrt(mu) that minimizes x^2 D^2 + (1 - x)^4 * C / h_min^2.

    Here D is `self._dist_to_opt_avg` and C is `self._grad_var`. Setting the
    derivative to zero and substituting x = y + 1 gives the depressed cubic

        y^3 + p*y = q,  with p = (D^2 h_min^2) / (2*C) and q = -p.

    For p > 0 the left side minus q is increasing in y, negative at y = -1
    and positive at y = 0, so there is exactly one real root and it lies in
    (-1, 0); hence x lies in (0, 1). The root is computed with Vieta's
    substitution y = w - p / (3 w):
    http://mathworld.wolfram.com/VietasSubstitution.html

    Returns:
      The tensor x.
    """
    # EPS in the numerator keeps p strictly positive, which prevents the
    # momentum from being exactly one when the gradient is 0.
    p = ((self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2
         / 2 / (self._grad_var + EPS))
    # w^3 is the negative root of w^6 + p w^3 - p^3 / 27 = 0.
    w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
    # w is never positive, so the guard must push the denominator further
    # below zero. Adding EPS instead would move it toward zero and make it
    # exactly zero when w == -EPS.
    y = w - p / 3.0 / (w - EPS)
    x = y + 1
    return x
def get_mu_tensor(self):
    """
    Return the tuned momentum tensor.

    The momentum is the larger of the squared cubic root from
    `get_cubic_root` and ((sqrt(dr) - 1) / (sqrt(dr) + 1))^2, where dr is
    the ratio h_max / h_min floored at 1 + EPS.
    """
    root = self.get_cubic_root()
    dr = tf.maximum(
        (self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
    from_root = root**2
    from_range = ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2
    return tf.maximum(from_root, from_range)
def update_hyper_param(self):
    """
    Compute new momentum and learning rate and write them to the variables.

    While tuning is inactive (`self._do_tune` is false) the current variable
    values are kept. The momentum is computed first because the learning
    rate formula reads it. With `use_unsmoothed_lr_mu` off, the new values
    are blended with the stored ones using `beta` before being assigned.

    Returns:
      An op grouping the two assignments.
    """
    self._mu = tf.identity(tf.cond(
        self._do_tune, self.get_mu_tensor, lambda: self._mu_var))
    with tf.control_dependencies([self._mu]):
        self._lr = tf.identity(tf.cond(
            self._do_tune, self.get_lr_tensor, lambda: self._lr_var))

    with tf.control_dependencies([self._mu, self._lr]):
        if not self._use_unsmoothed_lr_mu:
            keep = self._beta
            self._mu = keep * self._mu_var + (1 - keep) * self._mu
            self._lr = keep * self._lr_var + (1 - keep) * self._lr
            with tf.control_dependencies([self._mu, self._lr]):
                assign_mu = tf.assign(self._mu_var, self._mu)
                assign_lr = tf.assign(self._lr_var, self._lr)
        else:
            assign_mu = tf.assign(self._mu_var, self._mu)
            assign_lr = tf.assign(self._lr_var, self._lr)
    return tf.group(assign_mu, assign_lr)
def get_name(self):
    """Return the name reported by the wrapped optimizer."""
    wrapped = self._optimizer
    return wrapped.get_name()
def apply_gradients(self, grads_tvars, global_step=None, name=None):
    """
    Build the op that tunes lr/momentum and applies one momentum update.

    Args:
      grads_tvars: iterable of (gradient, variable) pairs, e.g. the result
        of `compute_gradients`. Pairs whose gradient is None are dropped.
      global_step: optional variable forwarded to the wrapped
        MomentumOptimizer's `apply_gradients`.
      name: optional name forwarded to the wrapped optimizer.

    Returns:
      An op grouping the statistics update, the hyper-parameter update, the
      variable update, the refresh of the adaptive clipping variables and
      the increment of the internal step counter.

    Raises:
      ValueError: if no pair carries a gradient.
    """
    kept = [(g, t) for g, t in grads_tvars if g is not None]
    if not kept:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError(
            "No gradients provided for any variable: every (gradient, "
            "variable) pair passed to apply_gradients has a None gradient.")
    self._grads, self._tvars = zip(*kept)

    # Manual gradient clipping.
    if self._clip_thresh_var is not None:
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, self._clip_thresh_var)

    # Loose adaptive clipping so that an exploding gradient does not ruin
    # the statistics; inactive until tuning has started.
    if self._use_adapt_grad_clip:
        thresh = tf.cond(
            self._do_tune,
            lambda: tf.sqrt(self._stat_protect_fac
                            * self._adapt_grad_clip_thresh**2),
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)

    with tf.variable_scope("before_apply"):
        before_apply_op = self.before_apply()

    with tf.variable_scope("update_hyper"):
        with tf.control_dependencies([before_apply_op]):
            update_hyper_op = self.update_hyper_param()

    with tf.variable_scope("apply_updates"):
        with tf.control_dependencies([update_hyper_op]):
            # Clip exploding gradients according to h_max: only when the
            # global norm exceeds the stored threshold is the target value
            # used as the clip norm.
            if self._use_adapt_grad_clip:
                thresh = tf.cond(
                    tf.greater(tf.global_norm(self._grads),
                               self._adapt_grad_clip_thresh),
                    lambda: self._adapt_grad_clip_target_val,
                    lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                self._grads, self._grads_norm = tf.clip_by_global_norm(
                    self._grads, thresh)
            apply_grad_op = self._optimizer.apply_gradients(
                zip(self._grads, self._tvars), global_step, name)

    # Bookkeeping runs only after the variables have been updated.
    with tf.control_dependencies([apply_grad_op]):
        self._increment_global_step_op = tf.assign(
            self._global_step, self._global_step + 1)
        self._adapt_grad_clip_thresh_op = tf.assign(
            self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
        self._adapt_grad_clip_target_val_op = tf.assign(
            self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))

    return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                    self._adapt_grad_clip_thresh_op,
                    self._adapt_grad_clip_target_val_op,
                    self._increment_global_step_op)
def compute_gradients(self, loss, var_list=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      grad_loss=None):
    """
    Compute gradients of `loss` with the wrapped MomentumOptimizer.

    All arguments are forwarded unchanged to the wrapped optimizer's
    `compute_gradients`, and its result is returned as is.
    """
    forwarded = dict(
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
    return self._optimizer.compute_gradients(loss, **forwarded)
def minimize(self, loss, global_step=None, var_list=None,
             gate_gradients=GATE_OP,
             aggregation_method=None,
             colocate_gradients_with_ops=False,
             name=None,
             grad_loss=None):
    """Add operations to minimize `loss` by updating `var_list`.

    Computes gradients with the wrapped optimizer and passes them to
    `self.apply_gradients()`. To process the gradients before they are
    applied, compute them separately and call `self.apply_gradients()`
    directly instead of using this function.
    Adapted from the TensorFlow Optimizer base class member function.

    Raises:
      ValueError: if no variable received a gradient.
    """
    grads_and_vars = self._optimizer.compute_gradients(
        loss, var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)

    if all(grad is None for grad, _ in grads_and_vars):
        var_names = [str(var) for _, var in grads_and_vars]
        raise ValueError(
            "No gradients provided for any variable, check your graph for "
            "ops that do not support gradients, between variables "
            "%s and loss %s." %
            (var_names, loss))

    return self.apply_gradients(grads_and_vars, global_step, name)
def get_slot(self, var, name):
    """
    Look up the slot called `name` that the wrapped MomentumOptimizer
    created for `var`.
    Args:
      var: A variable passed to `minimize()` or `apply_gradients()`.
      name: A string.
    Returns:
      Whatever the wrapped optimizer's `get_slot` returns; per the original
      notes, the slot `Variable` if it was created and `None` otherwise.
    """
    wrapped = self._optimizer
    return wrapped.get_slot(var, name)
def get_slot_names(self):
    """
    List the names of the slots created by the wrapped MomentumOptimizer.
    Returns:
      Whatever the wrapped optimizer's `get_slot_names` returns; per the
      original notes, a list of strings.
    """
    wrapped = self._optimizer
    return wrapped.get_slot_names()