--- a
+++ b/diff_sex/yellowfin.py
@@ -0,0 +1,471 @@
+"""
+YellowFin optimizer.
+
+YellowFin and the Art of Momentum Tuning
+https://arxiv.org/abs/1706.03471
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import tensorflow as tf
+from tensorflow.python.framework import ops
+
+# EPS for numerical stability
+EPS = 1e-6
+LARGE_FLOAT_VAL = 1e15
+
+class YFOptimizer(object):
+  """
+  Optimizer that implements the YellowFin algorithm.
+
+  Implemented as a wrapper around tf.train.MomentumOptimizer
+  """
+  # Available gate_gradients values
+  GATE_NONE = tf.train.Optimizer.GATE_NONE
+  GATE_OP = tf.train.Optimizer.GATE_OP
+  GATE_GRAPH = tf.train.Optimizer.GATE_GRAPH
+
+  def __init__(self, learning_rate=0.0001, momentum=0.0, clip_thresh=None,
+               beta=0.999, curv_win_width=20, zero_debias=True, delta_mu=0.0,
+               sparsity_debias=False, use_locking=False, name="YellowFin",
+               use_nesterov=False, use_unsmoothed_lr_mu=True,
+               h_max_log_smooth=True, h_min_log_smooth=True,
+               use_adapt_grad_clip=True, stat_protect_fac=100.0):
+    """
+    Construct a new YellowFin optimizer.
+
+    Args:
+      learning rate: Python scalar. The initial value of learning rate,
+        we use 1.0 in our paper.
+      momentum: Python scalar. The initial value of momentum, we use
+        0.0 in our paper.
+      clip_thresh: Python scalar. The cliping threshold for
+        `tf.clip_by_global_norm`. If None, no clipping will be used.
+      beta: Python scalar. The smoothing parameter for estimations.
+      curv_win_width: TODO
+      zero_debias: TODO
+      delta_mu: for extensions. Not necessary in the basic use.
+      sparsity_debias: Python boolean. Gradient norm and curvature are
+        biased to larger values when calculated with sparse gradient.
+        This is useful when the model is very sparse, e.g. LSTM with
+        word embedding. For non-sparse CNN, turning it off could
+        slightly accelerate the speed.
+      use_locking: If True, use locks for update operations.
+      name: Optional name prefix for the operations created when
+        applying gradients. Defaults to "YellowFin".
+      use_nesterov: If True, the underlying MomentumOptimizer uses Nesterov
+        Momentum. Set to False in the default YellowFin algorithm.
+
+    Notes:
+      `clip_thresh` is the threshold value on ||lr * gradient||
+      `delta_mu` can be a placeholder/variable/python scalar. Used for
+      additional momentum in situations such as asynchronous-parallel
+      training. The default is 0.0 for basic usage of the optimizer.
+
+    Other features:
+      If you want to manually control the learning rates,
+      `self.lr_factor` is an interface to the outside. It is a
+      multiplier for the internal learning rate in YellowFin. It is
+      helpful when you want to do additional hand tuning or some
+      decaying scheme for the internal learning rate. Example on using
+      `lr_factor` can be found here:
+      https://github.com/JianGoForIt/YellowFin/blob/master/char-rnn-tensorflow/train_YF.py#L140
+    """
+    self._lr = learning_rate
+    self._mu = momentum
+
+    self._lr_var = tf.Variable(
+      learning_rate, dtype=tf.float32, name="YF_lr", trainable=False)
+    self._mu_var = tf.Variable(
+      momentum, dtype=tf.float32, name="YF_mu", trainable=False)
+    # for step scheme or decaying scheme for the learning rates
+    self.lr_factor = tf.Variable(
+      1.0, dtype=tf.float32, name="YF_lr_factor", trainable=False)
+    if clip_thresh is not None:
+      self._clip_thresh_var = tf.Variable(
+        clip_thresh, dtype=tf.float32, name="YF_clip_thresh",
+        trainable=False)
+    else:
+      self._clip_thresh_var = None
+
+    # the underlying momentum optimizer
+    self._optimizer = tf.train.MomentumOptimizer(
+      self._lr_var * self.lr_factor, self._mu_var + delta_mu,
+      use_locking, name, use_nesterov)
+
+    # moving average for statistics
+    self._beta = beta
+    self._moving_averager = None
+
+    # for global step counting
+    self._global_step = tf.Variable(0, trainable=False)
+
+    self._do_tune = tf.greater(self._global_step, tf.constant(0) )
+
+    self._zero_debias = zero_debias
+    self._sparsity_debias = sparsity_debias
+
+    self._tvars = None
+
+    # for curvature range
+    self._curv_win_width = curv_win_width
+    self._curv_win = None
+
+    # option for using smoothed or unsmoothed lr and mu
+    self._use_unsmoothed_lr_mu = use_unsmoothed_lr_mu
+
+    # options for curvature envelop smoothing
+    self._h_max_log_smooth = h_max_log_smooth
+    self._h_min_log_smooth = h_min_log_smooth
+
+    # for adaptive gradient clipping
+    self._use_adapt_grad_clip = use_adapt_grad_clip
+    self._adapt_grad_clip_thresh = \
+      tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)
+    self._adapt_grad_clip_target_val = \
+      tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)
+
+    # prevent exploding gradient from ruining the statistics
+    self._stat_protect_fac = stat_protect_fac
+
+  def curvature_range(self):
+    # set up the curvature window
+    self._curv_win = tf.Variable(
+      np.zeros([self._curv_win_width, ]), dtype=tf.float32,
+      name="curv_win", trainable=False)
+    # we can use log smoothing for curvature range to follow trend faster
+    # self._curv_win = tf.scatter_update(
+    #   self._curv_win, self._global_step % self._curv_win_width,
+    #   tf.log(self._grad_norm_squared + EPS))
+    self._curv_win = tf.scatter_update(
+      self._curv_win, self._global_step % self._curv_win_width,
+      self._grad_norm_squared + EPS)
+    # note here the iterations start from iteration 0
+    valid_window = tf.slice(
+      self._curv_win, tf.constant([0, ]), tf.expand_dims(
+        tf.minimum(tf.constant(self._curv_win_width),
+                   self._global_step + 1), dim=0))
+
+    if self._h_min_log_smooth:
+      self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
+    else:
+      self._h_min_t = tf.reduce_min(valid_window)
+    if self._h_max_log_smooth:
+      self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
+    else:
+      self._h_max_t = tf.reduce_max(valid_window)
+
+    curv_range_ops = []
+    with tf.control_dependencies([self._h_min_t, self._h_max_t] ):
+      avg_op = self._moving_averager.apply(
+        [self._h_min_t, self._h_max_t])
+      with tf.control_dependencies([avg_op]):
+        if self._h_min_log_smooth:
+          self._h_min = tf.exp(
+            tf.identity(self._moving_averager.average(self._h_min_t)))
+        else:
+          self._h_min = \
+            tf.identity(self._moving_averager.average(self._h_min_t))
+        if self._h_max_log_smooth:
+          self._h_max = tf.exp(
+            tf.identity(self._moving_averager.average(self._h_max_t)))
+        else:
+          self._h_max = \
+            tf.identity(self._moving_averager.average(self._h_max_t))
+      if self._sparsity_debias:
+        self._h_min = self._h_min * self._sparsity_avg
+        self._h_max = self._h_max * self._sparsity_avg
+    curv_range_ops.append(avg_op)
+    return curv_range_ops
+
+  def grad_variance(self):
+    grad_var_ops = []
+    tensor_to_avg = []
+    for t, g in zip(self._tvars, self._grads):
+      if isinstance(g, ops.IndexedSlices):
+        tensor_to_avg.append(
+          tf.reshape(tf.unsorted_segment_sum(
+            g.values, g.indices, g.dense_shape[0]),
+            shape=t.get_shape()))
+      else:
+        tensor_to_avg.append(g)
+    avg_op = self._moving_averager.apply(tensor_to_avg)
+    grad_var_ops.append(avg_op)
+    with tf.control_dependencies([avg_op]):
+      self._grad_avg = [
+        self._moving_averager.average(val) for val in tensor_to_avg]
+      self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
+    self._grad_var = tf.maximum(
+      tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
+      self._grad_norm_squared_avg
+      - tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared] ) )
+    if self._sparsity_debias:
+      self._grad_var *= self._sparsity_avg
+    return grad_var_ops
+
+  def dist_to_opt(self):
+    dist_to_opt_ops = []
+    # running average of the norm of gradeint
+    self._grad_norm = tf.sqrt(self._grad_norm_squared)
+    avg_op = self._moving_averager.apply([self._grad_norm, ])
+    dist_to_opt_ops.append(avg_op)
+    with tf.control_dependencies([avg_op]):
+      self._grad_norm_avg = self._moving_averager.average(
+        self._grad_norm)
+      # single iteration distance estimation
+      # note that self._grad_norm_avg is per variable
+      self._dist_to_opt = (self._grad_norm_avg
+                 / (self._grad_norm_squared_avg + EPS) )
+    # running average of distance
+    avg_op = self._moving_averager.apply([self._dist_to_opt])
+    dist_to_opt_ops.append(avg_op)
+    with tf.control_dependencies([avg_op]):
+      self._dist_to_opt_avg = tf.identity(
+        self._moving_averager.average(self._dist_to_opt))
+      if self._sparsity_debias:
+        self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
+    return dist_to_opt_ops
+
+  def grad_sparsity(self):
+    # If the sparse minibatch gradient has 10 percent of its entries
+    # non-zero, its sparsity is 0.1.
+    # The norm of dense gradient averaged from full dataset
+    # are roughly estimated norm of minibatch
+    # sparse gradient norm * sqrt(sparsity)
+    # An extension maybe only correct the sparse blob.
+    non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
+    all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
+    self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
+      / tf.cast(all_entry_cnt, self._grads[0].dtype)
+    avg_op = self._moving_averager.apply([self._sparsity, ])
+    with tf.control_dependencies([avg_op]):
+      self._sparsity_avg = self._moving_averager.average(self._sparsity)
+    return avg_op
+
+  def before_apply(self):
+    self._moving_averager = tf.train.ExponentialMovingAverage(
+      decay=self._beta, zero_debias=self._zero_debias)
+    assert self._grads is not None and len(self._grads) > 0
+    before_apply_ops = []
+
+    # get per var g**2 and norm**2
+    self._grad_squared = []
+    self._grad_norm_squared = []
+    for v, g in zip(self._tvars, self._grads):
+      if g is None:
+        continue
+      with ops.colocate_with(v):
+        self._grad_squared.append(tf.square(g))
+    self._grad_norm_squared = [
+      tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]
+
+    if self._sparsity_debias:
+      avg_op_sparsity = self.grad_sparsity()
+      before_apply_ops.append(avg_op_sparsity)
+
+    # the following running average on squared norm of gradient is shared
+    # by `grad_variance` and `dist_to_opt`
+    avg_op = self._moving_averager.apply(self._grad_norm_squared)
+    with tf.control_dependencies([avg_op]):
+      self._grad_norm_squared_avg = [self._moving_averager.average(val)
+                                     for val in self._grad_norm_squared]
+      self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
+      self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
+    before_apply_ops.append(avg_op)
+
+    with tf.control_dependencies([avg_op]):
+      curv_range_ops = self.curvature_range()
+      before_apply_ops += curv_range_ops
+      grad_var_ops = self.grad_variance()
+      before_apply_ops += grad_var_ops
+      dist_to_opt_ops = self.dist_to_opt()
+      before_apply_ops += dist_to_opt_ops
+    return tf.group(*before_apply_ops)
+
+  def get_lr_tensor(self):
+    lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
+    lr = tf.minimum(lr, lr * (tf.to_float(self._global_step) + 1.0) / 10.0 / tf.to_float(tf.constant(self._curv_win_width) ) )
+    return lr
+
+  def get_cubic_root(self):
+    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
+    # where x = sqrt(mu).
+    # We substitute x, which is sqrt(mu), with x = y + 1.
+    # It gives y^3 + py = q
+    # where p = (D^2 h_min^2)/(2*C) and q = -p.
+    # We use the Vieta's substution to compute the root.
+    # There is only one real solution y (which is in [0, 1] ).
+    # http://mathworld.wolfram.com/VietasSubstitution.html
+    # assert_array = \
+    #   [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]), 
+    #   tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]), 
+    #   tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
+    #   tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]), 
+    #   tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]), 
+    #   tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
+    # with tf.control_dependencies(assert_array):
+    # EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
+    p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
+    w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
+    w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
+    y = w - p / 3.0 / (w + EPS)
+    x = y + 1
+    return x
+
+  def get_mu_tensor(self):
+    root = self.get_cubic_root()
+    dr = tf.maximum( (self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
+    mu = tf.maximum(
+      root**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
+    return mu
+
+  def update_hyper_param(self):
+    assign_hyper_ops = []
+    self._mu = tf.identity(tf.cond(
+      self._do_tune, lambda: self.get_mu_tensor(),
+      lambda: self._mu_var))
+    with tf.control_dependencies([self._mu]):
+      self._lr = tf.identity(tf.cond(
+        self._do_tune, lambda: self.get_lr_tensor(),
+        lambda: self._lr_var))
+
+    with tf.control_dependencies([self._mu, self._lr]):
+      if self._use_unsmoothed_lr_mu:
+        assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
+        assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
+      else:
+        self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
+        self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
+        with tf.control_dependencies([self._mu, self._lr] ):
+          assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
+          assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
+    assign_hyper_op = tf.group(*assign_hyper_ops)
+    return assign_hyper_op
+
+  def get_name(self):
+      return self._optimizer.get_name()
+
+  def apply_gradients(self, grads_tvars, global_step=None, name=None):
+    self._grads, self._tvars = zip(
+      *[(g, t) for g, t in grads_tvars if g is not None])
+
+    # for manual gradient clipping
+    if self._clip_thresh_var is not None:
+      self._grads, self._grads_norm = tf.clip_by_global_norm(
+        self._grads, self._clip_thresh_var)
+
+    # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
+    if self._use_adapt_grad_clip:
+      thresh = tf.cond(self._do_tune, 
+        lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
+        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
+      self._grads, self._grads_norm = tf.clip_by_global_norm(self._grads, thresh)
+
+    with tf.variable_scope("before_apply"):
+      before_apply_op = self.before_apply()
+
+    with tf.variable_scope("update_hyper"):
+      with tf.control_dependencies([before_apply_op]):
+        update_hyper_op = self.update_hyper_param()
+
+    with tf.variable_scope("apply_updates"):
+      with tf.control_dependencies([update_hyper_op]):
+
+        # clip exploding gradient according to h_max
+        if self._use_adapt_grad_clip:
+          thresh = tf.cond(tf.greater(tf.global_norm(self._grads), 
+            self._adapt_grad_clip_thresh), 
+            lambda: self._adapt_grad_clip_target_val,
+            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
+          self._grads, self._grads_norm = tf.clip_by_global_norm(
+            self._grads, thresh)
+
+        apply_grad_op = self._optimizer.apply_gradients(
+          zip(self._grads, self._tvars), global_step, name)
+
+    with tf.control_dependencies([apply_grad_op]):
+      self._increment_global_step_op = tf.assign(
+        self._global_step, self._global_step + 1)
+      
+      self._adapt_grad_clip_thresh_op = \
+        tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
+      self._adapt_grad_clip_target_val_op = \
+        tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
+      # self._adapt_grad_clip_target_val_op = \
+      #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))
+
+    return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
+                    self._adapt_grad_clip_thresh_op, self._adapt_grad_clip_target_val_op,
+                    self._increment_global_step_op)
+
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    return self._optimizer.compute_gradients(
+      loss, var_list=var_list,
+      gate_gradients=gate_gradients,
+      aggregation_method=aggregation_method,
+      colocate_gradients_with_ops=colocate_gradients_with_ops,
+      grad_loss=grad_loss)
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before
+    applying them, call `tf.gradients()` and `self.apply_gradients()`
+    explicitly instead of using this function.
+
+    Adapted from Tensorflow Optimizer base class member function.
+    """
+    grads_and_vars = self._optimizer.compute_gradients(
+      loss, var_list=var_list,
+      gate_gradients=gate_gradients,
+      aggregation_method=aggregation_method,
+      colocate_gradients_with_ops=colocate_gradients_with_ops,
+      grad_loss=grad_loss)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+        "No gradients provided for any variable, check your graph for "
+        "ops that do not support gradients, between variables "
+        "%s and loss %s." %
+        ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step, name)
+
+  def get_slot(self, var, name):
+    """
+    Return a slot named `name` created for `var` by
+    the underlying MomentumOptimizer.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    return self._optimizer.get_slot(var, name)
+
+  def get_slot_names(self):
+    """
+    Return a list of the names of the slots created by the
+    underlying MomentumOptimizer.
+
+    Returns:
+      A list of strings.
+    """
+    return self._optimizer.get_slot_names()