--- a
+++ b/baselines/a2c/utils.py
@@ -0,0 +1,290 @@
+import os
+import gym
+import numpy as np
+import tensorflow as tf
+from gym import spaces
+from collections import deque
+
+def sample(logits):
+    # Gumbel-max trick: adding Gumbel noise to the logits and taking the
+    # argmax draws a sample from the corresponding categorical distribution.
+    noise = tf.random_uniform(tf.shape(logits))
+    return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
+
+def cat_entropy(logits):
+    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
+    ea0 = tf.exp(a0)
+    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
+    p0 = ea0 / z0
+    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
+
+def cat_entropy_softmax(p0):
+    return -tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis=1)
+
+def mse(pred, target):
+    return tf.square(pred - target) / 2.
+
+def ortho_init(scale=1.0):
+    def _ortho_init(shape, dtype, partition_info=None):
+        # lasagne ortho init for tf
+        shape = tuple(shape)
+        if len(shape) == 2:
+            flat_shape = shape
+        elif len(shape) == 4:  # assumes NHWC
+            flat_shape = (np.prod(shape[:-1]), shape[-1])
+        else:
+            raise NotImplementedError
+        a = np.random.normal(0.0, 1.0, flat_shape)
+        u, _, v = np.linalg.svd(a, full_matrices=False)
+        q = u if u.shape == flat_shape else v  # pick the one with the correct shape
+        q = q.reshape(shape)
+        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
+    return _ortho_init
+
+def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False):
+    if data_format == 'NHWC':
+        channel_ax = 3
+        strides = [1, stride, stride, 1]
+        bshape = [1, 1, 1, nf]
+    elif data_format == 'NCHW':
+        channel_ax = 1
+        strides = [1, 1, stride, stride]
+        bshape = [1, nf, 1, 1]
+    else:
+        raise NotImplementedError
+    bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1]
+    nin = x.get_shape()[channel_ax].value
+    wshape = [rf, rf, nin, nf]
+    with tf.variable_scope(scope):
+        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
+        b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
+        if not one_dim_bias and data_format == 'NHWC':
+            b = tf.reshape(b, bshape)
+        return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
+
+def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
+    with tf.variable_scope(scope):
+        nin = x.get_shape()[1].value
+        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
+        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
+        return tf.matmul(x, w) + b
+
+def batch_to_seq(h, nbatch, nsteps, flat=False):
+    if flat:
+        h = tf.reshape(h, [nbatch, nsteps])
+    else:
+        h = tf.reshape(h, [nbatch, nsteps, -1])
+    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]
+
+def seq_to_batch(h, flat=False):
+    shape = h[0].get_shape().as_list()
+    if not flat:
+        assert len(shape) > 1
+        nh = h[0].get_shape()[-1].value
+        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
+    else:
+        return tf.reshape(tf.stack(values=h, axis=1), [-1])
+
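+# Usage sketch for the layer helpers (illustrative only; the placeholder name
+# and shapes below are assumptions, not fixed by this module):
+#   X = tf.placeholder(tf.float32, [None, 84, 84, 4])
+#   h1 = tf.nn.relu(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
+#   h2 = tf.nn.relu(conv(h1, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
+#   h3 = tf.nn.relu(fc(conv_to_fc(h2), 'fc1', nh=512, init_scale=np.sqrt(2)))
+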
+def lstm(xs, ms, s, scope, nh, init_scale=1.0):
+    nbatch, nin = [v.value for v in xs[0].get_shape()]
+    nsteps = len(xs)
+    with tf.variable_scope(scope):
+        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
+        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
+        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))
+
+    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
+    for idx, (x, m) in enumerate(zip(xs, ms)):
+        # the mask m marks episode starts; reset the recurrent state there
+        c = c*(1-m)
+        h = h*(1-m)
+        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
+        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
+        i = tf.nn.sigmoid(i)
+        f = tf.nn.sigmoid(f)
+        o = tf.nn.sigmoid(o)
+        u = tf.tanh(u)
+        c = f*c + i*u
+        h = o*tf.tanh(c)
+        xs[idx] = h
+    s = tf.concat(axis=1, values=[c, h])
+    return xs, s
+
+def _ln(x, g, b, e=1e-5, axes=[1]):
+    u, s = tf.nn.moments(x, axes=axes, keep_dims=True)
+    x = (x-u)/tf.sqrt(s+e)
+    x = x*g+b
+    return x
+
+def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
+    nbatch, nin = [v.value for v in xs[0].get_shape()]
+    nsteps = len(xs)
+    with tf.variable_scope(scope):
+        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
+        gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
+        bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0))
+
+        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
+        gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0))
+        bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0))
+
+        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))
+
+        gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0))
+        bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0))
+
+    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
+    for idx, (x, m) in enumerate(zip(xs, ms)):
+        c = c*(1-m)
+        h = h*(1-m)
+        z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b
+        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
+        i = tf.nn.sigmoid(i)
+        f = tf.nn.sigmoid(f)
+        o = tf.nn.sigmoid(o)
+        u = tf.tanh(u)
+        c = f*c + i*u
+        h = o*tf.tanh(_ln(c, gc, bc))
+        xs[idx] = h
+    s = tf.concat(axis=1, values=[c, h])
+    return xs, s
+
+def conv_to_fc(x):
+    nh = np.prod([v.value for v in x.get_shape()[1:]])
+    x = tf.reshape(x, [-1, nh])
+    return x
+
+def discount_with_dones(rewards, dones, gamma):
+    discounted = []
+    r = 0
+    for reward, done in zip(rewards[::-1], dones[::-1]):
+        r = reward + gamma*r*(1.-done) # fixed off by one bug
+        discounted.append(r)
+    return discounted[::-1]
+
+def find_trainable_variables(key):
+    # return only the trainable variables created under the scope `key`
+    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)
+
+def make_path(f):
+    return os.makedirs(f, exist_ok=True)
+
+def constant(p):
+    return 1
+
+def linear(p):
+    return 1-p
+
+def middle_drop(p):
+    eps = 0.75
+    if 1-p < eps:
+        return eps*0.1
+    return 1-p
+
+def double_linear_con(p):
+    p *= 2
+    eps = 0.125
+    if 1-p < eps:
+        return eps
+    return 1-p
+
+def double_middle_drop(p):
+    eps1 = 0.75
+    eps2 = 0.25
+    if 1-p < eps1:
+        if 1-p < eps2:
+            return eps2*0.5
+        return eps1*0.1
+    return 1-p
+
+schedules = {
+    'linear': linear,
+    'constant': constant,
+    'double_linear_con': double_linear_con,
+    'middle_drop': middle_drop,
+    'double_middle_drop': double_middle_drop
+}
+
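+# Note: each schedule maps training progress p = n/nvalues in [0, 1] to a
+# multiplier on the initial value, e.g. linear(0.25) -> 0.75, while
+# middle_drop follows linear until 1-p < 0.75 and then holds at 0.075.
+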
+class Scheduler(object):
+
+    def __init__(self, v, nvalues, schedule):
+        self.n = 0.
+        self.v = v
+        self.nvalues = nvalues
+        self.schedule = schedules[schedule]
+
+    def value(self):
+        current_value = self.v*self.schedule(self.n/self.nvalues)
+        self.n += 1.
+        return current_value
+
+    def value_steps(self, steps):
+        return self.v*self.schedule(steps/self.nvalues)
+
+
+class EpisodeStats:
+    def __init__(self, nsteps, nenvs):
+        self.episode_rewards = []
+        for i in range(nenvs):
+            self.episode_rewards.append([])
+        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
+        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
+        self.nsteps = nsteps
+        self.nenvs = nenvs
+
+    def feed(self, rewards, masks):
+        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
+        masks = np.reshape(masks, [self.nenvs, self.nsteps])
+        for i in range(0, self.nenvs):
+            for j in range(0, self.nsteps):
+                self.episode_rewards[i].append(rewards[i][j])
+                if masks[i][j]:
+                    l = len(self.episode_rewards[i])
+                    s = sum(self.episode_rewards[i])
+                    self.lenbuffer.append(l)
+                    self.rewbuffer.append(s)
+                    self.episode_rewards[i] = []
+
+    def mean_length(self):
+        if self.lenbuffer:
+            return np.mean(self.lenbuffer)
+        else:
+            return 0  # on the first params dump, no episodes are finished
+
+    def mean_reward(self):
+        if self.rewbuffer:
+            return np.mean(self.rewbuffer)
+        else:
+            return 0
+
+
+# For ACER
+def get_by_index(x, idx):
+    assert len(x.get_shape()) == 2
+    assert len(idx.get_shape()) == 1
+    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
+    y = tf.gather(tf.reshape(x, [-1]),  # flatten input
+                  idx_flattened)  # use flattened indices
+    return y
+
+def check_shape(ts, shapes):
+    i = 0
+    for (t, shape) in zip(ts, shapes):
+        assert t.get_shape().as_list() == shape, "id " + str(i) + " shape " + str(t.get_shape()) + str(shape)
+        i += 1
+
+def avg_norm(t):
+    return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
+
+def gradient_add(g1, g2, param):
+    print([g1, g2, param.name])
+    assert not (g1 is None and g2 is None), param.name
+    if g1 is None:
+        return g2
+    elif g2 is None:
+        return g1
+    else:
+        return g1 + g2
+
+def q_explained_variance(qpred, q):
+    _, vary = tf.nn.moments(q, axes=[0, 1])
+    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
+    check_shape([vary, varpred], [[]] * 2)
+    return 1.0 - (varpred / vary)
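+
+# Usage sketch for Scheduler (illustrative; the hyperparameters are examples,
+# not values fixed by this module):
+#   sched = Scheduler(v=7e-4, nvalues=int(80e6), schedule='linear')
+#   sched.value()                 # 7e-4 on the first call, decays each call
+#   sched.value_steps(int(40e6))  # query without advancing: 3.5e-4 at midpoint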