--- a +++ b/baselines/ppo2/policies.py @@ -0,0 +1,152 @@ +import numpy as np +import tensorflow as tf +from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm +from baselines.common.distributions import make_pdtype + +def nature_cnn(unscaled_images, **conv_kwargs): + """ + CNN from Nature paper. + """ + scaled_images = tf.cast(unscaled_images, tf.float32) / 255. + activ = tf.nn.relu + h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), + **conv_kwargs)) + h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) + h3 = conv_to_fc(h3) + return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) + +class LnLstmPolicy(object): + def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): + nenv = nbatch // nsteps + nh, nw, nc = ob_space.shape + ob_shape = (nbatch, nh, nw, nc) + X = tf.placeholder(tf.uint8, ob_shape) #obs + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states + self.pdtype = make_pdtype(ac_space) + with tf.variable_scope("model", reuse=reuse): + h = nature_cnn(X) + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) + h5 = seq_to_batch(h5) + vf = fc(h5, 'v', 1) + self.pd, self.pi = self.pdtype.pdfromlatent(h5) + + v0 = vf[:, 0] + a0 = self.pd.sample() + neglogp0 = self.pd.neglogp(a0) + self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) + + def step(ob, state, mask): + return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) + + def value(ob, state, mask): + return sess.run(v0, {X:ob, S:state, M:mask}) + + self.X = X + self.M = M + self.S = S + self.vf = vf + self.step = step + self.value = value + +class LstmPolicy(object): + + def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): + nenv = nbatch // nsteps + + nh, nw, nc = ob_space.shape + ob_shape = (nbatch, nh, nw, nc) + self.pdtype = make_pdtype(ac_space) + X = tf.placeholder(tf.uint8, ob_shape) #obs + M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) + S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states + with tf.variable_scope("model", reuse=reuse): + h = nature_cnn(X) + xs = batch_to_seq(h, nenv, nsteps) + ms = batch_to_seq(M, nenv, nsteps) + h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) + h5 = seq_to_batch(h5) + vf = fc(h5, 'v', 1) + self.pd, self.pi = self.pdtype.pdfromlatent(h5) + + v0 = vf[:, 0] + a0 = self.pd.sample() + neglogp0 = self.pd.neglogp(a0) + self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) + + def step(ob, state, mask): + return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) + + def value(ob, state, mask): + return sess.run(v0, {X:ob, S:state, M:mask}) + + self.X = X + self.M = M + self.S = S + self.vf = vf + self.step = step + self.value = value + +class CnnPolicy(object): + + def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 + nh, nw, nc = ob_space.shape + ob_shape = (nbatch, nh, nw, nc) + self.pdtype = make_pdtype(ac_space) + X = tf.placeholder(tf.uint8, ob_shape) #obs + with tf.variable_scope("model", reuse=reuse): + h = nature_cnn(X, **conv_kwargs) + vf = fc(h, 'v', 1)[:,0] + self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) + + a0 = self.pd.sample() + neglogp0 = self.pd.neglogp(a0) + self.initial_state = None + + def step(ob, *_args, **_kwargs): + a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) + return a, v, self.initial_state, neglogp + + def value(ob, *_args, **_kwargs): + return sess.run(vf, {X:ob}) + + self.X = X + self.vf = vf + self.step = step + self.value = value + +class MlpPolicy(object): + def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 + ob_shape = (nbatch,) + ob_space.shape + self.pdtype = make_pdtype(ac_space) + X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs + with tf.variable_scope("model", reuse=reuse): + activ = tf.tanh + flatten = tf.layers.flatten + pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2))) + pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) + vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2))) + vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) + vf = fc(vf_h2, 'vf', 1)[:,0] + + self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) + + + a0 = self.pd.sample() + neglogp0 = self.pd.neglogp(a0) + self.initial_state = None + + def step(ob, *_args, **_kwargs): + a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) + return a, v, self.initial_state, neglogp + + def value(ob, *_args, **_kwargs): + return sess.run(vf, {X:ob}) + + self.X = X + self.vf = vf + self.step = step + self.value = value