--- /dev/null
+++ b/rdpg/rdpg.py
@@ -0,0 +1,116 @@
+# -----------------------------------
+# Recurrent Deep Deterministic Policy Gradient
+# Author: Kaizhao Liang
+# Date: 08.11.2017
+# -----------------------------------
+import tensorflow as tf
+import numpy as np
+
+from ou_noise import OUNoise
+from critic_network import CriticNetwork
+from actor_network import ActorNetwork
+from replay_buffer import ReplayBuffer
+from history import History
+
+# Hyper Parameters:
+REPLAY_BUFFER_SIZE = 1000000
+REPLAY_START_SIZE = 10000
+BATCH_SIZE = 64
+GAMMA = 0.8
+MAX_SEQUENCE_LENGTH = 1000  # histories are zero-padded to this length
+
+
+class RDPG:
+    """Recurrent Deep Deterministic Policy Gradient agent."""
+
+    def __init__(self, env):
+        self.name = 'RDPG'  # name for uploading results
+        self.environment = env
+        # Randomly initialize the actor and critic networks
+        # together with their target networks.
+        self.state_dim = env.observation_space.shape[0]
+        self.action_dim = env.action_space.shape[0]
+
+        self.sess = tf.InteractiveSession()
+
+        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
+        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
+
+        # Initialize the replay buffer.
+        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
+
+        # Initialize an Ornstein-Uhlenbeck process for action exploration.
+        self.exploration_noise = OUNoise(self.action_dim)
+
+        self.saver = tf.train.Saver()
+
+    def train(self):
+        # Sample a random minibatch of N sequences from the replay buffer.
+        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
+        # Unroll each sequence into partial histories of increasing length.
+        observations = []
+        next_observations = []
+        actions = []
+        rewards = []
+        dones = []
+        for each in minibatch:
+            for i in range(1, len(each.observations)):
+                observations.append(self.pad(each.observations[0:i]))
+                next_observations.append(self.pad(each.observations[1:i + 1]))
+                actions.append(each.actions[0:i - 1])
+                rewards.append(each.rewards[0:i])
+                dones.append(i == len(each.observations) - 1)
+        # Calculate y_batch from the target networks.
+        next_action_batch = self.actor_network.target_action(observations)
+        # Append the target actor's next action to each action history before padding.
+        q_value_batch = self.critic_network.target_q(
+            next_observations,
+            [self.pad(i + [j]) for (i, j) in zip(actions, next_action_batch)])
+        y_batch = []
+        for i in range(len(observations)):
+            if dones[i]:
+                y_batch.append(rewards[i][-1])
+            else:
+                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
+        y_batch = np.resize(y_batch, [len(observations), 1])
+        # Update the critic by minimizing the loss L.
+        self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions])
+
+        # Update the actor policy using the sampled policy gradient.
+        action_batch_for_gradients = self.actor_network.actions(observations)
+        q_gradient_batch = self.critic_network.gradients(observations, action_batch_for_gradients)
+
+        self.actor_network.train(q_gradient_batch, observations)
+
+        # Update the target networks.
+        self.actor_network.update_target()
+        self.critic_network.update_target()
+
+    def save_model(self, path, episode):
+        self.saver.save(self.sess, path + "model.ckpt", global_step=episode)
+
+    def noise_action(self, history):
+        # Select action a_t from the observation history, plus exploration noise.
+        action = self.actor_network.action(history)
+        return action + self.exploration_noise.noise()
+
+    def action(self, history):
+        return self.actor_network.action(history)
+
+    def perceive(self, history, done):
+        # Store the history sequence in the replay buffer.
+        self.replay_buffer.add(history)
+
+        # Start training once the replay buffer holds enough histories.
+        if self.replay_buffer.count() > REPLAY_START_SIZE:
+            self.train()
+
+        # Re-initialize the random process when an episode ends.
+        if done:
+            self.exploration_noise.reset()
+
+    def pad(self, seq):
+        # Zero-pad a variable-length sequence to MAX_SEQUENCE_LENGTH entries.
+        # An empty sequence can only be an action history, so fall back to action_dim.
+        dim = len(seq[0]) if seq else self.action_dim
+        return list(seq) + [[0] * dim] * (MAX_SEQUENCE_LENGTH - len(seq))
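
For reviewers, here is a minimal sketch of how this agent might be driven, assuming a Gym-style continuous-control environment and a History object that exposes the observations, actions, and rewards attributes that train() reads. The History(observation) constructor, its append() call, the import path, the environment id, and feeding noise_action a padded observation sequence are all assumptions for illustration; they are not part of this diff.

# Hypothetical driver loop for the RDPG agent above (not part of this change).
import gym

from history import History
from rdpg import RDPG  # import path depends on how the package is laid out

env = gym.make("Pendulum-v0")  # any continuous-action task works
agent = RDPG(env)

for episode in range(1000):
    observation = env.reset()
    history = History(observation)        # assumed: seeded with the first observation
    observation_sequence = [observation]  # running history fed to the actor
    done = False
    while not done:
        # Act on the zero-padded observation history, with OU exploration noise.
        action = agent.noise_action(agent.pad(observation_sequence))
        observation, reward, done, _ = env.step(action)
        history.append(observation, action, reward)  # assumed signature
        observation_sequence.append(observation)
    # Store the finished sequence, possibly train, and reset the OU process.
    agent.perceive(history, done)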