[687a25]: / rdpg / rdpg.py


# -----------------------------------
# Recurrent Deep Deterministic Policy Gradient
# Author: Kaizhao Liang
# Date: 08.11.2017
# -----------------------------------
import tensorflow as tf
import numpy as np
from ou_noise import OUNoise
from critic_network import CriticNetwork
from actor_network import ActorNetwork
from replay_buffer import ReplayBuffer
from history import History
# Hyper Parameters:
REPLAY_BUFFER_SIZE = 1000000
REPLAY_START_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.8
class RDPG:
    """Recurrent Deterministic Policy Gradient agent."""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize the actor and critic networks
        # together with their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # Initialize the replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.saver = tf.train.Saver()
    def train(self):
        # Sample a random minibatch of N sequences from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories: every prefix of each sampled sequence becomes one training example
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i+1]))
                actions.append(each.actions[0:i-1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch from the target networks (Bellman backup)
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(a + [next_a]) for (a, next_a) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update the critic by minimizing the loss L
        self.critic_network.train(y_batch, observations, [self.pad(a) for a in actions])
        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(observations, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, observations)
        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", global_step=episode)
    def noise_action(self, history):
        # Select action a_t according to the sequence of observations and actions so far
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action
    def perceive(self, history, done):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)
        # Start training once the buffer holds more than REPLAY_START_SIZE histories
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
    def pad(self, seq):
        # Zero-pad a sequence of vectors to the fixed maximum episode length of 1000 steps;
        # an empty action prefix falls back to the action dimension
        dim = len(seq[0]) if seq else self.action_dim
        return seq + [[0] * dim] * (1000 - len(seq))
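
# -----------------------------------
# Usage sketch: a minimal training loop, assuming a gym-style
# continuous-control environment. The Episode class below is a
# hypothetical stand-in for the imported History container; it only
# provides the .observations / .actions / .rewards attributes that
# RDPG.train() reads from each buffered sequence. The environment name,
# episode count, and the input format passed to noise_action() are
# illustrative assumptions.
# -----------------------------------
if __name__ == '__main__':
    import gym

    class Episode(object):
        """Hypothetical sequence container with the attributes train() expects."""
        def __init__(self):
            self.observations = []
            self.actions = []
            self.rewards = []

    env = gym.make('Pendulum-v0')
    agent = RDPG(env)
    for _ in range(100):  # number of training episodes (illustrative)
        episode = Episode()
        state = env.reset()
        episode.observations.append(list(state))
        done = False
        while not done:
            # Act on the padded sequence of observations seen so far
            action = agent.noise_action(agent.pad(episode.observations))
            state, reward, done, _ = env.step(action)
            episode.actions.append(list(action))
            episode.rewards.append(reward)
            episode.observations.append(list(state))
        # Store the finished episode; training starts once the buffer is warm
        agent.perceive(episode, done)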