[687a25]: /ddpg/ddpg.py

# -----------------------------------
# Deep Deterministic Policy Gradient
# Author: Kaizhao Liang
# Date: 08.11.2017
# -----------------------------------
import tensorflow as tf
import numpy as np
from ou_noise import OUNoise
from critic_network import CriticNetwork
from actor_network import ActorNetwork
from replay_buffer import ReplayBuffer
from helper import *

# Hyper Parameters:
REPLAY_BUFFER_SIZE = 1000000
REPLAY_START_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.995


def make_session(num_cpu):
    """Returns a session that will use <num_cpu> CPUs only."""
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu,
        device_count={'GPU': 0})
    return tf.InteractiveSession(config=tf_config)


class DDPG:
    """DDPG agent with a distributional (categorical) critic over a fixed support."""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 58    # env.observation_space.shape[0]
        self.action_dim = 18   # env.action_space.shape[0]
        # Parameters of the categorical return distribution
        self.atoms = 11
        self.v_max = 5
        self.v_min = -5
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1.)
        self.z = np.tile(np.asarray([self.v_min + i * self.delta_z for i in range(self.atoms)]).astype(np.float32),
                         (BATCH_SIZE, 1))  # shape (BATCH_SIZE, atoms)
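        # With atoms = 11, v_min = -5, v_max = 5 the support works out to the
        # evenly spaced grid z = [-5, -4, ..., 4, 5] with delta_z = 1, tiled
        # per batch row; train() below projects the Bellman target
        # distribution onto this fixed support before training the critic.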
        self.sess = make_session(3)
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.atoms, self.z)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize the Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # make sure actions have shape (BATCH_SIZE, action_dim)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Compute the projected target distribution m_batch: apply the Bellman
        # operator to each support atom, clip it to [v_min, v_max], and
        # redistribute the target critic's probability mass onto the fixed support.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        # distribution over atoms from the target critic, shape (BATCH_SIZE, atoms)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        done_batch = np.asarray([0. if done else 1. for done in done_batch])
        Tz = np.minimum(self.v_max,
                        np.maximum(self.v_min,
                                   reward_batch[:, np.newaxis] + GAMMA * self.z * done_batch[:, np.newaxis]))
        b = (Tz - self.v_min) / self.delta_z
        l, u = np.floor(b + 1e-3).astype(int), np.ceil(b - 1e-3).astype(int)
        p = q_value_batch
        m_batch = np.zeros((BATCH_SIZE, self.atoms))
        A = p * (u - b)
        B = p * (b - l)
        for i in range(BATCH_SIZE):
            for j in range(self.atoms):
                if l[i, j] == u[i, j]:
                    # b falls (almost) exactly on an atom: both split weights
                    # vanish, so assign the full mass to that atom instead
                    m_batch[i, l[i, j]] += p[i, j]
                else:
                    m_batch[i, l[i, j]] += A[i, j]
                    m_batch[i, u[i, j]] += B[i, j]
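        # Illustrative (not executed) example of the projection above: with
        # delta_z = 1, reward 0.7, GAMMA = 0.995 and a non-terminal atom
        # z_j = 2, Tz = 0.7 + 0.995 * 2 = 2.69 and b = (2.69 + 5) / 1 = 7.69,
        # so l = 7, u = 8, and that atom's probability p_j is split as
        # 0.31 * p_j onto support index 7 and 0.69 * p_j onto index 8.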

        # Update critic by minimizing the loss L
        self.critic_network.train(m_batch.astype(np.float32), state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        q_gradient_batch *= -1.
        # Inverting gradients: scale dq by (a_max - a) when dq > 0 and by
        # (a - a_min) otherwise, with the bounds softened to a_max = 0.95,
        # a_min = 0.05, so updates shrink (and eventually reverse) as an
        # action approaches its bound.
        for i in range(BATCH_SIZE):
            for j in range(self.action_dim):
                dq = q_gradient_batch[i, j]
                a = action_batch_for_gradients[i, j]
                if dq > 0.:
                    q_gradient_batch[i, j] *= (0.95 - a)
                else:
                    q_gradient_batch[i, j] *= (a - 0.05)
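        # A vectorized equivalent of the loop above (sketch only, kept as a
        # comment so the original control flow is unchanged):
        #   positive = q_gradient_batch > 0.
        #   q_gradient_batch = np.where(positive,
        #                               q_gradient_batch * (0.95 - action_batch_for_gradients),
        #                               q_gradient_batch * (action_batch_for_gradients - 0.05))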
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}, done) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        # Start training once the buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
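
# ---------------------------------------------------------------------------
# Minimal usage sketch (commented out; not part of the original training code).
# It assumes a Gym-style environment whose observation/action sizes match the
# dimensions hard-coded in __init__; make_env() is a hypothetical placeholder.
# ---------------------------------------------------------------------------
# if __name__ == '__main__':
#     env = make_env()                              # hypothetical constructor
#     agent = DDPG(env)
#     for episode in range(1000):
#         state = env.reset()
#         done = False
#         while not done:
#             action = agent.noise_action(state)    # policy action + OU noise
#             next_state, reward, done, _ = env.step(action)
#             agent.perceive(state, action, reward, next_state, done)
#             state = next_state
#         agent.save_model('./checkpoints/', episode)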