rdpg/rdpg.py

# -----------------------------------
# Recurrent Deep Deterministic Policy Gradient
# Author: Kaizhao Liang
# Date: 08.11.2017
# -----------------------------------
import tensorflow as tf
import numpy as np

from ou_noise import OUNoise
from critic_network import CriticNetwork
from actor_network import ActorNetwork
from replay_buffer import ReplayBuffer
from history import History

# Hyper Parameters:

REPLAY_BUFFER_SIZE = 1000000
REPLAY_START_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.8
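
# The agent below follows the DDPG recipe (actor-critic with target networks, an
# Ornstein-Uhlenbeck exploration process, and experience replay), except that the
# replay buffer stores whole observation/action/reward sequences and both networks
# are fed padded histories instead of single states, which is what makes the policy
# "recurrent" (cf. Heess et al., "Memory-based control with recurrent neural
# networks", 2015).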
|
|
class RDPG:
    """Recurrent Deep Deterministic Policy Gradient agent."""

    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize the replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
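
        # Each sample i now holds a padded observation prefix h_i = (o_1, ..., o_i), its
        # successor prefix h_{i+1}, the rewards collected so far, and a terminal flag.
        # The targets computed next are the recurrent Bellman backups
        #   y_i = r_i                                         if h_i ends the episode
        #   y_i = r_i + GAMMA * Q'(h_{i+1}, mu'(h_{i+1}))     otherwise,
        # where Q' and mu' denote the target critic and target actor.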
|
|
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions])
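        # (CriticNetwork.train is expected to minimize the mean squared TD error
        # L = 1/N * sum_i (y_i - Q(h_i, a_i))^2 with respect to the critic weights;
        # the actual optimizer lives in critic_network.py.)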
|
|
        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)
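        # This is the sampled deterministic policy gradient,
        #   grad_theta J ~= 1/N * sum_i grad_a Q(h_i, a)|_{a=mu(h_i)} * grad_theta mu(h_i),
        # with dQ/da supplied by the critic and the chain rule presumably applied
        # inside ActorNetwork.train.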
|
|
        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", global_step=episode)

    def noise_action(self, history):
        # Select action a_t according to the current sequence of observations and actions
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()
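
    # Note: OUNoise yields temporally correlated exploration noise, the usual choice
    # for DDPG-style agents in continuous action spaces; action() below returns the
    # deterministic policy output without exploration noise.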
|
|
    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Start training once the buffer holds more than REPLAY_START_SIZE histories
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, seq):
        # Zero-pad a history to a fixed length of 1000 steps so batches have a uniform shape
        dim = len(seq[0])
        return seq + [[0] * dim] * (1000 - len(seq))
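
# A minimal usage sketch (illustrative only). The History interface is defined in
# history.py and is not shown in this file, so the constructor and append() call
# below are assumptions; the gym calls follow the standard Env API.
#
#     import gym
#     env = gym.make('Pendulum-v0')
#     agent = RDPG(env)
#     for episode in range(1000):
#         observation = env.reset()
#         history = History(observation)                    # assumed constructor
#         done = False
#         while not done:
#             action = agent.noise_action(history)
#             observation, reward, done, _ = env.step(action)
#             history.append(observation, action, reward)   # assumed method
#             agent.perceive(history, done)
#         agent.save_model('checkpoints/', episode)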
|
|