Diff of /rdpg/rdpg.py [000000] .. [687a25]


# -----------------------------------
# Recurrent Deep Deterministic Policy Gradient
# Author: Kaizhao Liang
# Date: 08.11.2017
# -----------------------------------
import tensorflow as tf
import numpy as np
from ou_noise import OUNoise
from critic_network import CriticNetwork
from actor_network import ActorNetwork
from replay_buffer import ReplayBuffer
from history import History

# Hyper Parameters:

REPLAY_BUFFER_SIZE = 1000000
REPLAY_START_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.8


class RDPG:
    """Recurrent Deep Deterministic Policy Gradient agent."""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize the actor and critic networks
        # together with their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize the replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories: for every prefix of each sampled sequence, build
        # the padded observation history, the next-observation history, the
        # actions taken so far, and the rewards received so far
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i+1]))
                actions.append(each.actions[0:i-1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch from the target networks
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update the critic by minimizing the loss L
        self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions])

        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

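    # The targets computed in train() follow the DDPG update applied to
    # observation histories h_t:
    #     y_t = r_t                                      if h_{t+1} is terminal
    #     y_t = r_t + GAMMA * Q'(h_{t+1}, mu'(h_{t+1}))  otherwise
    # where Q' is the target critic and mu' the target actor. Histories are
    # zero-padded to a fixed length of 1000 steps by pad() below, so the
    # recurrent networks always receive fixed-size inputs.
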
    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to the current sequence of observations and actions
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE histories
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, input):
        # Zero-pad a sequence of vectors to a fixed length of 1000 steps
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
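

# ---------------------------------------------------------------------------
# Minimal usage sketch, assuming a gym-style environment ('Pendulum-v0' here
# is only an example) and that actor_network.action() accepts the same padded
# observation sequences that pad() produces. The History constructor is not
# shown in this file, so storing the finished episode via perceive() is only
# indicated in a comment.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')               # assumed environment
    agent = RDPG(env)
    for episode in range(1000):
        observation = env.reset()
        observation_seq = [list(observation)]   # running observation history
        done = False
        while not done:
            # Act on the zero-padded observation history with OU exploration noise
            action = agent.noise_action(agent.pad(observation_seq))
            observation, reward, done, _ = env.step(action)
            observation_seq.append(list(observation))
        # agent.perceive(history, done)         # needs a History object (constructor not shown here)
        agent.exploration_noise.reset()         # reset the OU process between episodes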