|
A3C/model (lstm).py

from ou_noise import OUNoise

from helper import *

import opensim as osim
from osim.http.client import Client
from osim.env import *

import threading
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from time import sleep
from time import time
from time import gmtime, strftime
import multiprocessing
from multiprocessing import Process, Pipe
# [Hacked] the memory might always be leaking, here's a solution #58
# https://github.com/stanfordnmbl/osim-rl/issues/58
# Keep a separate process that holds its own RunEnv instance.
# This has to be done since multiple RunEnv() instances in the same process result in interleaved running of simulations.
def floatify(x):
    return [float(x[i]) for i in range(len(x))]

def standalone_headless_isolated(conn, vis):
    e = RunEnv(visualize=vis)

    while True:
        try:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be a string

            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            else:
                conn.close()
                del e
                return
        except:
            conn.close()
            del e
            raise
# class that manages the interprocess communication and exposes itself as a RunEnv.
class ei: # Environment Instance
    def __init__(self, vis):
        self.pc, self.cc = Pipe()
        self.p = Process(
            target=standalone_headless_isolated,
            args=(self.cc, vis,)
        )
        self.p.daemon = True
        self.p.start()

    def reset(self):
        self.pc.send(('reset',))
        return self.pc.recv()

    def step(self, actions):
        self.pc.send(('step', actions,))
        try:
            return self.pc.recv()
        except EOFError:
            return None

    def __del__(self):
        self.pc.send(('exit',))
        #print('(ei)waiting for join...')
        self.p.join()
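
# --- Illustrative only (not used elsewhere in this file): a minimal sketch of how the `ei`
# proxy is meant to be driven, assuming the 18-dimensional muscle-activation actions of RunEnv.
def _ei_usage_sketch():
    env = ei(vis=False)
    first_obs = env.reset()                            # forwarded to the child as a ('reset',) message
    obs, reward, done, info = env.step([0.05] * 18)    # forwarded as a ('step', action) message
    del env                                            # __del__ sends ('exit',) and joins the child process
    return first_obs, obs, reward, done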
|
|
# Added by Andrew Liao
# for NoisyNet-DQN (using Factorised Gaussian noise)
# modified from the `dense` function
def sample_noise(shape):
    noise = np.random.normal(size=shape).astype(np.float32)
    #noise = np.ones(size=shape).astype(np.float32) # whenever not in training, simply return a matrix of ones.
    return noise

global_p_a = 0.
global_q_a = 0.
global_p_v = 0.
global_q_v = 0.

def noisy_dense(x, size, name, bias=True, activation_fn=tf.identity, factorized=False):

    global global_p_a
    global global_q_a
    global global_p_v
    global global_q_v
    # https://arxiv.org/pdf/1706.10295.pdf page 4
    # the function used in eq.7,8 : f(x)=sgn(x)*sqrt(|x|)
    def f(x):
        return tf.multiply(tf.sign(x), tf.pow(tf.abs(x), 0.5))
    # Initializers of \mu and \sigma
    mu_init = tf.random_uniform_initializer(minval=-1*1/np.power(x.get_shape().as_list()[1], 0.5),
                                            maxval=1*1/np.power(x.get_shape().as_list()[1], 0.5))
    sigma_init = tf.constant_initializer(0.4/np.power(x.get_shape().as_list()[1], 0.5))
    # Sample noise from a gaussian
    if name == 'global':
        if size == 18: # action head of the global network
            p = sample_noise([128, 18]) # 128 is rnn_size
            global_p_a = p
            q = sample_noise([1, 18]) # 18 is action size
            global_q_a = q
        else:          # value head of the global network
            p = sample_noise([128, 1]) # 128 is rnn_size
            global_p_v = p
            q = sample_noise([1, 1]) # 1 is value size
            global_q_v = q
    else: # for actors, copy p & q from the global network
        if size == 18: # action head of a worker network
            p = global_p_a
            q = global_q_a
        else:
            p = global_p_v
            q = global_q_v

    f_p = f(p); f_q = f(q)
    w_epsilon = f_p*f_q; b_epsilon = tf.squeeze(f_q)
    if not factorized: # just resample the noise matrix to get independent gaussian noise
        w_epsilon = tf.identity(sample_noise(w_epsilon.get_shape().as_list()))
    # w = w_mu + w_sigma*w_epsilon
    options = {18: 'action', 1: 'value'}
    w_mu = tf.get_variable(name + "/w_mu" + options[size], [x.get_shape()[1], size], initializer=mu_init)
    w_sigma = tf.get_variable(name + "/w_sigma" + options[size], [x.get_shape()[1], size], initializer=sigma_init)
    w = w_mu + tf.multiply(w_sigma, w_epsilon)
    ret = tf.matmul(x, w)
    if bias:
        # b = b_mu + b_sigma*b_epsilon
        b_mu = tf.get_variable(name + "/b_mu" + options[size], [size], initializer=mu_init)
        b_sigma = tf.get_variable(name + "/b_sigma" + options[size], [size], initializer=sigma_init)
        b = b_mu + tf.multiply(b_sigma, b_epsilon)
        return activation_fn(ret + b)
    else:
        return activation_fn(ret)
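
# --- Illustrative only: a NumPy sketch of the factorised Gaussian noise construction that
# noisy_dense is based on (NoisyNet, arXiv:1706.10295), assuming in_dim inputs and out_dim
# outputs; the TF code above samples its own shapes and resamples when factorized=False.
def _factorised_noise_sketch(in_dim=128, out_dim=18):
    f = lambda z: np.sign(z) * np.sqrt(np.abs(z))               # f(x) = sgn(x) * sqrt(|x|)
    p = np.random.normal(size=(in_dim, 1)).astype(np.float32)   # one noise value per input unit
    q = np.random.normal(size=(1, out_dim)).astype(np.float32)  # one noise value per output unit
    w_epsilon = f(p) * f(q)          # broadcasts to an (in_dim, out_dim) weight-noise matrix
    b_epsilon = np.squeeze(f(q))     # (out_dim,) bias noise
    return w_epsilon, b_epsilon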
|
|
# ================================================================
# Model components
# ================================================================

# Actor Network------------------------------------------------------------------------------------------------------------
class AC_Network():
    def __init__(self, s_size, a_size, scope, trainer, noisy):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs,shape=[-1,s_size,1])

            # Create the model, use the default arg scope to configure the batch norm parameters.
            ''' conv1 = tf.nn.elu(tf.nn.conv1d(self.imageIn,tf.truncated_normal([2,1,8],stddev=0.1),2,padding='VALID'))
            conv2 = tf.nn.elu(tf.nn.conv1d(conv1,tf.truncated_normal([3,8,16],stddev=0.05),1,padding='VALID'))

            hidden = slim.fully_connected(slim.flatten(conv2),200,activation_fn=tf.nn.elu)'''

            hidden = slim.fully_connected(slim.flatten(self.imageIn),300,activation_fn=tf.nn.elu)
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(128,dropout_keep_prob=0.8)
            #lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell,output_keep_prob=0.5)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 128])

            if noisy:
                # Apply noisy network on the fully connected layers
                # ref: https://arxiv.org/abs/1706.10295
                self.policy = tf.clip_by_value(noisy_dense(rnn_out,name=scope, size=a_size, activation_fn=tf.nn.relu),0.0,1.0)
                self.value = noisy_dense(rnn_out,name=scope, size=1) # default activation_fn=tf.identity
            else:
                #Output layers for policy and value estimations
                mu = slim.fully_connected(rnn_out,a_size,activation_fn=tf.nn.elu,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
                #var = slim.fully_connected(rnn_out,a_size,activation_fn=tf.nn.softplus,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
                self.normal_dist = tf.contrib.distributions.Normal(mu, 0.05)
                self.policy = tf.clip_by_value(self.normal_dist.sample(1),0.0,1.0)
                self.value = slim.fully_connected(rnn_out,1,
                                                  activation_fn=None,
                                                  weights_initializer=normalized_columns_initializer(1.0),
                                                  biases_initializer=None)

            #Only the worker network needs ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                #Loss functions
                # NOTE: self.normal_dist is only defined in the non-noisy branch above, so the
                # log-prob/entropy terms below assume noisy=False for worker scopes.
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.log_prob = tf.reduce_sum(self.normal_dist.log_prob(self.actions),axis=1)
                self.entropy = tf.reduce_sum(self.normal_dist.entropy(),axis=1) # encourage exploration
                self.entropy = tf.reduce_sum(self.entropy,axis=0)
                self.policy_loss = -tf.reduce_sum(self.log_prob*self.advantages,axis=0)
                if noisy:
                    self.loss = 0.5 * self.value_loss + self.policy_loss
                else:
                    self.loss = 0.5 * self.value_loss + self.policy_loss #- 0.01 * self.entropy

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)

                #Apply local gradients to global network
                #Comment these two lines out to stop training
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
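
# --- Illustrative only: a minimal sketch of how AC_Network instances are typically wired up
# in A3C (one shared 'global' network plus per-worker copies). The real setup lives in the
# training script; s_size, a_size and the optimizer here are assumptions, not values from this file.
def _build_networks_sketch(s_size=82, a_size=18, n_workers=2, noisy=False):
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    master_network = AC_Network(s_size, a_size, 'global', None, noisy)      # holds the shared weights
    workers = [AC_Network(s_size, a_size, 'worker_%d' % i, trainer, noisy)  # push gradients to 'global'
               for i in range(n_workers)]
    return master_network, workers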
|
|
# Learning to run Worker------------------------------------------------------------------------------------------------------------
class Worker():
    def __init__(self, name, s_size, a_size, trainer, model_path, global_episodes, noisy, is_training):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))
        self.noisy = noisy
        self.is_training = is_training

        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer,noisy)
        #self.local_AC_target = AC_Network(s_size,a_size,self.name+'/target',trainer,noisy)
        self.update_local_ops = update_target_graph('global',self.name)
        #self.update_local_ops_target = update_target_graph('global/target',self.name+'/target')
        #self.update_global_target = update_target_network(self.name,'global/target')

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(a_size)
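
    # --- Illustrative only: OUNoise (from ou_noise.py) is assumed to generate temporally
    # correlated exploration noise via an Ornstein-Uhlenbeck update roughly of the form
    #
    #     state += theta * (mu - state) + sigma * np.random.randn(a_size)
    #
    # (theta, mu, sigma values are not specified in this file), so consecutive noise samples
    # drift smoothly instead of being drawn independently at every step.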
|
|
    def train(self, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        # reward clipping: scale and clip the values of the rewards to the range -1,+1
        #rewards = (rewards - np.mean(rewards)) / np.max(abs(rewards))

        next_observations = rollout[:,3] # Aug 1st, notice next observation is never used
        values = rollout[:,5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
                     self.local_AC.inputs:np.vstack(observations),
                     self.local_AC.actions:np.vstack(actions),
                     self.local_AC.advantages:advantages,
                     self.local_AC.state_in[0]:rnn_state[0],
                     self.local_AC.state_in[1]:rnn_state[1]}
        l,v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.loss,
                                            self.local_AC.value_loss,
                                            self.local_AC.policy_loss,
                                            self.local_AC.entropy,
                                            self.local_AC.grad_norms,
                                            self.local_AC.var_norms,
                                            self.local_AC.apply_grads],
                                           feed_dict=feed_dict)
        return l / len(rollout), v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
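
    # --- Illustrative only: `discount` is imported from helper.py. A minimal reference
    # implementation consistent with how it is used above (a discounted cumulative sum, so
    # `advantages` becomes the discounted sum of TD residuals r_t + gamma*V_{t+1} - V_t):
    #
    #     def discount(x, gamma):
    #         out = np.zeros_like(x, dtype=np.float64)
    #         running = 0.0
    #         for t in reversed(range(len(x))):
    #             running = x[t] + gamma * running
    #             out[t] = running
    #         return out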
|
|
    def work(self, max_episode_length, gamma, sess, coord, saver):
        if self.is_training:
            episode_count = sess.run(self.global_episodes)
        else:
            episode_count = 0
        winning_episode_count = 0
        total_steps = 0
        print("Starting worker " + str(self.number))
        with open('result.txt','w') as f:
            f.write(strftime("Starting time: %a, %d %b %Y %H:%M:%S\n", gmtime()))

        explore = 1000

        if self.name == 'worker_1':
            self.env = ei(vis=False)#RunEnv(visualize=True)
        else:
            self.env = ei(vis=False)#RunEnv(visualize=False)

        with sess.as_default(), sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                # restart the env (in its own process) every 50 eps to work around the memory leak
                if episode_count % 50 == 0:
                    if self.env is not None:
                        del self.env
                    if self.name == 'worker_1':
                        self.env = ei(vis=True)#RunEnv(visualize=True)
                    else:
                        self.env = ei(vis=False)#RunEnv(visualize=False)
                    self.setting = 2

                sess.run(self.update_local_ops)
                #sess.run(self.update_local_ops_target)
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_step_count = 0
                done = False

                seed = np.random.rand()

                self.env.reset()
                # engineered initial input to make the agent's life easier
                a = engineered_action(seed)
                ob = self.env.step(a)[0]
                s = ob
                ob = self.env.step(a)[0]
                s1 = ob
                s = process_state(s,s1)
                rnn_state = self.local_AC.state_init
                explore -= 1
                #st = time()
                chese = 0
                while done == False:
                    #Take an action using probabilities from policy network output.
                    action,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out],
                                                  feed_dict={self.local_AC.inputs:[s],
                                                             self.local_AC.state_in[0]:rnn_state[0],
                                                             self.local_AC.state_in[1]:rnn_state[1]})
                    if not (episode_count % 5 == 0 and self.name == 'worker_1') and self.is_training:
                        if explore > 0: # > 0 turns on OU noise; worker_1 runs a noise-free test episode every 5 eps
                            a = np.clip(action[0,0]+self.exploration_noise.noise(),0.0,1.0)
                        else:
                            a = action[0,0]
                        if chese < 60 and episode_count < 250:
                            a = engineered_action(seed)
                            chese += 1
                    else:
                        a = action[0,0]
                    ob,r,done,_ = self.env.step(a)
                    '''
                    if self.name == 'worker_0':
                        ct = time()
                        print(ct-st)
                        st = ct
                    '''
                    if done == False:
                        s2 = ob
                    else:
                        s2 = s1
                    s1 = process_state(s1,s2)
                    #print(s1)
                    episode_buffer.append([s,a,r,s1,done,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1
                    s1 = s2
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    '''
                    if len(episode_buffer) == 120 and done != True and episode_step_count != max_episode_length - 1:
                        # change episode length to 5, and try to modify Worker.train() to utilize the next frame to train an imagined frame.
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        if self.is_training:
                            v1 = sess.run(self.local_AC.value,
                                          feed_dict={self.local_AC.inputs:[s],
                                                     self.local_AC.state_in[0]:rnn_state[0],
                                                     self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                            l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                            sess.run(self.update_local_ops)
                            episode_buffer = []
                    '''
                    if done == True:
                        break
|
|
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    if self.is_training:
                        l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                        #print(l,v_l,p_l,e_l,g_n,v_n)
                        #sess.run(self.update_global_target)

                # Periodically save model parameters and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    if self.is_training:
                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()
                    if self.name == 'worker_1':
                        with open('result.txt','a') as f:
                            f.write("Episode "+str(episode_count)+" reward (testing): %.2f\n" % episode_reward)
                    if self.name == 'worker_0':
                        with open('result.txt','a') as f:
                            f.write("Episodes "+str(episode_count)+" mean reward (training): %.2f\n" % mean_reward)

                    if episode_count % 100 == 0:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        with open('result.txt','a') as f:
                            f.write("Saved Model at episode: "+str(episode_count)+"\n")
                if self.name == 'worker_0' and self.is_training:
                    sess.run(self.increment)

                episode_count += 1

                if self.name == "worker_1" and episode_reward > 2.:
                    winning_episode_count += 1
                    print('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%'.format(episode_count, episode_reward, int(winning_episode_count / episode_count * 100)))
                    with open('result.txt','a') as f:
                        f.write('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%\n'.format(episode_count, episode_reward, int(winning_episode_count / episode_count * 100)))

            # All done; stop the trial.
            # Confirm exit
            print('Exit/Done ' + self.name)
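
# --- Illustrative only: the actual training driver lives in a separate script. A minimal
# sketch of how Worker.work() is usually driven in A3C (hyper-parameters here are assumptions):
def _run_workers_sketch(workers, sess, saver, max_episode_length=1000, gamma=0.99):
    coord = tf.train.Coordinator()
    threads = []
    for worker in workers:
        t = threading.Thread(target=lambda w=worker: w.work(max_episode_length, gamma, sess, coord, saver))
        t.start()
        sleep(0.5)
        threads.append(t)
    coord.join(threads)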