# A3C/model.py

from ou_noise import OUNoise

from helper import *

import opensim as osim
from osim.http.client import Client
from osim.env import *

import threading
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from time import sleep
from time import time
from time import gmtime, strftime
import multiprocessing
from multiprocessing import Process, Pipe

# [Hack] the memory might always be leaking; here's a solution (issue #58):
# https://github.com/stanfordnmbl/osim-rl/issues/58
# A separate process holds a separate RunEnv instance.
# This has to be done since RunEnv() instances in the same process result in interleaved running of simulations.
def floatify(arr):
    # convert a numpy array (or any indexable sequence) into a plain list of Python floats
    return [float(arr[i]) for i in range(len(arr))]

def standalone_headless_isolated(conn,vis):
    e = RunEnv(visualize=vis)

    while True:
        try:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be a string

            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            else:
                conn.close()
                del e
                return
        except:
            conn.close()
            del e
            raise

# class that manages the interprocess communication and exposes itself as a RunEnv.
class ei: # Environment Instance
    def __init__(self,vis):
        self.pc, self.cc = Pipe()
        self.p = Process(
            target = standalone_headless_isolated,
            args=(self.cc,vis,)
        )
        self.p.daemon = True
        self.p.start()

    def reset(self):
        self.pc.send(('reset',))
        return self.pc.recv()

    def step(self,actions):
        self.pc.send(('step',actions,))
        try:
            return self.pc.recv()
        except EOFError:
            return None

    def __del__(self):
        self.pc.send(('exit',))
        #print('(ei)waiting for join...')
        self.p.join()
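
# Illustrative usage of the wrapper (not executed here): each Worker below creates
# one instance per environment restart and talks to it exactly like a RunEnv, e.g.
#   env = ei(vis=False)
#   observation = env.reset()
#   observation, reward, done, info = env.step(action)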

# ================================================================
# Model components
# ================================================================

# Actor-Critic Network ------------------------------------------------------------------------------------------------
class AC_Network():
    def __init__(self,s_size,a_size,scope,trainer):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)

            # Create the model, use the default arg scope to configure the batch norm parameters.
            '''
            conv1 = tf.nn.elu(tf.nn.conv1d(self.imageIn,tf.truncated_normal([2,1,8],stddev=0.1),2,padding='VALID'))
            conv2 = tf.nn.elu(tf.nn.conv1d(conv1,tf.truncated_normal([3,8,16],stddev=0.05),1,padding='VALID'))

            hidden = slim.fully_connected(slim.flatten(conv2),200,activation_fn=tf.nn.elu)'''

            hidden1 = slim.fully_connected(self.inputs,300,activation_fn=tf.nn.elu)
            hidden2 = slim.fully_connected(hidden1,200,activation_fn=tf.nn.elu)

            #Output layers for policy and value estimations
            mu = slim.fully_connected(hidden2,a_size,activation_fn=tf.sigmoid,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
            var = slim.fully_connected(hidden2,a_size,activation_fn=tf.nn.softplus,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
            self.normal_dist = tf.contrib.distributions.Normal(mu, 0.05)
            self.policy = tf.clip_by_value(self.normal_dist.sample(1),0.0,1.0) # self.normal_dist.sample(1)
            self.value = slim.fully_connected(hidden2,1,
                                              activation_fn=None,
                                              weights_initializer=normalized_columns_initializer(1.0),
                                              biases_initializer=None)
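
            # NOTE: the policy is a Gaussian with a fixed standard deviation of 0.05
            # around the sigmoid-bounded mean `mu`; the softplus `var` head above is
            # built but never fed into the distribution, so it is effectively unused.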

            #Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.log_prob = tf.reduce_sum(self.normal_dist.log_prob(self.actions),axis=1)
                self.entropy = tf.reduce_sum(self.normal_dist.entropy(),axis=1) # encourage exploration
                self.entropy = tf.reduce_sum(self.entropy,axis=0)
                self.policy_loss = -tf.reduce_sum(self.log_prob*self.advantages,axis=0)

                self.loss = 0.5 * self.value_loss + self.policy_loss - 0.01 * self.entropy
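
                # Standard A3C objective: critic (value) loss plus policy-gradient loss,
                # minus an entropy bonus (coefficient 0.01) that discourages premature
                # convergence to a deterministic policy. Since value_loss above already
                # carries a factor of 0.5, the value term is effectively weighted 0.25.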

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)

                #Apply local gradients to global network
                #Comment these two lines out to stop training
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
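
                # NOTE: zip(grads,global_vars) pairs local gradients with global variables
                # by position, which relies on the 'global' scope and each worker scope
                # creating their trainable variables in exactly the same order.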

# Learning to Run Worker -------------------------------------------------------------------------------------------
class Worker():
    def __init__(self,name,s_size,a_size,trainer,model_path,global_episodes,is_training):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))
        self.is_training = is_training

        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer)
        #self.local_AC_target = AC_Network(s_size,a_size,self.name+'/target',trainer,noisy)
        self.update_local_ops = update_target_graph('global',self.name)
        #self.update_local_ops_target = update_target_graph('global/target',self.name+'/target')
        #self.update_global_target = update_target_network(self.name,'global/target')

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(a_size)
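
        # OUNoise (from ou_noise.py) is assumed to implement an Ornstein-Uhlenbeck
        # process, which yields temporally correlated exploration noise; this usually
        # suits continuous muscle-activation control better than independent
        # per-step Gaussian noise.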

    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        # reward clipping: scale and clip the values of the rewards to the range -1,+1
        #rewards = (rewards - np.mean(rewards)) / np.max(abs(rewards))

        next_observations = rollout[:,3] # Aug 1st, notice next observation is never used
        values = rollout[:,5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation".
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)
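
        # Equivalently, with delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), the advantage used
        # here is A_t = sum_k gamma^k * delta_{t+k}, i.e. Generalized Advantage Estimation
        # with lambda = 1. discount() is assumed to be the helper that computes such
        # discounted cumulative sums (imported from helper.py).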

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
                     self.local_AC.inputs:np.vstack(observations),
                     self.local_AC.actions:np.vstack(actions),
                     self.local_AC.advantages:advantages}
        l,v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.loss,
                                            self.local_AC.value_loss,
                                            self.local_AC.policy_loss,
                                            self.local_AC.entropy,
                                            self.local_AC.grad_norms,
                                            self.local_AC.var_norms,
                                            self.local_AC.apply_grads],
                                           feed_dict=feed_dict)
        return l / len(rollout), v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
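
        # The losses are divided by the rollout length so the returned statistics are
        # per-step averages, which keeps logged values comparable across episodes of
        # different lengths.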

    def work(self,max_episode_length,gamma,sess,coord,saver):
        if self.is_training:
            episode_count = sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker " + str(self.number))
        with open('result.txt','w') as f:
            f.write(strftime("Starting time: %a, %d %b %Y %H:%M:%S\n", gmtime()))

        explore = 1000

        if self.name == 'worker_1':
            self.env = ei(vis=False)#RunEnv(visualize=True)
        else:
            self.env = ei(vis=False)#RunEnv(visualize=False)

        with sess.as_default(), sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                # restart the env (in its own process) every 50 episodes to work around the memory leak
                if episode_count % 50 == 0:
                    if self.env != None:
                        del self.env
                    if self.name == 'worker_1':
                        self.env = ei(vis=True)#RunEnv(visualize=True)
                    else:
                        self.env = ei(vis=False)#RunEnv(visualize=False)
                self.setting=2

                sess.run(self.update_local_ops)
                #sess.run(self.update_local_ops_target)
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_step_count = 0
                done = False

                seed = np.random.rand()

                self.env.reset()
                # engineered initial input to make the agent's life easier
                a=engineered_action(seed)
                ob = self.env.step(a)[0]
                s = ob
                ob = self.env.step(a)[0]
                s1 = ob
                s = process_state(s,s1)
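                # process_state (assumed to come from the helper module's wildcard import)
                # combines two consecutive observations into a single network input
                # (e.g. so velocities can be inferred); the two warm-up steps above exist
                # to collect that first pair of observations.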
                explore -= 1
                #st = time()
                chese=0
                while done == False:
                    #Take an action using probabilities from the policy network output.
                    action,v = sess.run([self.local_AC.policy,self.local_AC.value],
                                        feed_dict={self.local_AC.inputs:[s]})
                    if not (episode_count % 5 == 0 and self.name == 'worker_1') and self.is_training:
                        if explore > 0: # > 0 turns on OU noise # worker_1 tests the agent every 5 eps
                            a = np.clip(action[0,0]+self.exploration_noise.noise(),0.0,1.0)
                        else:
                            a = action[0,0]
                        if chese < 60 and episode_count < 250:
                            a=engineered_action(seed)
                            chese += 1
                    else:
                        a = action[0,0]
                    ob,r,done,_ = self.env.step(a)
                    '''
                    if self.name == 'worker_0':
                        ct = time()
                        print(ct-st)
                        st = ct
                    '''
                    if done == False:
                        s2 = ob
                    else:
                        s2 = s1
                    s1 = process_state(s1,s2)
                    #print(s1)
                    episode_buffer.append([s,a,r,s1,done,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1
                    s1 = s2
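                    # Bookkeeping for the next step: `s` holds the processed state that is
                    # fed to the network, while `s1` keeps the latest raw observation so it
                    # can be paired with the next frame by process_state().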
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended but the experience buffer is full, we
                    # could make an update step using that experience rollout.
                    '''
                    if len(episode_buffer) == 120 and done != True and episode_step_count != max_episode_length - 1: # change episode length to 5, and try to modify Worker.train() to utilize the next frame to train the imagined frame.
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        if self.is_training:
                            v1 = sess.run(self.local_AC.value,
                                feed_dict={self.local_AC.inputs:[s]})[0,0]
                            l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                            sess.run(self.update_local_ops)
                        episode_buffer = []
                    '''
                    if done == True:
                        break

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    if self.is_training:
                        l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                        #print(l,v_l,p_l,e_l,g_n,v_n)
                        #sess.run(self.update_global_target)


                # Periodically save model parameters and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    if self.is_training:
                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()
                    if self.name == 'worker_1':
                        with open('result.txt','a') as f:
                            f.write("Episode "+str(episode_count)+" reward (testing): %.2f\n" % episode_reward)
                    if self.name == 'worker_0':
                        with open('result.txt','a') as f:
                            f.write("Episodes "+str(episode_count)+" mean reward (training): %.2f\n" % mean_reward)

                    if episode_count % 100 == 0:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        with open('result.txt','a') as f:
                            f.write("Saved Model at episode: "+str(episode_count)+"\n")
                if self.name == 'worker_0' and self.is_training:
                    sess.run(self.increment)

                episode_count += 1

                if self.name == "worker_1" and episode_reward > 2.:
                    wining_episode_count += 1
                    print('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))
                    with open('result.txt','a') as f:
                        f.write('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%\n'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))

        # All done; stop the trial
        # Confirm exit
        print('Exit/Done '+self.name)