--- a
+++ b/A3C/model.py
@@ -0,0 +1,355 @@
+from ou_noise import OUNoise
+
+from helper import *
+
+import opensim as osim
+from osim.http.client import Client
+from osim.env import *
+
+import threading
+import numpy as np
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+
+from time import sleep
+from time import time
+from time import gmtime, strftime
+import multiprocessing
+from multiprocessing import Process, Pipe
+
+# [Hacked] workaround for the osim-rl memory leak (issue #58):
+# https://github.com/stanfordnmbl/osim-rl/issues/58
+# Run each RunEnv instance in a separate process.
+# This has to be done since multiple RunEnv() instances in the same process
+# result in interleaved running of the simulations.
+def floatify(x):
+    return [float(x[i]) for i in range(len(x))]
+
+def standalone_headless_isolated(conn,vis):
+    e = RunEnv(visualize=vis)
+
+    while True:
+        try:
+            msg = conn.recv()
+
+            # messages should be tuples,
+            # msg[0] should be a string
+
+            if msg[0] == 'reset':
+                o = e.reset(difficulty=2)
+                conn.send(o)
+            elif msg[0] == 'step':
+                ordi = e.step(msg[1])
+                conn.send(ordi)
+            else:
+                conn.close()
+                del e
+                return
+        except:
+            conn.close()
+            del e
+            raise
+
+# Class that manages the interprocess communication and exposes itself as a RunEnv.
+class ei: # Environment Instance
+    def __init__(self,vis):
+        self.pc, self.cc = Pipe()
+        self.p = Process(
+            target = standalone_headless_isolated,
+            args=(self.cc,vis,)
+        )
+        self.p.daemon = True
+        self.p.start()
+
+    def reset(self):
+        self.pc.send(('reset',))
+        return self.pc.recv()
+
+    def step(self,actions):
+        self.pc.send(('step',actions,))
+        try:
+            return self.pc.recv()
+        except EOFError:
+            return None
+
+    def __del__(self):
+        self.pc.send(('exit',))
+        #print('(ei)waiting for join...')
+        self.p.join()
+
+
+# ================================================================
+# Model components
+# ================================================================
+
+# Actor Network------------------------------------------------------------------------------------------------------------
+class AC_Network():
+    def __init__(self,s_size,a_size,scope,trainer):
+        with tf.variable_scope(scope):
+            #Input and visual encoding layers
+            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
+
+            # Build the model.
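+            # Architecture overview (see the layers defined below): two fully connected ELU layers
+            # (300 and 200 units), a sigmoid policy head `mu` in [0,1]^a_size used as the mean of a
+            # Normal distribution with a fixed standard deviation of 0.05, and a scalar linear value
+            # head. The softplus `var` head is built but not wired into the distribution.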
+            '''
+            conv1 = tf.nn.elu(tf.nn.conv1d(self.imageIn,tf.truncated_normal([2,1,8],stddev=0.1),2,padding='VALID'))
+            conv2 = tf.nn.elu(tf.nn.conv1d(conv1,tf.truncated_normal([3,8,16],stddev=0.05),1,padding='VALID'))
+
+            hidden = slim.fully_connected(slim.flatten(conv2),200,activation_fn=tf.nn.elu)'''
+
+            hidden1 = slim.fully_connected(self.inputs,300,activation_fn=tf.nn.elu)
+            hidden2 = slim.fully_connected(hidden1,200,activation_fn=tf.nn.elu)
+
+            #Output layers for policy and value estimations
+            mu = slim.fully_connected(hidden2,a_size,activation_fn=tf.sigmoid,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
+            var = slim.fully_connected(hidden2,a_size,activation_fn=tf.nn.softplus,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
+            self.normal_dist = tf.contrib.distributions.Normal(mu, 0.05)
+            self.policy = tf.clip_by_value(self.normal_dist.sample(1),0.0,1.0)
+            self.value = slim.fully_connected(hidden2,1,
+                activation_fn=None,
+                weights_initializer=normalized_columns_initializer(1.0),
+                biases_initializer=None)
+
+            #Only the worker networks need ops for loss functions and gradient updating.
+            if scope != 'global':
+                self.actions = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
+                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
+                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)
+
+                #Loss functions
+                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
+                self.log_prob = tf.reduce_sum(self.normal_dist.log_prob(self.actions),axis=1)
+                self.entropy = tf.reduce_sum(self.normal_dist.entropy(),axis=1) # encourage exploration
+                self.entropy = tf.reduce_sum(self.entropy,axis=0)
+                self.policy_loss = -tf.reduce_sum(self.log_prob*self.advantages,axis=0)
+
+                self.loss = 0.5 * self.value_loss + self.policy_loss - 0.01 * self.entropy
+
+                #Get gradients from the local network using local losses
+                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
+                self.gradients = tf.gradients(self.loss,local_vars)
+                self.var_norms = tf.global_norm(local_vars)
+                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)
+
+                #Apply local gradients to the global network
+                #Comment these two lines out to stop training
+                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
+                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
+
+# Learning to run Worker------------------------------------------------------------------------------------------------------------
+class Worker():
+    def __init__(self,name,s_size,a_size,trainer,model_path,global_episodes,is_training):
+        self.name = "worker_" + str(name)
+        self.number = name
+        self.model_path = model_path
+        self.trainer = trainer
+        self.global_episodes = global_episodes
+        self.increment = self.global_episodes.assign_add(1)
+        self.episode_rewards = []
+        self.episode_lengths = []
+        self.episode_mean_values = []
+        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))
+        self.is_training = is_training
+
+        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
+        self.local_AC = AC_Network(s_size,a_size,self.name,trainer)
+        #self.local_AC_target = AC_Network(s_size,a_size,self.name+'/target',trainer,noisy)
+        self.update_local_ops = update_target_graph('global',self.name)
+        #self.update_local_ops_target = update_target_graph('global/target',self.name+'/target')
+        #self.update_global_target = update_target_network(self.name,'global/target')
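+        # `update_target_graph` comes from helper.py and is expected to return ops that copy the
+        # trainable variables of the 'global' network into this worker's local copy. A minimal
+        # sketch of such a helper (an assumption about helper.py, shown for reference only):
+        #   def update_target_graph(from_scope, to_scope):
+        #       from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
+        #       to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
+        #       return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]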
+
+        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
+        self.exploration_noise = OUNoise(a_size)
+
+    def train(self,rollout,sess,gamma,bootstrap_value):
+        rollout = np.array(rollout)
+        observations = rollout[:,0]
+        actions = rollout[:,1]
+        rewards = rollout[:,2]
+        # reward clipping: scale and clip the values of the rewards to the range -1,+1
+        #rewards = (rewards - np.mean(rewards)) / np.max(abs(rewards))
+
+        next_observations = rollout[:,3] # Aug 1st, note that the next observation is never used
+        values = rollout[:,5]
+
+        # Here we take the rewards and values from the rollout, and use them to
+        # generate the advantage and discounted returns.
+        # The advantage function uses "Generalized Advantage Estimation"
+        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
+        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
+        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
+        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
+        advantages = discount(advantages,gamma)
+
+        # Update the global network using gradients from the loss
+        # Generate network statistics to periodically save
+        feed_dict = {self.local_AC.target_v:discounted_rewards,
+            self.local_AC.inputs:np.vstack(observations),
+            self.local_AC.actions:np.vstack(actions),
+            self.local_AC.advantages:advantages}
+        l,v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.loss,
+            self.local_AC.value_loss,
+            self.local_AC.policy_loss,
+            self.local_AC.entropy,
+            self.local_AC.grad_norms,
+            self.local_AC.var_norms,
+            self.local_AC.apply_grads],
+            feed_dict=feed_dict)
+        return l / len(rollout), v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
+
+    def work(self,max_episode_length,gamma,sess,coord,saver):
+        if self.is_training:
+            episode_count = sess.run(self.global_episodes)
+        else:
+            episode_count = 0
+        wining_episode_count = 0
+        total_steps = 0
+        print ("Starting worker " + str(self.number))
+        with open('result.txt','w') as f:
+            f.write(strftime("Starting time: %a, %d %b %Y %H:%M:%S\n", gmtime()))
+
+        explore = 1000
+
+        if self.name == 'worker_1':
+            self.env = ei(vis=False)#RunEnv(visualize=True)
+        else:
+            self.env = ei(vis=False)#RunEnv(visualize=False)
+
+        with sess.as_default(), sess.graph.as_default():
+            #not_start_training_yet = True
+            while not coord.should_stop():
+                # restart the env (in its own process) every 50 episodes to work around the memory leak
+                if episode_count % 50 == 0:
+                    if self.env is not None:
+                        del self.env
+                    if self.name == 'worker_1':
+                        self.env = ei(vis=True)#RunEnv(visualize=True)
+                    else:
+                        self.env = ei(vis=False)#RunEnv(visualize=False)
+                    self.setting=2
+
+                sess.run(self.update_local_ops)
+                #sess.run(self.update_local_ops_target)
+                episode_buffer = []
+                episode_values = []
+                episode_reward = 0
+                episode_step_count = 0
+                done = False
+
+                seed = np.random.rand()
+
+                self.env.reset()
+                # engineered initial input to make the agent's life easier
+                a = engineered_action(seed)
+                ob = self.env.step(a)[0]
+                s = ob
+                ob = self.env.step(a)[0]
+                s1 = ob
+                s = process_state(s,s1)
+                explore -= 1
+                #st = time()
+                chese = 0
+                while done == False:
+                    #Take an action using probabilities from the policy network output.
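+                    # self.local_AC.policy is a clipped sample drawn with sample(1) from
+                    # Normal(mu, 0.05), so it has shape [1, batch, a_size]; action[0,0] below is
+                    # the action vector for the single state that was fed in.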
+                    action,v = sess.run([self.local_AC.policy,self.local_AC.value],
+                        feed_dict={self.local_AC.inputs:[s]})
+                    if not (episode_count % 5 == 0 and self.name == 'worker_1') and self.is_training:
+                        if explore > 0: # explore > 0 turns on OU noise; worker_1 runs without noise every 5 episodes as a test
+                            a = np.clip(action[0,0]+self.exploration_noise.noise(),0.0,1.0)
+                        else:
+                            a = action[0,0]
+                        if chese < 60 and episode_count < 250:
+                            a = engineered_action(seed)
+                            chese += 1
+                    else:
+                        a = action[0,0]
+                    ob,r,done,_ = self.env.step(a)
+                    '''
+                    if self.name == 'worker_0':
+                        ct = time()
+                        print(ct-st)
+                        st = ct
+                    '''
+                    if done == False:
+                        s2 = ob
+                    else:
+                        s2 = s1
+                    s1 = process_state(s1,s2)
+                    #print(s1)
+                    episode_buffer.append([s,a,r,s1,done,v[0,0]])
+                    episode_values.append(v[0,0])
+
+                    episode_reward += r
+                    s = s1
+                    s1 = s2
+                    total_steps += 1
+                    episode_step_count += 1
+
+                    # If the episode hasn't ended, but the experience buffer is full, then we
+                    # make an update step using that experience rollout.
+                    '''
+                    if len(episode_buffer) == 120 and done != True and episode_step_count != max_episode_length - 1: # change the episode length to 5, and try to modify Worker.train() to use the next frame to train an imagined frame.
+                        # Since we don't know what the true final return is, we "bootstrap" from our current
+                        # value estimation.
+                        if self.is_training:
+                            v1 = sess.run(self.local_AC.value,
+                                feed_dict={self.local_AC.inputs:[s]})[0,0]
+                            l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
+                            sess.run(self.update_local_ops)
+                            episode_buffer = []
+                    '''
+                    if done == True:
+                        break
+
+                self.episode_rewards.append(episode_reward)
+                self.episode_lengths.append(episode_step_count)
+                self.episode_mean_values.append(np.mean(episode_values))
+
+                # Update the network using the experience buffer at the end of the episode.
+                if len(episode_buffer) != 0:
+                    if self.is_training:
+                        l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
+                        #print(l,v_l,p_l,e_l,g_n,v_n)
+                        #sess.run(self.update_global_target)
+
+
+                # Periodically save model parameters and summary statistics.
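+                # worker_0 logs mean training rewards to result.txt; worker_1, which skips the
+                # exploration noise every 5th episode, doubles as the evaluation worker and logs
+                # its per-episode "testing" reward.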
+                if episode_count % 5 == 0 and episode_count != 0:
+                    mean_reward = np.mean(self.episode_rewards[-5:])
+                    mean_length = np.mean(self.episode_lengths[-5:])
+                    mean_value = np.mean(self.episode_mean_values[-5:])
+                    summary = tf.Summary()
+                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
+                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
+                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
+                    if self.is_training:
+                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
+                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
+                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
+                    self.summary_writer.add_summary(summary, episode_count)
+                    self.summary_writer.flush()
+                    if self.name == 'worker_1':
+                        with open('result.txt','a') as f:
+                            f.write("Episode "+str(episode_count)+" reward (testing): %.2f\n" % episode_reward)
+                    if self.name == 'worker_0':
+                        with open('result.txt','a') as f:
+                            f.write("Episodes "+str(episode_count)+" mean reward (training): %.2f\n" % mean_reward)
+
+                    if episode_count % 100 == 0:
+                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
+                        with open('result.txt','a') as f:
+                            f.write("Saved Model at episode: "+str(episode_count)+"\n")
+                if self.name == 'worker_0' and self.is_training:
+                    sess.run(self.increment)
+
+                episode_count += 1
+
+                if self.name == "worker_1" and episode_reward > 2.:
+                    wining_episode_count += 1
+                    print('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))
+                    with open('result.txt','a') as f:
+                        f.write('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%\n'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))
+
+            # All done. Stop the trial.
+            # Confirm exit
+            print('Exit/Done '+self.name)
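+
+# A minimal sketch (not part of the original training script) of how these classes are typically
+# wired together in an A3C setup. S_SIZE, A_SIZE, NUM_WORKERS, model_path, max_episode_length and
+# gamma are assumed to be defined by the caller (e.g. in a separate train.py).
+'''
+tf.reset_default_graph()
+with tf.device("/cpu:0"):
+    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
+    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
+    master_network = AC_Network(S_SIZE, A_SIZE, 'global', None)  # global network, no loss ops
+    workers = [Worker(i, S_SIZE, A_SIZE, trainer, model_path, global_episodes, is_training=True)
+               for i in range(NUM_WORKERS)]
+    saver = tf.train.Saver(max_to_keep=5)
+
+with tf.Session() as sess:
+    coord = tf.train.Coordinator()
+    sess.run(tf.global_variables_initializer())
+    worker_threads = []
+    for worker in workers:
+        t = threading.Thread(target=lambda w=worker: w.work(max_episode_length, gamma, sess, coord, saver))
+        t.start()
+        sleep(0.5)
+        worker_threads.append(t)
+    coord.join(worker_threads)
+'''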