Diff of /A3C/model (lstm).py [000000] .. [687a25]


b/A3C/model (lstm).py
from ou_noise import OUNoise

from helper import *

import opensim as osim
from osim.http.client import Client
from osim.env import *

import threading
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from time import sleep
from time import time
from time import gmtime, strftime
import multiprocessing
from multiprocessing import Process, Pipe

# [Hack] the RunEnv memory might keep leaking; the workaround below follows issue #58:
# https://github.com/stanfordnmbl/osim-rl/issues/58
# A separate process holds a separate RunEnv instance.
# This has to be done because running RunEnv() instances in the same process results in interleaved simulations.
def floatify(arr):  # parameter renamed from `np` to avoid shadowing the numpy import
    return [float(arr[i]) for i in range(len(arr))]

def standalone_headless_isolated(conn,vis):
    e = RunEnv(visualize=vis)

    while True:
        try:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be a string
            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            else:
                conn.close()
                del e
                return
        except:
            conn.close()
            del e
            raise

# Class that manages the interprocess communication and exposes itself like a RunEnv.
class ei: # Environment Instance
    def __init__(self,vis):
        self.pc, self.cc = Pipe()
        self.p = Process(
            target = standalone_headless_isolated,
            args=(self.cc,vis,)
        )
        self.p.daemon = True
        self.p.start()

    def reset(self):
        self.pc.send(('reset',))
        return self.pc.recv()

    def step(self,actions):
        self.pc.send(('step',actions,))
        try:
            return self.pc.recv()
        except EOFError:
            return None

    def __del__(self):
        self.pc.send(('exit',))
        #print('(ei)waiting for join...')
        self.p.join()

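# A minimal usage sketch of the `ei` wrapper above (illustration only; nothing in this file calls it).
# It assumes the 18-dimensional muscle-activation action space used elsewhere in this file.
def _example_ei_usage():
    env = ei(vis=False)                # spawns the child process that owns the RunEnv
    ob = env.reset()                   # sends ('reset',) over the pipe, returns the first observation
    step_result = env.step([0.5]*18)   # sends ('step', actions), returns (observation, reward, done, info)
    del env                            # __del__ sends ('exit',) and joins the child process
    return ob, step_result
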
# Added by Andrew Liao
# for NoisyNet-DQN (using Factorised Gaussian noise)
# modified from the `dense` function
def sample_noise(shape):
    noise = np.random.normal(size=shape).astype(np.float32)
    #noise = np.ones(shape).astype(np.float32) # whenever not in training, simply return a matrix of ones.
    return noise

global_p_a = 0.
global_q_a = 0.
global_p_v = 0.
global_q_v = 0.

def noisy_dense(x, size, name, bias=True, activation_fn=tf.identity, factorized=False):

    global global_p_a
    global global_q_a
    global global_p_v
    global global_q_v
    # https://arxiv.org/pdf/1706.10295.pdf page 4
    # the function used in eq.7,8 : f(x)=sgn(x)*sqrt(|x|)
    def f(x):
        return tf.multiply(tf.sign(x), tf.pow(tf.abs(x), 0.5))
    # Initializers of \mu and \sigma
    mu_init = tf.random_uniform_initializer(minval=-1*1/np.power(x.get_shape().as_list()[1], 0.5),
                                                maxval=1*1/np.power(x.get_shape().as_list()[1], 0.5))
    sigma_init = tf.constant_initializer(0.4/np.power(x.get_shape().as_list()[1], 0.5))
    # Sample noise from a Gaussian
    if name == 'global':
        if size == 18: # action head
            p = sample_noise([128, 18]) # 128 is the rnn output size
            global_p_a = p
            q = sample_noise([1, 18]) # 18 is the action size
            global_q_a = q
        else:
            p = sample_noise([128, 1]) # 128 is the rnn output size
            global_p_v = p
            q = sample_noise([1, 1]) # 1 is the value size
            global_q_v = q
    else: # for actors, copy p & q from the global network
        if size == 18: # action head, same check as in the global branch
            p = global_p_a
            q = global_q_a
        else:
            p = global_p_v
            q = global_q_v

    f_p = f(p); f_q = f(q)
    w_epsilon = f_p*f_q; b_epsilon = tf.squeeze(f_q)
    if not factorized: # just resample the noise matrix to get independent Gaussian noise
        w_epsilon = tf.identity(sample_noise(w_epsilon.get_shape().as_list()))
    # w = w_mu + w_sigma*w_epsilon
    options = {18:'action',1:'value'}
    w_mu = tf.get_variable(name + "/w_mu" + options[size], [x.get_shape()[1], size], initializer=mu_init)
    w_sigma = tf.get_variable(name + "/w_sigma" + options[size], [x.get_shape()[1], size], initializer=sigma_init)
    w = w_mu + tf.multiply(w_sigma, w_epsilon)
    ret = tf.matmul(x, w)
    if bias:
        # b = b_mu + b_sigma*b_epsilon
        b_mu = tf.get_variable(name + "/b_mu" + options[size], [size], initializer=mu_init)
        b_sigma = tf.get_variable(name + "/b_sigma" + options[size], [size], initializer=sigma_init)
        b = b_mu + tf.multiply(b_sigma, b_epsilon)
        return activation_fn(ret + b)
    else:
        return activation_fn(ret)

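# A small numpy sketch (illustration only) of the noise construction used by noisy_dense above,
# with the shapes hard-coded in this file (rnn output 128, action size 18).
# f(x) = sgn(x)*sqrt(|x|) is the scaling from the NoisyNet paper referenced above.
def _factorised_noise_sketch():
    f = lambda z: np.sign(z) * np.sqrt(np.abs(z))
    p = sample_noise([128, 18])      # noise paired with the weight rows
    q = sample_noise([1, 18])        # noise paired with the output units / bias
    w_epsilon = f(p) * f(q)          # broadcasts to shape (128, 18); w = w_mu + w_sigma * w_epsilon
    b_epsilon = np.squeeze(f(q))     # shape (18,);                   b = b_mu + b_sigma * b_epsilon
    # with factorized=False, noisy_dense instead resamples a full (128, 18) matrix of independent noise
    return w_epsilon.shape, b_epsilon.shape
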
# ================================================================
# Model components
# ================================================================

# Actor Network------------------------------------------------------------------------------------------------------------
class AC_Network():
    def __init__(self,s_size,a_size,scope,trainer,noisy):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs,shape=[-1,s_size,1])

            # Create the model, use the default arg scope to configure the batch norm parameters.
            '''
            conv1 = tf.nn.elu(tf.nn.conv1d(self.imageIn,tf.truncated_normal([2,1,8],stddev=0.1),2,padding='VALID'))
            conv2 = tf.nn.elu(tf.nn.conv1d(conv1,tf.truncated_normal([3,8,16],stddev=0.05),1,padding='VALID'))
            hidden = slim.fully_connected(slim.flatten(conv2),200,activation_fn=tf.nn.elu)
            '''

            hidden = slim.fully_connected(slim.flatten(self.imageIn),300,activation_fn=tf.nn.elu)
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(128,dropout_keep_prob=0.8)
            #lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell,output_keep_prob=0.5)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 128])

            if noisy:
                # Apply noisy network on fully connected layers
                # ref: https://arxiv.org/abs/1706.10295
                self.policy = tf.clip_by_value(noisy_dense(rnn_out,name=scope, size=a_size, activation_fn=tf.nn.relu),0.0,1.0)
                self.value = noisy_dense(rnn_out,name=scope, size=1) # default activation_fn=tf.identity
            else:
                #Output layers for policy and value estimations
                mu = slim.fully_connected(rnn_out,a_size,activation_fn=tf.nn.elu,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
                #var = slim.fully_connected(rnn_out,a_size,activation_fn=tf.nn.softplus,weights_initializer=normalized_columns_initializer(0.01),biases_initializer=None)
                self.normal_dist = tf.contrib.distributions.Normal(mu, 0.05)
                self.policy = tf.clip_by_value(self.normal_dist.sample(1),0.0,1.0)
                self.value = slim.fully_connected(rnn_out,1,
                    activation_fn=None,
                    weights_initializer=normalized_columns_initializer(1.0),
                    biases_initializer=None)

            #Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                #Loss functions (note: self.normal_dist is defined only in the non-noisy branch above)
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.log_prob = tf.reduce_sum(self.normal_dist.log_prob(self.actions),axis=1)
                self.entropy = tf.reduce_sum(self.normal_dist.entropy(),axis=1)  # encourage exploration
                self.entropy = tf.reduce_sum(self.entropy,axis=0)
                self.policy_loss = -tf.reduce_sum(self.log_prob*self.advantages,axis=0)
                if noisy:
                    self.loss = 0.5 * self.value_loss + self.policy_loss
                else:
                    self.loss = 0.5 * self.value_loss + self.policy_loss #- 0.01 * self.entropy

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)

                #Apply local gradients to global network
                #Comment these two lines out to stop training
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
                
# Learning to run Worker------------------------------------------------------------------------------------------------------------
class Worker():
    def __init__(self,name,s_size,a_size,trainer,model_path,global_episodes,noisy,is_training):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))
        self.noisy = noisy
        self.is_training = is_training

        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer,noisy)
        #self.local_AC_target = AC_Network(s_size,a_size,self.name+'/target',trainer,noisy)
        self.update_local_ops = update_target_graph('global',self.name)
        #self.update_local_ops_target = update_target_graph('global/target',self.name+'/target')
        #self.update_global_target = update_target_network(self.name,'global/target')

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(a_size)

    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        # reward clipping: scale and clip the values of the rewards to the range -1,+1
        #rewards = (rewards - np.mean(rewards)) / np.max(abs(rewards))

        next_observations = rollout[:,3] # Aug 1st, notice next observation is never used
        values = rollout[:,5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.actions:np.vstack(actions),
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:rnn_state[0],
            self.local_AC.state_in[1]:rnn_state[1]}
        l,v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.loss,
            self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return l / len(rollout), v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
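    # Worked example of the advantage computation in train() above (illustration only; it assumes
    # helper.discount is the usual discounted cumulative sum, sum_k gamma^k * x[t+k]).
    # With rewards = [1.0, 1.0], values = [0.5, 0.4], bootstrap_value = 0.3 and gamma = 0.99:
    #   discounted_rewards = discount([1.0, 1.0, 0.3], 0.99)[:-1]  = [2.28403, 1.297]
    #   deltas             = r_t + gamma*V(s_{t+1}) - V(s_t)       = [0.896, 0.897]
    #   advantages         = discount(deltas, 0.99)                = [1.78403, 0.897]
    # i.e. Generalized Advantage Estimation with lambda = 1, reusing gamma as the GAE decay.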
        
    def work(self,max_episode_length,gamma,sess,coord,saver):
        if self.is_training:
            episode_count = sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with open('result.txt','w') as f:
            f.write(strftime("Starting time: %a, %d %b %Y %H:%M:%S\n", gmtime()))

        explore = 1000

        if self.name == 'worker_1':
            self.env = ei(vis=False)#RunEnv(visualize=True)
        else:
            self.env = ei(vis=False)#RunEnv(visualize=False)

        with sess.as_default(), sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                # restart the env (in its own process) every 50 episodes to work around the memory leak
                if episode_count % 50 == 0:
                    if self.env != None:
                        del self.env
                    if self.name == 'worker_1':
                        self.env = ei(vis=True)#RunEnv(visualize=True)
                    else:
                        self.env = ei(vis=False)#RunEnv(visualize=False)
                self.setting=2

                sess.run(self.update_local_ops)
                #sess.run(self.update_local_ops_target)
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_step_count = 0
                done = False

                seed = np.random.rand()

                self.env.reset()
                # engineered initial input to make the agent's life easier
                a=engineered_action(seed)
                ob = self.env.step(a)[0]
                s = ob
                ob = self.env.step(a)[0]
                s1 = ob
                s = process_state(s,s1)
                rnn_state = self.local_AC.state_init
                explore -= 1
                #st = time()
                chese=0
                while done == False:
                    #Take an action using probabilities from policy network output.
                    action,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out],
                                feed_dict={self.local_AC.inputs:[s],
                                self.local_AC.state_in[0]:rnn_state[0],
                                self.local_AC.state_in[1]:rnn_state[1]})
                    if not (episode_count % 5 == 0 and self.name == 'worker_1') and self.is_training:
                        if explore > 0: # explore > 0 turns on the OU noise; worker_1 runs a noise-free test episode every 5 episodes
                            a = np.clip(action[0,0]+self.exploration_noise.noise(),0.0,1.0)
                        else:
                            a = action[0,0]
                        if chese < 60 and episode_count < 250:
                            a=engineered_action(seed)
                            chese += 1
                    else:
                        a = action[0,0]
                    ob,r,done,_ = self.env.step(a)
                    '''
                    if self.name == 'worker_0':
                        ct = time()
                        print(ct-st)
                        st = ct
                    '''
                    if done == False:
                        s2 = ob
                    else:
                        s2 = s1
                    s1 = process_state(s1,s2)
                    #print(s1)
                    episode_buffer.append([s,a,r,s1,done,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1
                    s1 = s2
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    '''
                    if len(episode_buffer) == 120 and done != True and episode_step_count != max_episode_length - 1: # change episode length to 5, and try to modify Worker.train() to use the next frame to train an imagined frame.
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        if self.is_training:
                            v1 = sess.run(self.local_AC.value,
                            feed_dict={self.local_AC.inputs:[s],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                            l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                            sess.run(self.update_local_ops)
                            episode_buffer = []
                    '''
                    if done == True:
                        break

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    if self.is_training:
                        l,v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                        #print(l,v_l,p_l,e_l,g_n,v_n)
                        #sess.run(self.update_global_target)

                # Periodically save model parameters and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    if self.is_training:
                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    self.summary_writer.add_summary(summary, episode_count)
                    self.summary_writer.flush()
                    if self.name == 'worker_1':
                        with open('result.txt','a') as f:
                            f.write("Episode "+str(episode_count)+" reward (testing): %.2f\n" % episode_reward)
                    if self.name == 'worker_0':
                        with open('result.txt','a') as f:
                            f.write("Episodes "+str(episode_count)+" mean reward (training): %.2f\n" % mean_reward)

                        if episode_count % 100 == 0:
                            saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                            with open('result.txt','a') as f:
                                f.write("Saved Model at episode: "+str(episode_count)+"\n")
                if self.name == 'worker_0' and self.is_training:
                    sess.run(self.increment)

                episode_count += 1

                if self.name == "worker_1" and episode_reward > 2.:
                    wining_episode_count += 1
                    print('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))
                    with open('result.txt','a') as f:
                        f.write('Worker_1 is stepping forward in Episode {}! Reward: {:.2f}. Total percentage of success is: {}%\n'.format(episode_count, episode_reward, int(wining_episode_count / episode_count * 100)))

        # All done. Stop the trial.
        # Confirm exit
        print('Exit/Done '+self.name)
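
# A minimal driver sketch (illustration only; the actual training script is a separate file and may
# differ). It shows how the pieces defined above are typically wired together in A3C: a shared
# 'global' network, one Worker per thread, and a coordinator. s_size, a_size and model_path are
# placeholders to be supplied by the caller; the optimiser choice is hypothetical.
def _example_training_driver(s_size, a_size, model_path, gamma=0.99, max_episode_length=1000):
    tf.reset_default_graph()
    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    master_network = AC_Network(s_size, a_size, 'global', None, noisy=False)   # shared parameters
    # two workers: worker_0 trains, worker_1 runs the periodic test episodes used in work() above
    workers = [Worker(i, s_size, a_size, trainer, model_path, global_episodes,
                      noisy=False, is_training=True) for i in range(2)]
    saver = tf.train.Saver()
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())
        threads = []
        for worker in workers:
            t = threading.Thread(target=worker.work,
                                 args=(max_episode_length, gamma, sess, coord, saver))
            t.start()
            threads.append(t)
        coord.join(threads)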