Diff of /ADDPG/model.py [000000] .. [687a25]

# -----------------------------------
# Deep Deterministic Policy Gradient
# Author: Kaizhao Liang, Hang Yu
# Date: 08.21.2017
# -----------------------------------
import tensorflow as tf
import numpy as np
from ou_noise import OUNoise
from critic_network import CriticNetwork
from actor_network import ActorNetwork
from replay_buffer import ReplayBuffer
from helper import *
from time import gmtime, strftime, sleep

import opensim as osim
from osim.http.client import Client
from osim.env import *

import multiprocessing
from multiprocessing import Process, Pipe

# [Hacked] the environment memory appears to keep leaking; workaround based on osim-rl issue #58:
# https://github.com/stanfordnmbl/osim-rl/issues/58
# Each RunEnv lives in a separate process that holds its own instance.
# This has to be done because running RunEnv() instances in the same process results in
# interleaved running of the simulations.
def standalone_headless_isolated(conn,vis):
    e = RunEnv(visualize=vis)
    while True:
        try:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be a string

            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            else:
                conn.close()
                del e
                return
        except:
            conn.close()
            del e
            raise

# class that manages the interprocess communication and exposes itself as a RunEnv.
class ei: # Environment Instance
    def __init__(self,vis):
        self.pc, self.cc = Pipe()
        self.p = Process(
            target = standalone_headless_isolated,
            args=(self.cc,vis,)
        )
        self.p.daemon = True
        self.p.start()

    def reset(self):
        self.pc.send(('reset',))
        return self.pc.recv()

    def step(self,actions):
        self.pc.send(('step',actions,))
        try:
            return self.pc.recv()
        except:
            print('Error in recv()')
            raise

    def __del__(self):
        self.pc.send(('exit',))
        #print('(ei)waiting for join...')
        self.p.join()
        try:
            del self.pc
            del self.cc
            del self.p
        except:
            raise

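# Illustrative usage sketch (added; not part of the original code). A Worker drives this
# wrapper exactly like a RunEnv while the simulation runs in the child process; the 18-dim
# zero action below is only a placeholder for the example.
#
#   env = ei(vis=False)
#   observation = env.reset()
#   observation, reward, done, info = env.step([0.0] * 18)
#   del env   # __del__ sends 'exit' to the child process and joins it
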
###############################################
# DDPG Worker
###############################################
pause_perceive = False
replay_buffer = ReplayBuffer(1e6)

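# Added note: `pause_perceive` and `replay_buffer` above are module-level globals shared by
# every Worker (the workers presumably run as threads in one process, sharing a single
# tf.Session and this module's globals). work() checks `pause_perceive` before adding
# n-step transitions, so writes can be paused while the buffer is erased.
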
class Worker:
    """DDPG worker: each worker owns a local actor; worker_1 additionally owns the shared
    critic and pushes its actor weights to the global actor."""
    def __init__(self,sess,number,model_path,global_episodes,explore,training,vis,batch_size,gamma,n_step,global_actor_net):
        self.name = 'worker_' + str(number) # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41+3+14 # 41 raw observations plus 17 induced velocity features (3 + 14)
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.noise_decay = 1.
        self.training = training
        self.vis = vis # == True only during testing
        self.total_steps = 0 # for ReplayBuffer to count
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_step = n_step
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim,self.name+'/actor')
        self.update_local_actor = update_graph(global_actor_net,self.actor_network.net)
        if self.name == 'worker_1':
            self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim,self.name+'/critic')
            self.actor_network.update_target(sess)
            self.critic_network.update_target(sess)
            self.update_global_actor = update_graph(self.actor_network.net,global_actor_net)

    def start(self):
        self.env = ei(vis=self.vis) #RunEnv(visualize=True)

    def restart(self): # restart the env periodically to counter the memory leak
        if self.env != None:
            del self.env
        sleep(0.001)
        self.env = ei(vis=self.vis)

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        global replay_buffer
        minibatch = replay_buffer.get_batch(self.batch_size)

        BATCH_SIZE = self.batch_size
        #print(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])

        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])

        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # reshape to [BATCH_SIZE, action_dim]
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(self.sess,next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,next_state_batch,next_action_batch)

        done_mask = np.asarray([0. if done else 1. for done in done_batch])
        y_batch = reward_batch + self.gamma**self.n_step * q_value_batch * done_mask
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
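        # Added note: the target above is the n-step TD target
        # y = R_n + gamma**n_step * Q'(s_{t+n}, mu'(s_{t+n})), reduced to y = R_n on terminal
        # transitions via done_mask. This assumes n_step_transition() in helper.py (not shown
        # in this diff) stores the discounted n-step return in data[2] and the state n steps
        # ahead in data[3].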
        # Update critic by minimizing the loss L
        _,loss,a,b,norm = self.critic_network.train(self.sess,y_batch,state_batch,action_batch)
        #print(a)
        #print(b)
        #print(loss)
        #print(norm)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(self.sess,state_batch)
        q_gradient_batch = self.critic_network.gradients(self.sess,state_batch,action_batch_for_gradients)
        q_gradient_batch *= -1.
        '''
        # inverting gradients: scale dq by (a_max-a)/(a_max-a_min) if dq>0, else by (a-a_min)/(a_max-a_min)
        for i in range(BATCH_SIZE): # In our case a_max = 1, a_min = 0
            for j in range(18):
                dq = q_gradient_batch[i,j]
                a = action_batch_for_gradients[i,j]
                if dq > 0.:
                    q_gradient_batch[i,j] *= (0.95-a)
                else:
                    q_gradient_batch[i,j] *= a-0.05'''

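        # Added note: a vectorized equivalent of the commented-out inverting-gradients loop
        # above (illustrative sketch only, not enabled here) would be:
        #   scale_up = 0.95 - action_batch_for_gradients
        #   scale_down = action_batch_for_gradients - 0.05
        #   q_gradient_batch = np.where(q_gradient_batch > 0.,
        #                               q_gradient_batch * scale_up,
        #                               q_gradient_batch * scale_down)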
        _,norm = self.actor_network.train(self.sess,q_gradient_batch,state_batch)
        #print(norm)
        # Update the target networks and the global (shared) actor
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)
        self.sess.run(self.update_global_actor)

    def save_model(self, saver, episode):
        saver.save(self.sess, self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self,state):
        action = self.actor_network.action(self.sess,state)
        return np.clip(action,0.05,0.95)+self.exploration_noise.noise()*self.noise_decay
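    # Added note: noise_action() clips the policy output to [0.05, 0.95] *before* adding the
    # decayed OU noise, so the returned action can fall outside that range; the caller in
    # work() clips the result again before stepping the environment.
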
    def action(self,state):
        action = self.actor_network.action(self.sess,state)
        return action

    def perceive(self,transition):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        global replay_buffer
        replay_buffer.add(transition)

    def work(self,coord,saver):
        global replay_buffer
        global pause_perceive

        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0

        print("Starting worker_" + str(self.number))

        if self.name == 'worker_0':
            with open('result.txt','w') as f:
                f.write(strftime("Starting time: %a, %d %b %Y %H:%M:%S\n", gmtime()))

        self.start() # change Aug24: start the env

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():

                returns = []
                episode_buffer = []
                episode_reward = 0
                self.noise_decay -= 1./self.explore #np.maximum(abs(np.cos(self.explore / 20 * np.pi)),0.67)
                self.explore -= 1
                start_training = episode_count > 50 #replay_buffer.count() >= 500e3 # start_training
                erase_buffer = False # erase buffer

                if self.name != "worker_1":
                    self.sess.run(self.update_local_actor)

                state = self.env.reset()
                seed = 0.1
                ea = engineered_action(seed)

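                # Warm-up: take two steps with the engineered action and combine consecutive
                # observations. process_state() comes from helper.py (not shown in this diff);
                # it presumably derives velocity-like features from two consecutive 41-d
                # observations to build the 58-dim state (41+3+14) declared in __init__.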
                s,s1,s2 = [],[],[]
                ob = self.env.step(ea)[0]
                s = ob
                ob = self.env.step(ea)[0]
                s1 = ob
                s = process_state(s,s1)

                if self.name == 'worker_0':
                    print("episode:{}".format(str(episode_count)+' '+self.name))
                # Train
                action = ea
                demo = int(50*self.noise_decay)
                for step in xrange(1000):
                    if self.name == "worker_1" and start_training and self.training:
                        #pause_perceive=True
                        #print(self.name+' is training')
                        #self.train()
                        self.train()
                        #pause_perceive=False
                        if erase_buffer:
                            pause_perceive = True
                            replay_buffer.erase() # erase old experience every time the model is saved
                            pause_perceive = False
                            break
                    if demo > 0:
                        action = ea
                        demo -= 1
                    elif self.explore > 0 and self.training:
                        action = np.clip(self.noise_action(s),0.05,0.95) # change Aug20
                    else:
                        action = np.clip(self.action(s),0.05,0.95)

                    try:
                        s2,reward,done,_ = self.env.step(action)
                    except:
                        print('Env error. Shutdown {}'.format(self.name))
                        if self.env != None:
                            del self.env
                        return 0

                    s1 = process_state(s1,s2)
                    #print(s1)
                    if s1[2] > 0.75:
                        height_reward = 0.
                    else:
                        height_reward = -0.05
                    if not done:
                        ep_reward = 1.005
                    else:
                        ep_reward = 0.0
                    d_head_pelvis = abs(s1[22]-s[1])
                    #print(d_head_pelvis)
                    if d_head_pelvis > 0.25:
                        sta_reward = -0.05
                    else:
                        sta_reward = 0.
                    #print((s1[4]+height_reward+sta_reward)*ep_reward)
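                    # Added note: the stored transition uses the shaped reward
                    # (s1[4] + height_reward + sta_reward) * ep_reward rather than the raw
                    # `reward` from env.step(); the raw reward is only accumulated into
                    # episode_reward for logging.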
                    episode_buffer.append([s,action,(s1[4]+height_reward+sta_reward)*ep_reward,s1,done])
                    if step > self.n_step and not pause_perceive:
                        transition = n_step_transition(episode_buffer,self.n_step,self.gamma)
                        self.perceive(transition)

                    s = s1
                    s1 = s2
                    episode_reward += reward
                    if done:
                        self.exploration_noise.reset(None)
                        break

                if self.name == 'worker_0' and episode_count % 5 == 0:
                    with open('result.txt','a') as f:
                        f.write("Episode "+str(episode_count)+" reward (training): %.2f\n" % episode_reward)

                # Testing:
                if self.name == 'worker_2' and episode_count % 10 == 0 and episode_count > 1: # change Aug19
                    if episode_count % 25 == 0 and not self.vis:
                        self.save_model(saver, episode_count)

                    total_return = 0
                    TEST = 1
                    for i in xrange(TEST):
                        state = self.env.reset()
                        a = engineered_action(seed)
                        ob = self.env.step(a)[0]
                        s = ob
                        ob = self.env.step(a)[0]
                        s1 = ob
                        s = process_state(s,s1)
                        for j in xrange(1000):
                            action = self.action(s) # direct action for test
                            s2,reward,done,_ = self.env.step(action)
                            s1 = process_state(s1,s2)
                            s = s1
                            s1 = s2
                            total_return += reward
                            if done:
                                break

                    ave_return = total_return/TEST
                    returns.append(ave_return)
                    with open('result.txt','a') as f:
                        f.write('episode: {} Evaluation(testing) Average Return: {}\n'.format(episode_count,ave_return))

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1
                # All done. Stop the trial and confirm exit.
                if episode_count == 100:
                    del self.env
                    print('Done '+self.name)
                    return