[687a25]: / ddpg / gym_ddpg.py

from ddpg import *
import numpy as np  # used via np.* below; otherwise only available through the star import
import opensim as osim
from osim.http.client import Client
from osim.env import *

ENV_NAME = 'learning_to_run'
PATH = 'models/'     # directory for model checkpoints
EPISODES = 100000    # maximum number of training episodes
TEST = 1             # episodes per evaluation round
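
# Training driver for DDPG on the osim-rl 'learning_to_run' environment.
# Flow per episode: optionally warm up with a hand-engineered gait, build a
# processed, normalized state from consecutive observations, then act with
# exploration noise, holding each action for n_step environment steps and
# accumulating a shaped reward. DDPG, ei, engineered_action, process_state
# and RunningStats are assumed to be provided by the star imports above.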
def main():
    # `ei` is assumed to be an environment constructor re-exported by the
    # star imports (wrapping the osim-rl RunEnv with visualize, seed and
    # difficulty arguments).
    env = ei(True, seed=0, diff=0)
    env.reset()
    agent = DDPG(env)
    returns = []
    rewards = []
    rs = RunningStats()  # running statistics for observation normalization
    for episode in xrange(EPISODES):
        state = env.reset()
        reward_episode = []
        print "episode:", episode
        # Train
        demo = 50    # warm-up steps driven by the engineered action
        n_step = 3   # environment steps per agent action
        s, s1 = [], []
        ea = engineered_action(np.random.rand())
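        # Warm-up: with probability 0.5, drive the skeleton for `demo` steps
        # with the engineered action so the episode starts from a moving
        # gait. engineered_action is assumed to build a fixed
        # muscle-excitation vector from the random draw.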
        if np.random.rand() < 0.5:
            for i in range(demo):
                ob = env.step(ea)[0]
        # Two further steps yield a pair of consecutive observations;
        # process_state is assumed to combine them into the agent's state
        # (e.g. adding finite-difference velocity features), replacing the
        # raw ob as the state.
        ob = env.step(ea)[0]
        s = ob
        ob = env.step(ea)[0]
        s1 = ob
        s = process_state(s, s1, center=True)
        rs.normalize(s)
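        # Control loop: each policy action is perturbed with exploration
        # noise and clipped into [0.05, 0.95] (the env expects muscle
        # excitations in (0, 1)); every action is held for n_step
        # environment steps while the shaped rewards are summed.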
        for step in xrange(1000):
            ac = agent.action(s)
            print(ac)
            ac = np.clip(ac + agent.exploration_noise.noise(), 0.05, 0.95)
            temp = 0
            for i in range(n_step):
                # Re-noise and re-clip the action at every repeated step.
                ob, rew, new, _ = env.step(np.clip(ac + agent.exploration_noise.noise() * 0.2, 0.05, 0.95))
                # Shaped reward: rescale the raw reward, add 0.1 on
                # termination, subtract 1 whenever ob[2] (assumed to be the
                # pelvis height) drops below 0.70.
                rew = (rew / 0.01 + int(new) * 0.1 + int((ob[2] / 0.70) < 1.0) * -1.)
                temp += rew
                if new:
                    break
            rew = temp
            print(rew)
            # s1 still holds the raw observation from before this action;
            # pairing it with the new ob gives process_state a consecutive
            # observation pair.
            s1 = process_state(s1, ob, center=True)
            rs.normalize(s1)
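            # perceive() is assumed (as in common DDPG implementations) to
            # store the transition in the replay buffer and perform one
            # actor/critic update once enough samples have accumulated.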
            agent.perceive(s, ac, rew, s1, new)
            s = s1
            s1 = ob  # keep the latest raw observation for the next pair
            reward_episode.append(rew)
            if new:
                break
        if episode % 5 == 0:
            print("episode reward = %.2f" % sum(reward_episode))
        # Testing:
        #if episode % 1 == 0:
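        # Every 100th episode (after episode 50): checkpoint the model and
        # run TEST noise-free evaluation episodes.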
        if episode % 100 == 0 and episode > 50:
            agent.save_model(PATH, episode)
            total_return = 0
            ave_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                reward_per_step = 0
                # Same engineered warm-up and state construction as in training.
                for i in range(demo):
                    ob = env.step(ea)[0]
                ob = env.step(ea)[0]
                s = ob
                ob = env.step(ea)[0]
                s1 = ob
                s = process_state(s, s1, center=True)
                rs.normalize(s)
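                # Evaluation rollout: act greedily, without exploration noise.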
                for j in xrange(1000):
                    ac = agent.action(s)
                    temp = 0
                    for i in range(n_step):
                        ob, rew, new, _ = env.step(ac)
                        rew = (rew / 0.01 + int(new) * 0.1 + int((ob[2] / 0.70) < 1.0) * -1.)
                        temp += rew
                        if new:
                            break
                    rew = temp
                    s1 = process_state(s1, ob, center=True)
                    rs.normalize(s1)
                    s = s1
                    s1 = ob
                    total_return += rew
                    if new:
                        break
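                    # Incremental mean of the per-step shaped reward.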
                    reward_per_step += (rew - reward_per_step) / (j + 1)
                ave_reward += reward_per_step
            ave_return = total_return / TEST
            ave_reward = ave_reward / TEST
            returns.append(ave_return)
            rewards.append(ave_reward)
            print 'episode:', episode, 'Evaluation Average Return:', ave_return, 'Evaluation Average Reward:', ave_reward

if __name__ == '__main__':
    main()