ddpg/gym_ddpg.py

from ddpg import *
import numpy as np
import opensim as osim
from osim.http.client import Client
from osim.env import *

# DDPG, RunningStats, process_state and engineered_action are assumed to be
# provided by the ddpg package imported above.

ENV_NAME = 'learning_to_run'
PATH = 'models/'
EPISODES = 100000
TEST = 1

def main():
    # Learning to Run environment from osim-rl: visualisation on, fixed seed, flat difficulty.
    env = RunEnv(visualize=True)
    env.reset(difficulty=0, seed=0)
    agent = DDPG(env)

    returns = []
    rewards = []

    rs = RunningStats()

    for episode in xrange(EPISODES):
        state = env.reset()
        reward_episode = []
        print "episode:", episode
        # Train
        demo = 50      # warm-up steps driven by the engineered action
        n_step = 3     # number of env steps each chosen action is held for
        s, s1 = [], []
        ea = engineered_action(np.random.rand())
        # With probability 0.5, push the skeleton forward with the engineered
        # action before handing control to the agent.
        if np.random.rand() < 0.5:
            for i in range(demo):
                ob = env.step(ea)[0]
        ob = env.step(ea)[0]
        s = ob
        ob = env.step(ea)[0]
        s1 = ob
        s = process_state(s, s1, center=True)  # processed state built from two consecutive raw observations
        rs.normalize(s)
        for step in xrange(1000):
            ac = agent.action(s)
            print(ac)
            ac = np.clip(ac + agent.exploration_noise.noise(), 0.05, 0.95)
            temp = 0
            for i in range(n_step):
                ob, rew, new, _ = env.step(np.clip(ac + agent.exploration_noise.noise() * 0.2, 0.05, 0.95))
                # Reward shaping: scale the raw reward by 100, add a small bonus on
                # termination and subtract 1 when ob[2] (pelvis height) drops below 0.70.
                rew = (rew / 0.01 + int(new) * 0.1 + int((ob[2] / 0.70) < 1.0) * -1.)
                temp += rew
                if new:
                    break
            rew = temp
            print(rew)
            s1 = process_state(s1, ob, center=True)  # previous raw observation + newest one -> next state
            rs.normalize(s1)
            agent.perceive(s, ac, rew, s1, new)
            s = s1
            s1 = ob  # keep the raw observation for the next state construction
            reward_episode.append(rew)
            if new:
                break

        if episode % 5 == 0:
            print("episode reward = %.2f" % sum(reward_episode))

        # Testing:
        # if episode % 1 == 0:
        if episode % 100 == 0 and episode > 50:
            agent.save_model(PATH, episode)

            total_return = 0
            ave_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                reward_per_step = 0
                for i in range(demo):
                    ob = env.step(ea)[0]
                ob = env.step(ea)[0]
                s = ob
                ob = env.step(ea)[0]
                s1 = ob
                s = process_state(s, s1, center=True)
                rs.normalize(s)
                for j in xrange(1000):
                    ac = agent.action(s)
                    temp = 0
                    for i in range(n_step):
                        ob, rew, new, _ = env.step(ac)
                        rew = (rew / 0.01 + int(new) * 0.1 + int((ob[2] / 0.70) < 1.0) * -1.)
                        temp += rew
                        if new:
                            break
                    rew = temp
                    s1 = process_state(s1, ob, center=True)
                    rs.normalize(s1)
                    s = s1
                    s1 = ob
                    total_return += rew
                    if new:
                        break
                    reward_per_step += (rew - reward_per_step) / (j + 1)
                ave_reward += reward_per_step

            ave_return = total_return / TEST
            ave_reward = ave_reward / TEST
            returns.append(ave_return)
            rewards.append(ave_reward)

            print 'episode:', episode, 'Evaluation Average Return:', ave_return, 'Evaluation Average Reward:', ave_reward


if __name__ == '__main__':
    main()
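
The helpers used above (RunningStats, process_state, engineered_action) are star-imported rather than defined in this file. The sketch below only illustrates the interfaces the script relies on, written against plausible assumptions about their behaviour; the names and signatures match the calls above, but the bodies are guesses, not the package's actual code.

# Hypothetical sketch only: these are NOT the repository's implementations.
import numpy as np

class RunningStats(object):
    # Running mean/variance tracker; normalize() is assumed to whiten the state
    # vector in place, which matches the call-and-discard usage in gym_ddpg.py.
    def __init__(self):
        self.n = 0
        self.mean = None
        self.var = None

    def normalize(self, s):
        # s is assumed to be a float64 numpy array (as produced by process_state);
        # it is whitened in place and also returned for convenience.
        if self.mean is None:
            self.mean = np.zeros_like(s)
            self.var = np.ones_like(s)
        self.n += 1
        delta = s - self.mean
        self.mean += delta / self.n
        self.var += (delta * (s - self.mean) - self.var) / self.n
        s[:] = (s - self.mean) / (np.sqrt(self.var) + 1e-8)
        return s

def process_state(prev_ob, ob, center=True):
    # Assumed behaviour: newest observation plus a finite-difference velocity
    # estimate; `center` is taken to mean dropping the absolute pelvis x position
    # so the state is translation-invariant.
    prev_ob = np.asarray(prev_ob, dtype=np.float64)
    ob = np.asarray(ob, dtype=np.float64).copy()
    vel = ob - prev_ob
    if center:
        ob[1] = 0.0
    return np.concatenate([ob, vel])

def engineered_action(seed):
    # Placeholder: assumed to return a hand-crafted 18-dimensional muscle
    # excitation that pushes the skeleton into its first step.
    return np.ones(18) * 0.05

Note that normalize() has to mutate its argument for the call-and-discard pattern in the training and test loops to have any effect, which is why the sketch whitens the array in place.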