Diff of /ddpg/gym_ddpg.py [000000] .. [687a25]

b/ddpg/gym_ddpg.py
from ddpg import *
import numpy as np   # np is used below but never imported here; ddpg's star import may already provide it
import opensim as osim
from osim.http.client import Client
from osim.env import *

ENV_NAME = 'learning_to_run'
PATH = 'models/'
EPISODES = 100000
TEST = 1

def main():
    env = ei(True,seed=0,diff=0)   # ei is presumably an env helper from the star imports (visualize=True, seed=0, difficulty=0)
    env.reset()
    agent = DDPG(env)

    returns = []
    rewards = []

    rs = RunningStats()   # running mean/std used to whiten processed states

    for episode in xrange(EPISODES):
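        # One episode: reset, optional engineered warm start, then up to 1000
        # agent-controlled steps of n-step DDPG training.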
        state = env.reset()
        reward_episode = []
        print "episode:",episode
        # Train
        demo = 50     # number of warm-up steps driven by the engineered action
        n_step = 3    # accumulate rewards over 3 env steps per agent action
        s,s1 = [],[]
        ea = engineered_action(np.random.rand())
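        # Warm start: with probability 0.5, drive the skeleton with the
        # engineered gait for `demo` steps before the agent takes over,
        # presumably to begin the episode with forward momentum.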
        if np.random.rand() < 0.5:
            for i in range(demo):
                ob = env.step(ea)[0]
        # Two extra engineered steps so process_state sees a pair of
        # consecutive observations.
        ob = env.step(ea)[0]
        s = ob
        ob = env.step(ea)[0]
        s1 = ob
        s = process_state(s,s1,center=True)  # s, the processed state, replaces the raw ob
        rs.normalize(s)
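        # Rollout: each iteration picks one noisy action, holds it (with fresh
        # noise) for up to n_step env steps, and stores the summed shaped
        # reward as a single transition.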
        for step in xrange(1000):
            ac = agent.action(s)
            print(ac)
            ac = np.clip(ac + agent.exploration_noise.noise(),0.05,0.95)
            temp = 0
            for i in range(n_step):
                # repeat the action with extra exploration noise, clipped to the valid range
                ob, rew, new, _ = env.step(np.clip(ac + agent.exploration_noise.noise()*0.2,0.05,0.95))
                # shaped reward: rescale the raw reward by 1/0.01, small bonus at
                # termination, -1 penalty while ob[2] (presumably pelvis height) is below 0.70
                rew = (rew/0.01 + int(new) * 0.1 + int((ob[2]/0.70)<1.0) * -1.)
                temp += rew
                if new:
                    break
                s1 = ob
            rew = temp
            print(rew)
            s1 = process_state(s1,ob,center=True)
            rs.normalize(s1)
            agent.perceive(s,ac,rew,s1,new)  # store the transition; presumably also triggers a training step
            s = s1
            s1 = ob
            reward_episode.append(rew)
            if new:
                break

        if episode % 5 == 0:
            print("episode reward = %.2f" % sum(reward_episode))
        # Testing:
        #if episode % 1 == 0:
        if episode % 100 == 0 and episode > 50:
            agent.save_model(PATH, episode)

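            # Evaluation: repeat the engineered warm start, then run the
            # policy without exploration noise for TEST episodes.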
            total_return = 0
            ave_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                reward_per_step = 0
                for i in range(demo):
                    ob = env.step(ea)[0]
                ob = env.step(ea)[0]
                s = ob
                ob = env.step(ea)[0]
                s1 = ob
                s = process_state(s,s1,center=True)  # s, the processed state, replaces the raw ob
                rs.normalize(s)
                for j in xrange(1000):
                    ac = agent.action(s)   # deterministic policy, no exploration noise
                    temp = 0
                    for i in range(n_step):
                        ob, rew, new, _ = env.step(ac)
                        rew = (rew/0.01 + int(new) * 0.1 + int((ob[2]/0.70)<1.0) * -1.)
                        temp += rew
                        if new:
                            break
                        s1 = ob
                    rew = temp
                    s1 = process_state(s1,ob,center=True)
                    rs.normalize(s1)
                    s = s1
                    s1 = ob
                    total_return += rew
                    if new:
                        break
                    reward_per_step += (rew - reward_per_step)/(j+1)  # running mean of per-step reward
                ave_reward += reward_per_step

            ave_return = total_return/TEST
            ave_reward = ave_reward/TEST
            returns.append(ave_return)
            rewards.append(ave_reward)

            print 'episode:',episode,'Evaluation Average Return:',ave_return,'Evaluation Average Reward:',ave_reward

if __name__ == '__main__':
    main()