PPO
DPPO Introduction
Distributed PPO (DPPO) runs several workers in parallel to collect experience. The workers' roll-outs are paused while a single global PPO is trained on the collected data, and they restart with the updated policy afterwards. The updating rule follows DeepMind's "Emergence of Locomotion Behaviours in Rich Environments", while the clipped surrogate objective follows OpenAI's PPO paper.
PPO Implementation
1 """
2 A simple version of Proximal Policy Optimization (PPO) using single thread.
3
4 Based on:
5 1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
6 2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347]
7
8 View more on my tutorial website: https://morvanzhou.github.io/tutorials
9
10 Dependencies:
11 tensorflow r1.2
12 gym 0.9.2
13 """
14
15 import tensorflow as tf
16 import numpy as np
17 import matplotlib.pyplot as plt
18 import gym
19
20 EP_MAX = 1000
21 EP_LEN = 200
22 GAMMA = 0.9
23 A_LR = 0.0001
24 C_LR = 0.0002
25 BATCH = 32
26 A_UPDATE_STEPS = 10
27 C_UPDATE_STEPS = 10
28 S_DIM, A_DIM = 3, 1
29 METHOD = [
30 dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
31 dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
32 ][1] # choose the method for optimization
33
34
35 class PPO(object):
36
37 def __init__(self):
38 self.sess = tf.Session()
39 self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
40
41 # critic
42 with tf.variable_scope('critic'):
43 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
44 self.v = tf.layers.dense(l1, 1)
45 self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
46 self.advantage = self.tfdc_r - self.v
47 self.closs = tf.reduce_mean(tf.square(self.advantage))
48 self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
49
50 # actor
51 pi, pi_params = self._build_anet('pi', trainable=True)
52 oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
53 with tf.variable_scope('sample_action'):
54 self.sample_op = tf.squeeze(pi.sample(1), axis=0) # choosing action
55 with tf.variable_scope('update_oldpi'):
56 self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
57
58 self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
59 self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
60 with tf.variable_scope('loss'):
61 with tf.variable_scope('surrogate'):
62 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
63 ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
64 surr = ratio * self.tfadv
65 if METHOD['name'] == 'kl_pen':
66 self.tflam = tf.placeholder(tf.float32, None, 'lambda')
67 kl = tf.distributions.kl_divergence(oldpi, pi)
68 self.kl_mean = tf.reduce_mean(kl)
69 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
70 else: # clipping method, find this is better
71 self.aloss = -tf.reduce_mean(tf.minimum(
72 surr,
73 tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
74
75 with tf.variable_scope('atrain'):
76 self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
77
78 tf.summary.FileWriter("log/", self.sess.graph)
79
80 self.sess.run(tf.global_variables_initializer())
81
82 def update(self, s, a, r):
83 self.sess.run(self.update_oldpi_op)
84 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
85 # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
86
87 # update actor
88 if METHOD['name'] == 'kl_pen':
89 for _ in range(A_UPDATE_STEPS):
90 _, kl = self.sess.run(
91 [self.atrain_op, self.kl_mean],
92 {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
93 if kl > 4*METHOD['kl_target']: # this in in google's paper
94 break
95 if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
96 METHOD['lam'] /= 2
97 elif kl > METHOD['kl_target'] * 1.5:
98 METHOD['lam'] *= 2
99 METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is my solution
100 else: # clipping method, find this is better (OpenAI's paper)
101 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]
102
103 # update critic
104 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]
105
106 def _build_anet(self, name, trainable):
107 with tf.variable_scope(name):
108 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
109 mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
110 sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
111 norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
112 params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
113 return norm_dist, params
114
115 def choose_action(self, s):
116 s = s[np.newaxis, :]
117 a = self.sess.run(self.sample_op, {self.tfs: s})[0]
118 return np.clip(a, -2, 2)
119
120 def get_v(self, s):
121 if s.ndim < 2: s = s[np.newaxis, :]
122 return self.sess.run(self.v, {self.tfs: s})[0, 0]
123
124 env = gym.make('Pendulum-v0').unwrapped
125 ppo = PPO()
126 all_ep_r = []
127
128 for ep in range(EP_MAX):
129 s = env.reset()
130 buffer_s, buffer_a, buffer_r = [], [], []
131 ep_r = 0
132 for t in range(EP_LEN): # in one episode
133 env.render()
134 a = ppo.choose_action(s)
135 s_, r, done, _ = env.step(a)
136 buffer_s.append(s)
137 buffer_a.append(a)
138 buffer_r.append((r+8)/8) # normalize reward, find to be useful
139 s = s_
140 ep_r += r
141
142 # update ppo
143 if (t+1) % BATCH == 0 or t == EP_LEN-1:
144 v_s_ = ppo.get_v(s_)
145 discounted_r = []
146 for r in buffer_r[::-1]:
147 v_s_ = r + GAMMA * v_s_
148 discounted_r.append(v_s_)
149 discounted_r.reverse()
150
151 bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
152 buffer_s, buffer_a, buffer_r = [], [], []
153 ppo.update(bs, ba, br)
154 if ep == 0: all_ep_r.append(ep_r)
155 else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
156 print(
157 'Ep: %i' % ep,
158 "|Ep_r: %i" % ep_r,
159 ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
160 )
161
162 plt.plot(np.arange(len(all_ep_r)), all_ep_r)
163 plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
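For reference, the two actor objectives selected through METHOD are the ones from the cited papers. Writing r_t(theta) for the probability ratio (computed as ratio above) and \hat{A}_t for the advantage fed in through tfadv, the graph minimizes the negative of

\[
L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big],
\qquad
r_t(\theta) = \frac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t\mid s_t)},
\]

for the 'clip' method, or of

\[
L^{\mathrm{KLPEN}}(\theta) = \mathbb{E}_t\Big[r_t(\theta)\,\hat{A}_t - \lambda\,\mathrm{KL}\big[\pi_{\theta_{\mathrm{old}}}(\cdot\mid s_t)\,\|\,\pi_\theta(\cdot\mid s_t)\big]\Big]
\]

for 'kl_pen', where update() halves lambda when the measured KL drops below kl_target/1.5 and doubles it when it exceeds 1.5*kl_target.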
DPPO Code
1 """
2 A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
3
4 Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
5 Restart workers once PPO is updated.
6
7 The global PPO updating rule is adopted from DeepMind's paper (DPPO):
8 Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
9
10 View more on my tutorial website: https://morvanzhou.github.io/tutorials
11
12 Dependencies:
13 tensorflow r1.3
14 gym 0.9.2
15 """
16
17 import tensorflow as tf
18 import numpy as np
19 import matplotlib.pyplot as plt
20 import gym, threading, queue
21
22 EP_MAX = 1000
23 EP_LEN = 200
24 N_WORKER = 4 # parallel workers
25 GAMMA = 0.9 # reward discount factor
26 A_LR = 0.0001 # learning rate for actor
27 C_LR = 0.0002 # learning rate for critic
28 MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
29 UPDATE_STEP = 10 # loop update operation n-steps
30 EPSILON = 0.2 # for clipping surrogate objective
31 GAME = 'Pendulum-v0'
32 S_DIM, A_DIM = 3, 1 # state and action dimension
33
34
35 class PPO(object):
36 def __init__(self):
37 self.sess = tf.Session()
38 self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
39
40 # critic
41 l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
42 self.v = tf.layers.dense(l1, 1)
43 self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
44 self.advantage = self.tfdc_r - self.v
45 self.closs = tf.reduce_mean(tf.square(self.advantage))
46 self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
47
48 # actor
49 pi, pi_params = self._build_anet('pi', trainable=True)
50 oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
51 self.sample_op = tf.squeeze(pi.sample(1), axis=0) # operation of choosing action
52 self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
53
54 self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
55 self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
56 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
57 ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
58 surr = ratio * self.tfadv # surrogate loss
59
60 self.aloss = -tf.reduce_mean(tf.minimum( # clipped surrogate objective
61 surr,
62 tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
63
64 self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
65 self.sess.run(tf.global_variables_initializer())
66
67 def update(self):
68 global GLOBAL_UPDATE_COUNTER
69 while not COORD.should_stop():
70 if GLOBAL_EP < EP_MAX:
71 UPDATE_EVENT.wait() # wait until get batch of data
72 self.sess.run(self.update_oldpi_op) # copy pi to old pi
73 data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers
74 data = np.vstack(data)
75 s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
76 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
77 # update actor and critic in a update loop
78 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
79 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
80 UPDATE_EVENT.clear() # updating finished
81 GLOBAL_UPDATE_COUNTER = 0 # reset counter
82 ROLLING_EVENT.set() # set roll-out available
83
84 def _build_anet(self, name, trainable):
85 with tf.variable_scope(name):
86 l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
87 mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
88 sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
89 norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
90 params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
91 return norm_dist, params
92
93 def choose_action(self, s):
94 s = s[np.newaxis, :]
95 a = self.sess.run(self.sample_op, {self.tfs: s})[0]
96 return np.clip(a, -2, 2)
97
98 def get_v(self, s):
99 if s.ndim < 2: s = s[np.newaxis, :]
100 return self.sess.run(self.v, {self.tfs: s})[0, 0]
101
102
103 class Worker(object):
104 def __init__(self, wid):
105 self.wid = wid
106 self.env = gym.make(GAME).unwrapped
107 self.ppo = GLOBAL_PPO
108
109 def work(self):
110 global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
111 while not COORD.should_stop():
112 s = self.env.reset()
113 ep_r = 0
114 buffer_s, buffer_a, buffer_r = [], [], []
115 for t in range(EP_LEN):
116 if not ROLLING_EVENT.is_set(): # while global PPO is updating
117 ROLLING_EVENT.wait() # wait until PPO is updated
118 buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data
119 a = self.ppo.choose_action(s)
120 s_, r, done, _ = self.env.step(a)
121 buffer_s.append(s)
122 buffer_a.append(a)
123 buffer_r.append((r + 8) / 8) # normalize reward, find to be useful
124 s = s_
125 ep_r += r
126
127 GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers
128 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
129 v_s_ = self.ppo.get_v(s_)
130 discounted_r = [] # compute discounted reward
131 for r in buffer_r[::-1]:
132 v_s_ = r + GAMMA * v_s_
133 discounted_r.append(v_s_)
134 discounted_r.reverse()
135
136 bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
137 buffer_s, buffer_a, buffer_r = [], [], []
138 QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue
139 if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
140 ROLLING_EVENT.clear() # stop collecting data
141 UPDATE_EVENT.set() # globalPPO update
142
143 if GLOBAL_EP >= EP_MAX: # stop training
144 COORD.request_stop()
145 break
146
147 # record reward changes, plot later
148 if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
149 else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
150 GLOBAL_EP += 1
151 print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
152
153
154 if __name__ == '__main__':
155 GLOBAL_PPO = PPO()
156 UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
157 UPDATE_EVENT.clear() # not update now
158 ROLLING_EVENT.set() # start to roll out
159 workers = [Worker(wid=i) for i in range(N_WORKER)]
160
161 GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
162 GLOBAL_RUNNING_R = []
163 COORD = tf.train.Coordinator()
164 QUEUE = queue.Queue() # workers putting data in this queue
165 threads = []
166 for worker in workers: # worker threads
167 t = threading.Thread(target=worker.work, args=())
168 t.start() # training
169 threads.append(t)
170 # add a PPO updating thread
171 threads.append(threading.Thread(target=GLOBAL_PPO.update,))
172 threads[-1].start()
173 COORD.join(threads)
174
175 # plot reward change and test
176 plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
177 plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
178 env = gym.make('Pendulum-v0')
179 while True:
180 s = env.reset()
181 for t in range(300):
182 env.render()
183 s = env.step(GLOBAL_PPO.choose_action(s))[0]
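The worker/updater handshake in the DPPO script is driven entirely by the two threading.Event flags and the shared batch counter; the TensorFlow part is orthogonal to it. The following minimal sketch isolates that synchronization pattern with toy names and payloads (no TensorFlow or gym, a timeout-based shutdown instead of tf.train.Coordinator); it is an illustration under those assumptions, not part of the script above.

import threading, queue, time

MIN_BATCH_SIZE = 8
ROLLING_EVENT, UPDATE_EVENT = threading.Event(), threading.Event()
ROLLING_EVENT.set()                               # workers may roll out
UPDATE_EVENT.clear()                              # updater sleeps until a batch is ready
QUEUE = queue.Queue()
STOP = threading.Event()                          # shutdown flag (stand-in for the Coordinator)
counter = 0                                       # shared batch counter, like GLOBAL_UPDATE_COUNTER

def worker(wid):
    """Stand-in for Worker.work(): roll out only while ROLLING_EVENT is set."""
    global counter
    while not STOP.is_set():
        if not ROLLING_EVENT.wait(timeout=0.1):   # paused by the updater: recheck STOP periodically
            continue
        time.sleep(0.01)                          # pretend to step an environment
        QUEUE.put(wid)                            # stand-in for one chunk of (s, a, r) data
        counter += 1
        if counter >= MIN_BATCH_SIZE:             # enough data gathered
            ROLLING_EVENT.clear()                 # pause all workers
            UPDATE_EVENT.set()                    # wake the updater

def updater():
    """Stand-in for PPO.update(): train once a batch is ready, then release the workers."""
    global counter
    for step in range(5):                         # five toy "PPO updates"
        UPDATE_EVENT.wait()                       # sleep until a worker signals a full batch
        batch = [QUEUE.get() for _ in range(QUEUE.qsize())]
        print('update %d on %d samples' % (step, len(batch)))
        UPDATE_EVENT.clear()                      # updating finished
        counter = 0                               # reset the shared batch counter
        ROLLING_EVENT.set()                       # let workers roll out again
    STOP.set()                                    # tell workers to exit

threads = [threading.Thread(target=worker, args=(i,)) for i in range(4)]
threads.append(threading.Thread(target=updater))
for t in threads: t.start()
for t in threads: t.join()

As in the DPPO script, the counter increments are not lock-protected; for this coarse batching scheme an occasional over-count only changes the batch size slightly, which is why the original code gets away without a lock.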