Deep Reinforcement Learning -- DPPO


    PPO

    Introduction to DPPO

    PPO Implementation

      1 """
      2 A simple version of Proximal Policy Optimization (PPO) using single thread.
      3 
      4 Based on:
      5 1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
      6 2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347]
      7 
      8 View more on my tutorial website: https://morvanzhou.github.io/tutorials
      9 
     10 Dependencies:
     11 tensorflow r1.2
     12 gym 0.9.2
     13 """
     14 
     15 import tensorflow as tf
     16 import numpy as np
     17 import matplotlib.pyplot as plt
     18 import gym
     19 
     20 EP_MAX = 1000
     21 EP_LEN = 200
     22 GAMMA = 0.9
     23 A_LR = 0.0001
     24 C_LR = 0.0002
     25 BATCH = 32
     26 A_UPDATE_STEPS = 10
     27 C_UPDATE_STEPS = 10
     28 S_DIM, A_DIM = 3, 1
     29 METHOD = [
     30     dict(name='kl_pen', kl_target=0.01, lam=0.5),   # KL penalty
     31     dict(name='clip', epsilon=0.2),                 # Clipped surrogate objective, find this is better
     32 ][1]        # choose the method for optimization
     33 
     34 
     35 class PPO(object):
     36 
     37     def __init__(self):
     38         self.sess = tf.Session()
     39         self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
     40 
     41         # critic
     42         with tf.variable_scope('critic'):
     43             l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
     44             self.v = tf.layers.dense(l1, 1)
     45             self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
     46             self.advantage = self.tfdc_r - self.v
     47             self.closs = tf.reduce_mean(tf.square(self.advantage))
     48             self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
     49 
     50         # actor
     51         pi, pi_params = self._build_anet('pi', trainable=True)
     52         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
     53         with tf.variable_scope('sample_action'):
     54             self.sample_op = tf.squeeze(pi.sample(1), axis=0)       # choosing action
     55         with tf.variable_scope('update_oldpi'):
     56             self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
     57 
     58         self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
     59         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
     60         with tf.variable_scope('loss'):
     61             with tf.variable_scope('surrogate'):
     62                 # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
     63                 ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
     64                 surr = ratio * self.tfadv
     65             if METHOD['name'] == 'kl_pen':
     66                 self.tflam = tf.placeholder(tf.float32, None, 'lambda')
     67                 kl = tf.distributions.kl_divergence(oldpi, pi)
     68                 self.kl_mean = tf.reduce_mean(kl)
     69                 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
     70             else:   # clipping method, find this is better
     71                 self.aloss = -tf.reduce_mean(tf.minimum(
     72                     surr,
     73                     tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
     74 
     75         with tf.variable_scope('atrain'):
     76             self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
     77 
     78         tf.summary.FileWriter("log/", self.sess.graph)
     79 
     80         self.sess.run(tf.global_variables_initializer())
     81 
     82     def update(self, s, a, r):
     83         self.sess.run(self.update_oldpi_op)
     84         adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
     85         # adv = (adv - adv.mean())/(adv.std()+1e-6)     # sometimes helpful
     86 
     87         # update actor
     88         if METHOD['name'] == 'kl_pen':
     89             for _ in range(A_UPDATE_STEPS):
     90                 _, kl = self.sess.run(
     91                     [self.atrain_op, self.kl_mean],
     92                     {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
     93                 if kl > 4*METHOD['kl_target']:  # this in in google's paper
     94                     break
     95             if kl < METHOD['kl_target'] / 1.5:  # adaptive lambda, this is in OpenAI's paper
     96                 METHOD['lam'] /= 2
     97             elif kl > METHOD['kl_target'] * 1.5:
     98                 METHOD['lam'] *= 2
     99             METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)    # sometimes explode, this clipping is my solution
    100         else:   # clipping method, find this is better (OpenAI's paper)
    101             [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]
    102 
    103         # update critic
    104         [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]
    105 
    106     def _build_anet(self, name, trainable):
    107         with tf.variable_scope(name):
    108             l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
    109             mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
    110             sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
    111             norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
    112         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
    113         return norm_dist, params
    114 
    115     def choose_action(self, s):
    116         s = s[np.newaxis, :]
    117         a = self.sess.run(self.sample_op, {self.tfs: s})[0]
    118         return np.clip(a, -2, 2)
    119 
    120     def get_v(self, s):
    121         if s.ndim < 2: s = s[np.newaxis, :]
    122         return self.sess.run(self.v, {self.tfs: s})[0, 0]
    123 
    124 env = gym.make('Pendulum-v0').unwrapped
    125 ppo = PPO()
    126 all_ep_r = []
    127 
    128 for ep in range(EP_MAX):
    129     s = env.reset()
    130     buffer_s, buffer_a, buffer_r = [], [], []
    131     ep_r = 0
    132     for t in range(EP_LEN):    # in one episode
    133         env.render()
    134         a = ppo.choose_action(s)
    135         s_, r, done, _ = env.step(a)
    136         buffer_s.append(s)
    137         buffer_a.append(a)
    138         buffer_r.append((r+8)/8)    # normalize reward, find to be useful
    139         s = s_
    140         ep_r += r
    141 
    142         # update ppo
    143         if (t+1) % BATCH == 0 or t == EP_LEN-1:
    144             v_s_ = ppo.get_v(s_)
    145             discounted_r = []
    146             for r in buffer_r[::-1]:
    147                 v_s_ = r + GAMMA * v_s_
    148                 discounted_r.append(v_s_)
    149             discounted_r.reverse()
    150 
    151             bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
    152             buffer_s, buffer_a, buffer_r = [], [], []
    153             ppo.update(bs, ba, br)
    154     if ep == 0: all_ep_r.append(ep_r)
    155     else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
    156     print(
    157         'Ep: %i' % ep,
    158         "|Ep_r: %i" % ep_r,
    159         ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
    160     )
    161 
    162 plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    163 plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
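
    For reference, the actor loss built above corresponds to the two standard PPO objectives; the formulas below simply restate the OpenAI/DeepMind papers and are not extra code from this post. With probability ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t) (the `ratio` tensor) and advantage estimate A_hat_t (the `tfadv` placeholder), the code maximizes, by minimizing the negated mean with Adam:

        L^{CLIP}(\theta) = \hat{\mathbb{E}}_t\left[\min\left(r_t(\theta)\hat{A}_t,\ \operatorname{clip}\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right]

        L^{KLPEN}(\theta) = \hat{\mathbb{E}}_t\left[r_t(\theta)\hat{A}_t - \lambda\,\mathrm{KL}\!\left[\pi_{\theta_{\text{old}}}(\cdot\mid s_t),\ \pi_\theta(\cdot\mid s_t)\right]\right]

    Here epsilon is METHOD['epsilon'] (0.2), lambda is the self.tflam placeholder, and the advantage is approximated by the critic residual discounted_r - v(s). In the KL-penalty branch, lambda is doubled when the measured KL exceeds 1.5 * kl_target and halved when it drops below kl_target / 1.5.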

    DPPO Code

      1 """
      2 A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
      3 
      4 Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
      5 Restart workers once PPO is updated.
      6 
      7 The global PPO updating rule is adopted from DeepMind's paper (DPPO):
      8 Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
      9 
     10 View more on my tutorial website: https://morvanzhou.github.io/tutorials
     11 
     12 Dependencies:
     13 tensorflow r1.3
     14 gym 0.9.2
     15 """
     16 
     17 import tensorflow as tf
     18 import numpy as np
     19 import matplotlib.pyplot as plt
     20 import gym, threading, queue
     21 
     22 EP_MAX = 1000
     23 EP_LEN = 200
     24 N_WORKER = 4                # parallel workers
     25 GAMMA = 0.9                 # reward discount factor
     26 A_LR = 0.0001               # learning rate for actor
     27 C_LR = 0.0002               # learning rate for critic
     28 MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
     29 UPDATE_STEP = 10            # loop update operation n-steps
     30 EPSILON = 0.2               # for clipping surrogate objective
     31 GAME = 'Pendulum-v0'
     32 S_DIM, A_DIM = 3, 1         # state and action dimension
     33 
     34 
     35 class PPO(object):
     36     def __init__(self):
     37         self.sess = tf.Session()
     38         self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
     39 
     40         # critic
     41         l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
     42         self.v = tf.layers.dense(l1, 1)
     43         self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
     44         self.advantage = self.tfdc_r - self.v
     45         self.closs = tf.reduce_mean(tf.square(self.advantage))
     46         self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
     47 
     48         # actor
     49         pi, pi_params = self._build_anet('pi', trainable=True)
     50         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
     51         self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # operation of choosing action
     52         self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
     53 
     54         self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
     55         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
     56         # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
     57         ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
     58         surr = ratio * self.tfadv                       # surrogate loss
     59 
     60         self.aloss = -tf.reduce_mean(tf.minimum(        # clipped surrogate objective
     61             surr,
     62             tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
     63 
     64         self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
     65         self.sess.run(tf.global_variables_initializer())
     66 
     67     def update(self):
     68         global GLOBAL_UPDATE_COUNTER
     69         while not COORD.should_stop():
     70             if GLOBAL_EP < EP_MAX:
     71                 UPDATE_EVENT.wait()                     # wait until get batch of data
     72                 self.sess.run(self.update_oldpi_op)     # copy pi to old pi
     73                 data = [QUEUE.get() for _ in range(QUEUE.qsize())]      # collect data from all workers
     74                 data = np.vstack(data)
     75                 s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
     76                 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
     77                 # update actor and critic in a update loop
     78                 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
     79                 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
     80                 UPDATE_EVENT.clear()        # updating finished
     81                 GLOBAL_UPDATE_COUNTER = 0   # reset counter
     82                 ROLLING_EVENT.set()         # set roll-out available
     83 
     84     def _build_anet(self, name, trainable):
     85         with tf.variable_scope(name):
     86             l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
     87             mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
     88             sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
     89             norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
     90         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
     91         return norm_dist, params
     92 
     93     def choose_action(self, s):
     94         s = s[np.newaxis, :]
     95         a = self.sess.run(self.sample_op, {self.tfs: s})[0]
     96         return np.clip(a, -2, 2)
     97 
     98     def get_v(self, s):
     99         if s.ndim < 2: s = s[np.newaxis, :]
    100         return self.sess.run(self.v, {self.tfs: s})[0, 0]
    101 
    102 
    103 class Worker(object):
    104     def __init__(self, wid):
    105         self.wid = wid
    106         self.env = gym.make(GAME).unwrapped
    107         self.ppo = GLOBAL_PPO
    108 
    109     def work(self):
    110         global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
    111         while not COORD.should_stop():
    112             s = self.env.reset()
    113             ep_r = 0
    114             buffer_s, buffer_a, buffer_r = [], [], []
    115             for t in range(EP_LEN):
    116                 if not ROLLING_EVENT.is_set():                  # while global PPO is updating
    117                     ROLLING_EVENT.wait()                        # wait until PPO is updated
    118                     buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer, use new policy to collect data
    119                 a = self.ppo.choose_action(s)
    120                 s_, r, done, _ = self.env.step(a)
    121                 buffer_s.append(s)
    122                 buffer_a.append(a)
    123                 buffer_r.append((r + 8) / 8)                    # normalize reward, find to be useful
    124                 s = s_
    125                 ep_r += r
    126 
    127                 GLOBAL_UPDATE_COUNTER += 1                      # count to minimum batch size, no need to wait other workers
    128                 if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
    129                     v_s_ = self.ppo.get_v(s_)
    130                     discounted_r = []                           # compute discounted reward
    131                     for r in buffer_r[::-1]:
    132                         v_s_ = r + GAMMA * v_s_
    133                         discounted_r.append(v_s_)
    134                     discounted_r.reverse()
    135 
    136                     bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
    137                     buffer_s, buffer_a, buffer_r = [], [], []
    138                     QUEUE.put(np.hstack((bs, ba, br)))          # put data in the queue
    139                     if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
    140                         ROLLING_EVENT.clear()       # stop collecting data
    141                         UPDATE_EVENT.set()          # globalPPO update
    142 
    143                     if GLOBAL_EP >= EP_MAX:         # stop training
    144                         COORD.request_stop()
    145                         break
    146 
    147             # record reward changes, plot later
    148             if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
    149             else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
    150             GLOBAL_EP += 1
    151             print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid,  '|Ep_r: %.2f' % ep_r,)
    152 
    153 
    154 if __name__ == '__main__':
    155     GLOBAL_PPO = PPO()
    156     UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    157     UPDATE_EVENT.clear()            # not update now
    158     ROLLING_EVENT.set()             # start to roll out
    159     workers = [Worker(wid=i) for i in range(N_WORKER)]
    160     
    161     GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    162     GLOBAL_RUNNING_R = []
    163     COORD = tf.train.Coordinator()
    164     QUEUE = queue.Queue()           # workers putting data in this queue
    165     threads = []
    166     for worker in workers:          # worker threads
    167         t = threading.Thread(target=worker.work, args=())
    168         t.start()                   # training
    169         threads.append(t)
    170     # add a PPO updating thread
    171     threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    172     threads[-1].start()
    173     COORD.join(threads)
    174 
    175     # plot reward change and test
    176     plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    177     plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
    178     env = gym.make('Pendulum-v0')
    179     while True:
    180         s = env.reset()
    181         for t in range(300):
    182             env.render()
    183             s = env.step(GLOBAL_PPO.choose_action(s))[0]
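
    The docstring at the top of this script summarizes the coordination scheme: workers roll out in parallel until MIN_BATCH_SIZE transitions have been queued, the updater thread then trains the global PPO while the workers pause, and rolling resumes once the update finishes. Below is a minimal standalone sketch of just that Event/Queue hand-off with TensorFlow stripped out; the sleep, the print standing in for the train ops, and the explicit counter lock are illustrative stand-ins, not part of the original script.

        # Minimal sketch of the worker/updater synchronization used above.
        # The queued payloads and the "training" step are placeholders; only
        # the Event/Queue/counter hand-off mirrors the DPPO script.
        import threading, queue, time

        N_WORKER = 2
        MIN_BATCH_SIZE = 8
        TOTAL_UPDATES = 3

        UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
        ROLLING_EVENT.set()                           # workers may roll out
        QUEUE = queue.Queue()
        COUNTER_LOCK = threading.Lock()               # the original relies on the GIL; an explicit lock is safer
        GLOBAL_UPDATE_COUNTER = 0
        STOP = threading.Event()

        def worker(wid):
            global GLOBAL_UPDATE_COUNTER
            while not STOP.is_set():
                if not ROLLING_EVENT.wait(timeout=0.1):   # pause while the updater runs
                    continue
                time.sleep(0.01)                          # stands in for choose_action + env.step
                QUEUE.put((wid, time.time()))             # stands in for np.hstack((bs, ba, br))
                with COUNTER_LOCK:
                    GLOBAL_UPDATE_COUNTER += 1
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()             # stop collecting data
                        UPDATE_EVENT.set()                # wake the updater

        def updater():
            global GLOBAL_UPDATE_COUNTER
            for i in range(TOTAL_UPDATES):
                UPDATE_EVENT.wait()                       # wait until a full batch is queued
                batch = [QUEUE.get() for _ in range(QUEUE.qsize())]
                print('update %d on %d samples' % (i, len(batch)))   # stands in for the train ops
                with COUNTER_LOCK:
                    GLOBAL_UPDATE_COUNTER = 0             # reset counter
                UPDATE_EVENT.clear()                      # updating finished
                ROLLING_EVENT.set()                       # let the workers resume
            STOP.set()
            ROLLING_EVENT.set()                           # release any waiting workers so they can exit

        if __name__ == '__main__':
            threads = [threading.Thread(target=worker, args=(i,)) for i in range(N_WORKER)]
            threads.append(threading.Thread(target=updater))
            for t in threads: t.start()
            for t in threads: t.join()
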
Original article: https://www.cnblogs.com/buyizhiyou/p/10251933.html