• Reinforcement Learning -- DDPG -- TensorFlow implementation


    Complete code: https://github.com/zle1992/Reinforcement_Learning_Game

    Paper: "Continuous control with deep reinforcement learning" https://arxiv.org/pdf/1509.02971.pdf

    Deep Deterministic Policy Gradient

    DDPG vs. AC (Actor-Critic):

    AC:

      Actor: updates its parameters with the td_error, which comes from the Critic.

      Critic: updates its gradient according to the Bellman equation of the state-value function V(s).

    DDPG:

      Actor: maximizes q, and outputs the action.

      Critic: updates its gradient according to the Bellman equation of the action-value function Q(s,a), and outputs the q value.

    DDPG only produces continuous action outputs, i.e. it is designed for continuous action spaces.
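
    To make the contrast concrete, below is a minimal TF1-style sketch of the two actor losses; the tensors log_prob, td_error and q_eval are illustrative placeholders, not names from the repository:

      import tensorflow as tf

      # Illustrative placeholders, not part of the repository code.
      log_prob = tf.placeholder(tf.float32, [None], name='log_prob')  # log pi(a|s) from a stochastic AC actor
      td_error = tf.placeholder(tf.float32, [None], name='td_error')  # r + gamma*V(s_) - V(s) from the AC critic
      q_eval = tf.placeholder(tf.float32, [None], name='q_eval')      # Q(s, mu(s)) from the DDPG critic

      # AC actor: policy-gradient loss weighted by the critic's TD error.
      ac_actor_loss = -tf.reduce_mean(log_prob * td_error)

      # DDPG actor: deterministic policy gradient -- maximize Q(s, mu(s)), i.e. minimize -Q.
      ddpg_actor_loss = -tf.reduce_mean(q_eval)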

    Logic walkthrough:

    1. DDPG is an Actor-Critic (AC) model; its input consists of (S, R, S_, A).

    2. Actor

    input: (S)

    output: a

    loss: maximize q (implemented as minimizing -q)

    q comes from the Critic.

    3. Critic

    input: S, A

    output: q

    loss: the TD error R + GAMMA * q_ - q, minimized as a mean squared error

    Now the question: how do we get q_? ----> The Critic network could take (S_, a_) and produce q_, but we cannot use the same network for this, so, using a time-lagged copy of it, we introduce Critic2 (not trainable): the target critic.

    Critic2 in turn needs a_; how do we get that? ----> The Actor network can take S_ and output a_; likewise, we use Actor2 (not trainable), the target actor, to get a_.
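
    Written out as formulas (the standard DDPG objectives, consistent with the reasoning above; Q' and \mu' denote the non-trainable target copies Critic2 and Actor2):

      y          = r + \gamma \, Q'(s', \mu'(s'))
      L_{critic} = \big( y - Q(s, a) \big)^2
      L_{actor}  = - Q(s, \mu(s))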

    Flow:

    a  = actor(s,  train)

    a_ = actor(s_, not_train)

    q  = critic(s,  a,  train)

    q_ = critic(s_, a_, not_train)

    a_loss = max(q)                # implemented as -mean(q)

    c_loss = R + GAMMA * q_ - q    # minimized as a mean squared error
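
    One thing the flow above does not show is how the not_train copies (Actor2/Critic2) track the trained networks: at every learning step, DDPG softly mixes the trained weights into the target weights, theta_target <- TAU * theta_eval + (1 - TAU) * theta_target (the soft_replace op in the code below). A minimal runnable TF1 sketch of just this update, with illustrative variable names:

      import tensorflow as tf

      tau = 0.01                                                # soft-replacement rate (TAU in the code below)
      eval_w = tf.Variable(tf.ones([3]), name='eval_w')         # trained ("eval") parameter
      target_w = tf.Variable(tf.zeros([3]), name='target_w')    # lagged ("target") copy
      soft_update = tf.assign(target_w, (1 - tau) * target_w + tau * eval_w)

      with tf.Session() as sess:
          sess.run(tf.global_variables_initializer())
          print(sess.run(soft_update))  # target takes a small step towards eval: [0.01 0.01 0.01]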

     

    Code:

    DDPG.py

      import os
      import numpy as np
      import tensorflow as tf
      from abc import ABCMeta, abstractmethod
      np.random.seed(1)
      tf.set_random_seed(1)

      import logging
      logging.basicConfig(level=logging.DEBUG,
                          format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')  # configure the log output format
      # Because the log level is set to DEBUG, all messages below will be printed to the console

      tfconfig = tf.ConfigProto()
      tfconfig.gpu_options.allow_growth = True
      session = tf.Session(config=tfconfig)


      class DDPG(object):
          __metaclass__ = ABCMeta
          """docstring for DDPG"""
          def __init__(self,
                  n_actions,
                  n_features,
                  reward_decay,
                  lr_a,
                  lr_c,
                  memory_size,
                  output_graph,
                  log_dir,
                  model_dir,
                  TAU,
                  a_bound,
                  ):
              super(DDPG, self).__init__()

              self.n_actions = n_actions
              self.n_features = n_features
              self.gamma = reward_decay
              self.memory_size = memory_size
              self.output_graph = output_graph
              self.lr_a = lr_a
              self.lr_c = lr_c
              self.log_dir = log_dir

              self.model_dir = model_dir
              # total learning step
              self.learn_step_counter = 0
              self.TAU = TAU     # soft replacement
              self.a_bound = a_bound

              self.s = tf.placeholder(tf.float32, [None] + self.n_features, name='s')
              self.s_next = tf.placeholder(tf.float32, [None] + self.n_features, name='s_next')
              self.r = tf.placeholder(tf.float32, [None, ], name='r')

              # self.a = tf.placeholder(tf.int32, [None, 1], name='a')

              with tf.variable_scope('Actor'):
                  self.a = self._build_a_net(self.s, scope='eval', trainable=True)
                  a_ = self._build_a_net(self.s_next, scope='target', trainable=False)

              with tf.variable_scope('Critic'):
                  q = self._build_c_net(self.s, self.a, scope='eval', trainable=True)
                  q_ = self._build_c_net(self.s_next, a_, scope='target', trainable=False)

              # networks parameters
              self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
              self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
              self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
              self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

              with tf.variable_scope('train_op_actor'):
                  # maximize Q(s, mu(s)), i.e. minimize -Q
                  self.loss_actor = -tf.reduce_mean(q)
                  self.train_op_actor = tf.train.AdamOptimizer(self.lr_a).minimize(self.loss_actor, var_list=self.ae_params)

              with tf.variable_scope('train_op_critic'):
                  # TD target computed from the target networks
                  q_target = self.r + self.gamma * q_
                  self.loss_critic = tf.losses.mean_squared_error(labels=q_target, predictions=q)
                  self.train_op_critic = tf.train.AdamOptimizer(self.lr_c).minimize(self.loss_critic, var_list=self.ce_params)

              # target net replacement (soft update)
              self.soft_replace = [tf.assign(t, (1 - self.TAU) * t + self.TAU * e)
                                   for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]

              self.sess = tf.Session()
              if self.output_graph:
                  tf.summary.FileWriter(self.log_dir, self.sess.graph)

              self.sess.run(tf.global_variables_initializer())

              self.cost_his = [0]
              self.cost = 0

              self.saver = tf.train.Saver()

              if not os.path.exists(self.model_dir):
                  os.mkdir(self.model_dir)

              checkpoint = tf.train.get_checkpoint_state(self.model_dir)
              if checkpoint and checkpoint.model_checkpoint_path:
                  self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
                  print("Loading Successfully")
                  self.learn_step_counter = int(checkpoint.model_checkpoint_path.split('-')[-1]) + 1

          @abstractmethod
          def _build_a_net(self, s, scope, trainable):
              raise NotImplementedError

          @abstractmethod
          def _build_c_net(self, s, a, scope, trainable):
              raise NotImplementedError

          def learn(self, data):
              # soft target replacement
              self.sess.run(self.soft_replace)

              batch_memory_s = data['s']
              batch_memory_a = data['a']
              batch_memory_r = data['r']
              batch_memory_s_ = data['s_']

              # update the actor: only s is needed, since a is produced by the actor itself
              _, cost = self.sess.run(
                  [self.train_op_actor, self.loss_actor],
                  feed_dict={
                      self.s: batch_memory_s,
                  })

              # update the critic with the TD target r + gamma * Q'(s_, mu'(s_))
              _, cost = self.sess.run(
                  [self.train_op_critic, self.loss_critic],
                  feed_dict={
                      self.s: batch_memory_s,
                      self.a: batch_memory_a,
                      self.r: batch_memory_r,
                      self.s_next: batch_memory_s_,
                  })

              self.cost_his.append(cost)
              self.cost = cost
              self.learn_step_counter += 1
              # save the network every 10000 iterations
              if self.learn_step_counter % 10000 == 0:
                  self.saver.save(self.sess, self.model_dir, global_step=self.learn_step_counter)

          def choose_action(self, s):
              return self.sess.run(self.a, {self.s: s[np.newaxis, :]})[0]
              # s = s[np.newaxis, :]
              # probs = self.sess.run(self.acts_prob, feed_dict={self.s: s})
              # return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())

    game.py

      import sys
      import gym
      import numpy as np
      import tensorflow as tf
      sys.path.append('./')
      sys.path.append('model')

      from util import Memory, StateProcessor
      from DDPG import DDPG
      from ACNetwork import ACNetwork
      np.random.seed(1)
      tf.set_random_seed(1)

      import logging
      logging.basicConfig(level=logging.DEBUG,
                          format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')  # configure the log output format
      # Because the log level is set to DEBUG, all messages below will be printed to the console
      import os
      os.environ["CUDA_VISIBLE_DEVICES"] = "1"
      tfconfig = tf.ConfigProto()
      tfconfig.gpu_options.allow_growth = True
      session = tf.Session(config=tfconfig)


      class DDPG4Pendulum(DDPG):
          """docstring for DDPG4Pendulum"""
          def __init__(self, **kwargs):
              super(DDPG4Pendulum, self).__init__(**kwargs)

          def _build_a_net(self, s, scope, trainable):
              w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
              # w_initializer, b_initializer = None, None
              with tf.variable_scope(scope):
                  e1 = tf.layers.dense(inputs=s,
                          units=30,
                          bias_initializer=b_initializer,
                          kernel_initializer=w_initializer,
                          activation=tf.nn.relu,
                          trainable=trainable)
                  a = tf.layers.dense(inputs=e1,
                          units=self.n_actions,
                          bias_initializer=b_initializer,
                          kernel_initializer=w_initializer,
                          activation=tf.nn.tanh,
                          trainable=trainable)

              # scale the tanh output to the action bound of the environment
              return tf.multiply(a, self.a_bound, name='scaled_a')

          def _build_c_net(self, s, a, scope, trainable):
              w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

              with tf.variable_scope(scope):
                  n_l1 = 30
                  w1_s = tf.get_variable('w1_s', self.n_features + [n_l1], trainable=trainable)
                  w1_a = tf.get_variable('w1_a', [self.n_actions, n_l1], trainable=trainable)
                  b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
                  net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

                  q = tf.layers.dense(inputs=net,
                          units=1,
                          bias_initializer=b_initializer,
                          kernel_initializer=w_initializer,
                          activation=None,
                          trainable=trainable)

              return q


      batch_size = 32

      memory_size = 10000
      env = gym.make('Pendulum-v0')  # continuous action space


      n_features = [env.observation_space.shape[0]]
      n_actions = env.action_space.shape[0]
      a_bound = env.action_space.high
      env = env.unwrapped
      MAX_EP_STEPS = 200


      def run():

          RL = DDPG4Pendulum(
              n_actions=n_actions,
              n_features=n_features,
              reward_decay=0.9,
              lr_a=0.001,
              lr_c=0.002,
              memory_size=memory_size,
              TAU=0.01,
              output_graph=False,
              log_dir='Pendulum/log/DDPG4Pendulum/',
              a_bound=a_bound,
              model_dir='Pendulum/model_dir/DDPG4Pendulum/'
              )

          memory = Memory(n_actions, n_features, memory_size=memory_size)

          var = 3  # control exploration
          step = 0

          for episode in range(2000):
              # initial observation
              observation = env.reset()
              ep_r = 0

              for j in range(MAX_EP_STEPS):

                  # RL choose action based on observation
                  action = RL.choose_action(observation)
                  action = np.clip(np.random.normal(action, var), -2, 2)    # add randomness to action selection for exploration
                  # RL take action and get next observation and reward
                  observation_, reward, done, info = env.step(action)

                  # print('step:%d---episode:%d----reward:%f---action:%f' % (step, episode, reward, action))
                  memory.store_transition(observation, action, reward / 10, observation_)

                  if step > memory_size:
                      # env.render()
                      var *= .9995    # decay the action randomness
                      data = memory.sample(batch_size)
                      RL.learn(data)

                  # swap observation
                  observation = observation_
                  ep_r += reward
                  if episode > 200:
                      env.render()  # render on the screen
                  # break the loop at the end of this episode
                  if j == MAX_EP_STEPS - 1:
                      print('step: ', step,
                            'episode: ', episode,
                            'ep_r: ', round(ep_r, 2),
                            'var:', var,
                            # 'loss: ', RL.cost
                            )
                      break
                  step += 1

          # end of game
          print('game over')
          env.close()


      def main():

          run()


      if __name__ == '__main__':
          main()
          # run2()
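
    util.Memory is not shown in this post (its implementation lives in the linked repository). For reference, a minimal replay buffer with the interface game.py relies on (store_transition, and sample returning a dict with keys 's', 'a', 'r', 's_') could look like the sketch below; this is an illustrative assumption, not the repository code:

      import numpy as np

      class Memory(object):
          """Minimal ring-buffer replay memory (illustrative sketch)."""
          def __init__(self, n_actions, n_features, memory_size=10000):
              self.memory_size = memory_size
              dim = n_features[0]
              self.s = np.zeros((memory_size, dim), dtype=np.float32)
              self.a = np.zeros((memory_size, n_actions), dtype=np.float32)
              self.r = np.zeros((memory_size,), dtype=np.float32)
              self.s_ = np.zeros((memory_size, dim), dtype=np.float32)
              self.counter = 0

          def store_transition(self, s, a, r, s_):
              i = self.counter % self.memory_size  # overwrite the oldest transition
              self.s[i], self.a[i], self.r[i], self.s_[i] = s, a, r, s_
              self.counter += 1

          def sample(self, batch_size):
              upper = min(self.counter, self.memory_size)
              idx = np.random.choice(upper, size=batch_size)
              return {'s': self.s[idx], 'a': self.a[idx],
                      'r': self.r[idx], 's_': self.s_[idx]}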
• Original post: https://www.cnblogs.com/zle1992/p/10247326.html