• Deep Reinforcement Learning -- A3C


    A3C

     A3C creates multiple parallel environments and lets several agents, each holding a worker copy of the network, update the parameters of the global (master) network on these environments at the same time. The parallel agents do not interfere with one another, and because the workers submit their updates to the global network asynchronously and at irregular times, the correlation between successive updates is reduced and convergence improves.
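     For reference, the quantities that the listing below optimizes can be written out as follows (the notation is mine, not from the original post; the entropy coefficient 0.01 and the discount factor \gamma = 0.99 match the values used in the code):

         A_t = R_t - V(s_t), \qquad R_t = \sum_{k=0}^{T-t-1} \gamma^k r_{t+k}   (bootstrapped with the critic's value estimate when the episode is cut off at the step limit)

         L_{actor}  = -\sum_t \log \pi(a_t \mid s_t)\, A_t \;-\; 0.01 \sum_t H\big(\pi(\cdot \mid s_t)\big)

         L_{critic} = \tfrac{1}{T} \sum_t \big(R_t - V(s_t)\big)^2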

    Code Practice

import threading
import numpy as np
import tensorflow as tf
import pylab
import time
import gym
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K


# global variables for threading
episode = 0
scores = []

EPISODES = 2000

# This is the A3C (Asynchronous Advantage Actor-Critic) global agent for CartPole
class A3CAgent:
    def __init__(self, state_size, action_size, env_name):
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # get gym environment name
        self.env_name = env_name

        # these are hyper parameters for the A3C
        self.actor_lr = 0.001
        self.critic_lr = 0.001
        self.discount_factor = .99
        self.hidden1, self.hidden2 = 24, 24
        self.threads = 8  # run 8 worker threads in parallel

        # create models for the actor and critic networks
        self.actor, self.critic = self.build_model()

        # training functions for the actor and critic networks
        self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)
        self.sess.run(tf.global_variables_initializer())

    # approximate the policy and value function with neural networks
    # actor  -> input is the state, output is the probability of each action
    # critic -> input is the state, output is the value of the state
    # the actor and critic networks share the first hidden layer
    def build_model(self):
        state = Input(batch_shape=(None, self.state_size))
        shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)

        actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)
        action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)

        value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)
        state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)

        actor = Model(inputs=state, outputs=action_prob)
        critic = Model(inputs=state, outputs=state_value)

        # build the predict functions up front so the models can be used from multiple threads
        actor._make_predict_function()
        critic._make_predict_function()

        actor.summary()
        critic.summary()

        return actor, critic

    # build the training function for the policy gradient
    # [log(action probability) * advantage] is what gets backpropagated;
    # the (negative) entropy of the action distribution is added to the loss,
    # so minimizing the loss also pushes the policy towards higher entropy (more exploration)
    def actor_optimizer(self):
        action = K.placeholder(shape=(None, self.action_size))
        advantages = K.placeholder(shape=(None, ))

        policy = self.actor.output

        good_prob = K.sum(action * policy, axis=1)
        eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
        loss = -K.sum(eligibility)

        # sum_a pi * log(pi) is the negative entropy of the policy at each state
        entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)

        actor_loss = loss + 0.01*entropy

        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
        train = K.function([self.actor.input, action, advantages], [], updates=updates)
        return train

    # build the training function for the value approximation
    # (mean squared error between the discounted return and the predicted value)
    def critic_optimizer(self):
        discounted_reward = K.placeholder(shape=(None, ))

        value = self.critic.output

        loss = K.mean(K.square(discounted_reward - value))

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([self.critic.input, discounted_reward], [], updates=updates)
        return train

    # create the local agents (workers) and start training
    def train(self):
        # self.load_model('./save_model/cartpole_a3c.h5')
        # create self.threads (8) local agents that share the global actor/critic and train functions
        agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
                        self.action_size, self.state_size) for i in range(self.threads)]

        for agent in agents:
            agent.start()

        # the main thread periodically saves the score curve and the model weights
        while True:
            time.sleep(20)

            plot = scores[:]
            pylab.plot(range(len(plot)), plot, 'b')
            pylab.savefig("./save_graph/cartpole_a3c.png")

            self.save_model('./save_model/cartpole_a3c.h5')

    def save_model(self, name):
        self.actor.save_weights(name + "_actor.h5")
        self.critic.save_weights(name + "_critic.h5")

    def load_model(self, name):
        self.actor.load_weights(name + "_actor.h5")
        self.critic.load_weights(name + "_critic.h5")

# This is the local Agent (worker) class used for threading;
# each worker runs its own environment but shares the global actor/critic networks
class Agent(threading.Thread):
    def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
        threading.Thread.__init__(self)

        self.states = []
        self.rewards = []
        self.actions = []

        self.index = index
        self.actor = actor
        self.critic = critic
        self.optimizer = optimizer
        self.env_name = env_name
        self.discount_factor = discount_factor
        self.action_size = action_size
        self.state_size = state_size

    # each thread interacts with its own copy of the environment
    def run(self):
        global episode
        env = gym.make(self.env_name)
        while episode < EPISODES:
            state = env.reset()
            score = 0
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                score += reward

                self.memory(state, action, reward)

                state = next_state

                if done:
                    episode += 1
                    print("episode: ", episode, "/ score : ", score)
                    scores.append(score)
                    # a score of 500 means the episode was cut off by the step limit,
                    # so bootstrap the return from the critic instead of treating it as terminal
                    self.train_episode(score != 500)
                    break

    # In policy gradient methods the Q function is not available,
    # so the agent uses sampled returns to evaluate the policy
    def discount_rewards(self, rewards, done=True):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        if not done:
            running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    # save <s, a, r> of each step;
    # this is used for calculating the discounted rewards
    def memory(self, state, action, reward):
        self.states.append(state)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
        self.rewards.append(reward)

    # update the policy network and the value network at the end of every episode
    def train_episode(self, done):
        discounted_rewards = self.discount_rewards(self.rewards, done)

        values = self.critic.predict(np.array(self.states))
        values = np.reshape(values, len(values))

        # advantage = discounted return - value baseline
        advantages = discounted_rewards - values

        self.optimizer[0]([self.states, self.actions, advantages])
        self.optimizer[1]([self.states, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    # sample an action from the current policy
    def get_action(self, state):
        policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]


if __name__ == "__main__":
    env_name = 'CartPole-v1'
    env = gym.make(env_name)

    # probe the environment for the state/action dimensions, then close it
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    env.close()

    global_agent = A3CAgent(state_size, action_size, env_name)
    global_agent.train()
  • Original post: https://www.cnblogs.com/buyizhiyou/p/10251921.html