• Deep learning with the TensorFlow framework: training an AI to play Flappy Bird with DQN


    The title is a bit of a mouthful; bear with me.

    Today I ran experiment 17 of my TensorFlow series. First, a look at how it turned out:

    Probably because my laptop is badly underpowered (only 4 GB of RAM and a very weak GPU), the results were not as good as I had hoped.

    But it does run; the performance is just a little worse.

    The process is as follows.

    First, set up the environment:

    Python, Anaconda3, and the TensorFlow framework all need to be installed first; a quick sanity check of the setup is sketched below.
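
    As a quick sanity check of the environment (a minimal sketch of my own, not part of the original project; it assumes opencv-python and pygame are installed alongside TensorFlow, since the code below depends on them), you can confirm that the main dependencies import and report their versions:

    import sys
    import numpy as np
    import cv2                      # opencv-python, used for frame preprocessing
    import pygame                   # used by the wrapped Flappy Bird game
    import tensorflow as tf

    print("Python    :", sys.version.split()[0])
    print("NumPy     :", np.__version__)
    print("OpenCV    :", cv2.__version__)
    print("pygame    :", pygame.version.ver)
    print("TensorFlow:", tf.__version__)
    print("GPUs      :", tf.config.list_physical_devices("GPU"))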

    The code:

    Main script:

    # -------------------------
    # Project: Deep Q-Learning on Flappy Bird
    # Author: Flood Sung
    # Date: 2016.3.21
    # -------------------------

    import cv2
    import sys
    sys.path.append("game/")
    import wrapped_flappy_bird as game
    from BrainDQN_Nature import BrainDQN
    import numpy as np

    # preprocess raw image to 80*80 gray image
    def preprocess(observation):
        observation = cv2.cvtColor(cv2.resize(observation, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
        return np.reshape(observation, (80, 80, 1))

    def playFlappyBird():
        # Step 1: init BrainDQN
        actions = 2
        brain = BrainDQN(actions)
        # Step 2: init Flappy Bird Game
        flappyBird = game.GameState()
        # Step 3: play game
        # Step 3.1: obtain init state
        action0 = np.array([1, 0])  # do nothing
        observation0, reward0, terminal = flappyBird.frame_step(action0)
        observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, observation0 = cv2.threshold(observation0, 1, 255, cv2.THRESH_BINARY)
        brain.setInitState(observation0)

        # Step 3.2: run the game
        while True:
            action = brain.getAction()
            nextObservation, reward, terminal = flappyBird.frame_step(action)
            nextObservation = preprocess(nextObservation)
            brain.setPerception(nextObservation, action, reward, terminal)

    def main():
        playFlappyBird()

    if __name__ == '__main__':
        main()
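
    If you want to sanity-check the preprocessing on its own (a standalone sketch of mine using a random dummy frame instead of a real game screenshot), the shapes come out like this:

    import cv2
    import numpy as np

    # stand-in for a raw BGR game frame
    dummy = np.random.randint(0, 255, (512, 288, 3), dtype=np.uint8)

    gray = cv2.cvtColor(cv2.resize(dummy, (80, 80)), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    frame = np.reshape(binary, (80, 80, 1))

    print(frame.shape)        # (80, 80, 1) -- one channel per stacked frame
    print(np.unique(binary))  # only values from {0, 255} remain after binarization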



    Module 1 (BrainDQN_Nature.py, the version with a separate target network):
    # -----------------------------
    # File: Deep Q-Learning Algorithm
    # Author: Flood Sung
    # Date: 2016.3.21
    # -----------------------------

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
    import numpy as np
    import random
    from collections import deque

    # Hyper Parameters:
    FRAME_PER_ACTION = 1
    GAMMA = 0.99 # decay rate of past observations
    OBSERVE = 100. # timesteps to observe before training
    EXPLORE = 200000. # frames over which to anneal epsilon
    FINAL_EPSILON = 0#0.001 # final value of epsilon
    INITIAL_EPSILON = 0#0.01 # starting value of epsilon
    REPLAY_MEMORY = 50000 # number of previous transitions to remember
    BATCH_SIZE = 32 # size of minibatch
    UPDATE_TIME = 100

    try:
        tf.mul
    except AttributeError:
        # tf.mul has been removed in newer versions of TensorFlow;
        # fall back to tf.multiply
        tf.mul = tf.multiply

    class BrainDQN:

        def __init__(self, actions):
            # init replay memory
            self.replayMemory = deque()
            # init some parameters
            self.timeStep = 0
            self.epsilon = INITIAL_EPSILON
            self.actions = actions
            # init Q network
            self.stateInput, self.QValue, self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2, self.W_conv3, self.b_conv3, self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2 = self.createQNetwork()

            # init Target Q Network
            self.stateInputT, self.QValueT, self.W_conv1T, self.b_conv1T, self.W_conv2T, self.b_conv2T, self.W_conv3T, self.b_conv3T, self.W_fc1T, self.b_fc1T, self.W_fc2T, self.b_fc2T = self.createQNetwork()

            self.copyTargetQNetworkOperation = [
                self.W_conv1T.assign(self.W_conv1), self.b_conv1T.assign(self.b_conv1),
                self.W_conv2T.assign(self.W_conv2), self.b_conv2T.assign(self.b_conv2),
                self.W_conv3T.assign(self.W_conv3), self.b_conv3T.assign(self.b_conv3),
                self.W_fc1T.assign(self.W_fc1), self.b_fc1T.assign(self.b_fc1),
                self.W_fc2T.assign(self.W_fc2), self.b_fc2T.assign(self.b_fc2)]

            self.createTrainingMethod()

            # saving and loading networks
            self.saver = tf.train.Saver()
            self.session = tf.InteractiveSession()
            self.session.run(tf.global_variables_initializer())
            checkpoint = tf.train.get_checkpoint_state("saved_networks")
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.session, checkpoint.model_checkpoint_path)
                print("Successfully loaded:", checkpoint.model_checkpoint_path)
            else:
                print("Could not find old network weights")

        def createQNetwork(self):
            # network weights
            W_conv1 = self.weight_variable([8, 8, 4, 32])
            b_conv1 = self.bias_variable([32])

            W_conv2 = self.weight_variable([4, 4, 32, 64])
            b_conv2 = self.bias_variable([64])

            W_conv3 = self.weight_variable([3, 3, 64, 64])
            b_conv3 = self.bias_variable([64])

            W_fc1 = self.weight_variable([1600, 512])
            b_fc1 = self.bias_variable([512])

            W_fc2 = self.weight_variable([512, self.actions])
            b_fc2 = self.bias_variable([self.actions])

            # input layer
            stateInput = tf.placeholder("float", [None, 80, 80, 4])

            # hidden layers
            h_conv1 = tf.nn.relu(self.conv2d(stateInput, W_conv1, 4) + b_conv1)
            h_pool1 = self.max_pool_2x2(h_conv1)

            h_conv2 = tf.nn.relu(self.conv2d(h_pool1, W_conv2, 2) + b_conv2)

            h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)

            h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])
            h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

            # Q Value layer
            QValue = tf.matmul(h_fc1, W_fc2) + b_fc2

            return stateInput, QValue, W_conv1, b_conv1, W_conv2, b_conv2, W_conv3, b_conv3, W_fc1, b_fc1, W_fc2, b_fc2

        def copyTargetQNetwork(self):
            self.session.run(self.copyTargetQNetworkOperation)

        def createTrainingMethod(self):
            self.actionInput = tf.placeholder("float", [None, self.actions])
            self.yInput = tf.placeholder("float", [None])
            Q_Action = tf.reduce_sum(tf.mul(self.QValue, self.actionInput), reduction_indices=1)
            self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
            self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost)


        def trainQNetwork(self):
            # Step 1: obtain random minibatch from replay memory
            minibatch = random.sample(self.replayMemory, BATCH_SIZE)
            state_batch = [data[0] for data in minibatch]
            action_batch = [data[1] for data in minibatch]
            reward_batch = [data[2] for data in minibatch]
            nextState_batch = [data[3] for data in minibatch]

            # Step 2: calculate y from the target network
            y_batch = []
            QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT: nextState_batch})
            for i in range(0, BATCH_SIZE):
                terminal = minibatch[i][4]
                if terminal:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i]))

            self.trainStep.run(feed_dict={
                self.yInput: y_batch,
                self.actionInput: action_batch,
                self.stateInput: state_batch
            })

            # save network every 10000 iterations
            if self.timeStep % 10000 == 0:
                self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step=self.timeStep)

            # sync the target network every UPDATE_TIME iterations
            if self.timeStep % UPDATE_TIME == 0:
                self.copyTargetQNetwork()

        def setPerception(self, nextObservation, action, reward, terminal):
            # drop the oldest frame and append the newest one
            #newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2)
            newState = np.append(self.currentState[:, :, 1:], nextObservation, axis=2)
            self.replayMemory.append((self.currentState, action, reward, newState, terminal))
            if len(self.replayMemory) > REPLAY_MEMORY:
                self.replayMemory.popleft()
            if self.timeStep > OBSERVE:
                # Train the network
                self.trainQNetwork()

            # print info
            state = ""
            if self.timeStep <= OBSERVE:
                state = "observe"
            elif self.timeStep > OBSERVE and self.timeStep <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"

            print("TIMESTEP", self.timeStep, "/ STATE", state,
                  "/ EPSILON", self.epsilon)

            self.currentState = newState
            self.timeStep += 1

        def getAction(self):
            QValue = self.QValue.eval(feed_dict={self.stateInput: [self.currentState]})[0]
            action = np.zeros(self.actions)
            action_index = 0
            if self.timeStep % FRAME_PER_ACTION == 0:
                if random.random() <= self.epsilon:
                    # explore: pick a random action
                    action_index = random.randrange(self.actions)
                    action[action_index] = 1
                else:
                    # exploit: pick the action with the highest Q value
                    action_index = np.argmax(QValue)
                    action[action_index] = 1
            else:
                action[0] = 1  # do nothing

            # anneal epsilon
            if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
                self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            return action

        def setInitState(self, observation):
            # stack the first frame four times to form the initial 80x80x4 state
            self.currentState = np.stack((observation, observation, observation, observation), axis=2)

        def weight_variable(self, shape):
            initial = tf.random.truncated_normal(shape, stddev=0.01)
            return tf.Variable(initial)

        def bias_variable(self, shape):
            initial = tf.constant(0.01, shape=shape)
            return tf.Variable(initial)

        def conv2d(self, x, W, stride):
            return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="SAME")

        def max_pool_2x2(self, x):
            return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
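
    A detail worth calling out from the code above: setInitState stacks the same preprocessed frame four times to build the 80x80x4 network input, and setPerception then slides that window by dropping the oldest channel and appending the newest frame. A small standalone sketch with dummy NumPy arrays (no TensorFlow or game required) shows the shapes involved:

    import numpy as np

    obs0 = np.zeros((80, 80))                  # first preprocessed frame (no channel dim)
    state = np.stack((obs0, obs0, obs0, obs0), axis=2)
    print(state.shape)                         # (80, 80, 4)

    next_obs = np.ones((80, 80, 1))            # later frames keep the trailing channel dim
    new_state = np.append(state[:, :, 1:], next_obs, axis=2)
    print(new_state.shape)                     # (80, 80, 4)
    print(new_state[0, 0])                     # [0. 0. 0. 1.] -- oldest dropped, newest appended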



    Module 2 (an alternative BrainDQN implementation without the separate target network):
    # -----------------------------
    # File: Deep Q-Learning Algorithm
    # Author: Flood Sung
    # Date: 2016.3.21
    # -----------------------------

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
    import numpy as np
    import random
    from collections import deque

    # Hyper Parameters:
    FRAME_PER_ACTION = 1
    GAMMA = 0.99 # decay rate of past observations
    OBSERVE = 100. # timesteps to observe before training
    EXPLORE = 150000. # frames over which to anneal epsilon
    FINAL_EPSILON = 0.0 # final value of epsilon
    INITIAL_EPSILON = 0.9 # starting value of epsilon
    REPLAY_MEMORY = 50000 # number of previous transitions to remember
    BATCH_SIZE = 32 # size of minibatch

    class BrainDQN:

        def __init__(self, actions):
            # init replay memory
            self.replayMemory = deque()
            # init some parameters
            self.timeStep = 0
            self.epsilon = INITIAL_EPSILON
            self.actions = actions
            # init Q network
            self.createQNetwork()

        def createQNetwork(self):
            # network weights
            W_conv1 = self.weight_variable([8, 8, 4, 32])
            b_conv1 = self.bias_variable([32])

            W_conv2 = self.weight_variable([4, 4, 32, 64])
            b_conv2 = self.bias_variable([64])

            W_conv3 = self.weight_variable([3, 3, 64, 64])
            b_conv3 = self.bias_variable([64])

            W_fc1 = self.weight_variable([1600, 512])
            b_fc1 = self.bias_variable([512])

            W_fc2 = self.weight_variable([512, self.actions])
            b_fc2 = self.bias_variable([self.actions])

            # input layer
            self.stateInput = tf.placeholder("float", [None, 80, 80, 4])

            # hidden layers
            h_conv1 = tf.nn.relu(self.conv2d(self.stateInput, W_conv1, 4) + b_conv1)
            h_pool1 = self.max_pool_2x2(h_conv1)

            h_conv2 = tf.nn.relu(self.conv2d(h_pool1, W_conv2, 2) + b_conv2)

            h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, 1) + b_conv3)

            h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])
            h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

            # Q Value layer
            self.QValue = tf.matmul(h_fc1, W_fc2) + b_fc2

            # training method (tf.mul was removed in newer TensorFlow, so use tf.multiply)
            self.actionInput = tf.placeholder("float", [None, self.actions])
            self.yInput = tf.placeholder("float", [None])
            Q_action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), reduction_indices=1)
            self.cost = tf.reduce_mean(tf.square(self.yInput - Q_action))
            self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost)

            # saving and loading networks
            self.saver = tf.train.Saver()
            self.session = tf.InteractiveSession()
            self.session.run(tf.global_variables_initializer())
            checkpoint = tf.train.get_checkpoint_state("saved_networks")
            if checkpoint and checkpoint.model_checkpoint_path:
                self.saver.restore(self.session, checkpoint.model_checkpoint_path)
                print("Successfully loaded:", checkpoint.model_checkpoint_path)
            else:
                print("Could not find old network weights")

        def trainQNetwork(self):
            # Step 1: obtain random minibatch from replay memory
            minibatch = random.sample(self.replayMemory, BATCH_SIZE)
            state_batch = [data[0] for data in minibatch]
            action_batch = [data[1] for data in minibatch]
            reward_batch = [data[2] for data in minibatch]
            nextState_batch = [data[3] for data in minibatch]

            # Step 2: calculate y
            y_batch = []
            QValue_batch = self.QValue.eval(feed_dict={self.stateInput: nextState_batch})
            for i in range(0, BATCH_SIZE):
                terminal = minibatch[i][4]
                if terminal:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i]))

            self.trainStep.run(feed_dict={
                self.yInput: y_batch,
                self.actionInput: action_batch,
                self.stateInput: state_batch
            })

            # save network every 10000 iterations
            if self.timeStep % 10000 == 0:
                self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step=self.timeStep)

        def setPerception(self, nextObservation, action, reward, terminal):
            # drop the oldest frame and append the newest one
            #newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2)
            newState = np.append(self.currentState[:, :, 1:], nextObservation, axis=2)
            self.replayMemory.append((self.currentState, action, reward, newState, terminal))
            if len(self.replayMemory) > REPLAY_MEMORY:
                self.replayMemory.popleft()
            if self.timeStep > OBSERVE:
                # Train the network
                self.trainQNetwork()

            self.currentState = newState
            self.timeStep += 1

        def getAction(self):
            QValue = self.QValue.eval(feed_dict={self.stateInput: [self.currentState]})[0]
            action = np.zeros(self.actions)
            action_index = 0
            if self.timeStep % FRAME_PER_ACTION == 0:
                if random.random() <= self.epsilon:
                    # explore: pick a random action
                    action_index = random.randrange(self.actions)
                    action[action_index] = 1
                else:
                    # exploit: pick the action with the highest Q value
                    action_index = np.argmax(QValue)
                    action[action_index] = 1
            else:
                action[0] = 1  # do nothing

            # anneal epsilon
            if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
                self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            return action

        def setInitState(self, observation):
            # stack the first frame four times to form the initial 80x80x4 state
            self.currentState = np.stack((observation, observation, observation, observation), axis=2)

        def weight_variable(self, shape):
            initial = tf.truncated_normal(shape, stddev=0.01)
            return tf.Variable(initial)

        def bias_variable(self, shape):
            initial = tf.constant(0.01, shape=shape)
            return tf.Variable(initial)

        def conv2d(self, x, W, stride):
            return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="SAME")

        def max_pool_2x2(self, x):
            return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
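
    For reference, the training target y built in trainQNetwork (in both modules) is the standard one-step Bellman target: y = r for terminal transitions, and y = r + GAMMA * max_a Q(s', a) otherwise. A tiny worked example with made-up numbers (purely illustrative):

    import numpy as np

    GAMMA = 0.99
    reward = 1.0                      # e.g. the bird just passed a pipe
    next_q = np.array([0.42, 0.37])   # hypothetical Q-values for the next state
    terminal = False

    y = reward if terminal else reward + GAMMA * np.max(next_q)
    print(y)                          # 1.4158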


    With these three main pieces of code, plus some helper code (the wrapped Flappy Bird environment under game/ that the main script imports), the project can be run; a quick look at the exploration schedule is sketched below.
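
    One thing to keep in mind before starting a long run: epsilon is annealed linearly from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE frames once the OBSERVE phase ends. (Module 1 as posted sets both to 0, i.e. no exploration, which really only makes sense when loading an already-trained checkpoint.) A quick check with Module 2's constants, pure arithmetic:

    INITIAL_EPSILON = 0.9
    FINAL_EPSILON = 0.0
    EXPLORE = 150000.0
    OBSERVE = 100.0

    decrement = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    print(decrement)           # 6e-06 subtracted from epsilon at every timestep after OBSERVE
    print(OBSERVE + EXPLORE)   # 150100.0 -- timesteps until epsilon reaches FINAL_EPSILON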





• Original post: https://www.cnblogs.com/092e/p/16183633.html