• Reinforcement learning applied to the game Tic-Tac-Toe


    Tic-Tac-Toe is played on a 3x3 grid: the two players take turns placing pieces, and the first to get 3 of their pieces in a line wins.

    The reference code is below; I only removed a few unused parts:

    #######################################################################
    # Copyright (C)                                                       #
    # 2016 - 2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)           #
    # 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
    # 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
    # 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
    # Permission given to modify the code as long as you keep this        #
    # declaration at the top                                              #
    #######################################################################
    ##https://www.cnblogs.com/pinard/p/9385570.html ##
    ## Reinforcement Learning (1): Model Basics ##
    
    import numpy as np
    import pickle
    
    BOARD_ROWS = 3
    BOARD_COLS = 3
    BOARD_SIZE = BOARD_ROWS * BOARD_COLS

    The State class
    Brief description: each state is identified by a custom hash value; the key routines are get_all_states (run once to enumerate every reachable state) and next_state (place one move and return the resulting state).

    class State:
        def __init__(self):
            # the board is represented by an n * n array,
            # 1 represents a chessman of the player who moves first,
            # -1 represents a chessman of another player
            # 0 represents an empty position
            self.data = np.zeros((BOARD_ROWS, BOARD_COLS))
            self.winner = None
            self.hash_val = None
            self.end = None
    
        # compute the hash value for one state, it's unique
        def hash(self):
            if self.hash_val is None:
                self.hash_val = 0
                for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                    # cells take values -1, 0, 1; map -1 to 2 so each cell is a digit in base 3
                    if i == -1:
                        i = 2
                    self.hash_val = self.hash_val * 3 + i
            return int(self.hash_val)
    
        # check whether a player has won the game, or it's a tie
        def is_end(self):
            if self.end is not None:
                return self.end
            results = []
            # check row
            for i in range(0, BOARD_ROWS):
                results.append(np.sum(self.data[i, :]))
            # check columns
            for i in range(0, BOARD_COLS):
                results.append(np.sum(self.data[:, i]))
    
            # check diagonals
            results.append(0)
            for i in range(0, BOARD_ROWS):
                results[-1] += self.data[i, i]
            results.append(0)
            for i in range(0, BOARD_ROWS):
                results[-1] += self.data[i, BOARD_ROWS - 1 - i]
    
            for result in results:
                if result == 3:
                    self.winner = 1
                    self.end = True
                    return self.end
                if result == -3:
                    self.winner = -1
                    self.end = True
                    return self.end
    
            # whether it's a tie
            sum_values = np.sum(np.abs(self.data))
            if sum_values == BOARD_ROWS * BOARD_COLS:
                self.winner = 0
                self.end = True
                return self.end
    
            # game is still going on
            self.end = False
            return self.end
    
        # @symbol: 1 or -1
        # put chessman symbol in position (i, j)
        def next_state(self, i, j, symbol):
            new_state = State()
            new_state.data = np.copy(self.data)
            new_state.data[i, j] = symbol
            return new_state
    
        # print the board
        def print(self):
            for i in range(0, BOARD_ROWS):
                print('-------------')
                out = '| '
                for j in range(0, BOARD_COLS):
                    if self.data[i, j] == 1:
                        token = '*'
                    elif self.data[i, j] == 0:
                        token = '0'
                    elif self.data[i, j] == -1:
                        token = 'x'
                    out += token + ' | '
                print(out)
            print('-------------')
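
    To make the hash concrete: it is just a base-3 encoding of the board read row by row. A minimal sketch I added for illustration (not part of the original code):

    s = State()
    s = s.next_state(0, 0, 1)    # the first player takes the top-left corner
    s = s.next_state(1, 1, -1)   # the second player takes the center
    # row-major digits: 1,0,0,0,2,0,0,0,0 -> 1*3**8 + 2*3**4 = 6561 + 162
    print(s.hash())  # 6723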
    
    def get_all_states_impl(current_state, current_symbol, all_states):
        '''
        all_states: a dict keyed by the state's hash value; each value is a (state, is_end) tuple
        '''
        for i in range(0, BOARD_ROWS):
            for j in range(0, BOARD_COLS):
                if current_state.data[i][j] == 0:
                    newState = current_state.next_state(i, j, current_symbol)
                    newHash = newState.hash()
                    if newHash not in all_states:
                        isEnd = newState.is_end()
                        all_states[newHash] = (newState, isEnd)
                        # if the game is not over, the other player moves next
                        if not isEnd:
                            get_all_states_impl(newState, -current_symbol, all_states)
    
    def get_all_states():
        current_symbol = 1
        current_state = State()
        all_states = dict()
        all_states[current_state.hash()] = (current_state, current_state.is_end())
        get_all_states_impl(current_state, current_symbol, all_states)
        return all_states
    
    # all possible board configurations
    all_states = get_all_states()
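
    Since get_all_states_impl only recurses into positions it has not seen before and stops expanding terminal boards, the dictionary enumerates each reachable position exactly once. As a quick sanity check (my addition, not in the original code), its size should be 5478, the number of legal Tic-Tac-Toe positions reachable from the empty board:

    print(len(all_states))  # expected: 5478 reachable states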

    The Judger class: the referee that makes the players move in turn. Its key methods are alternate (switch whose turn it is) and play (run one game; the important call inside play is each player's act method, described later).

    class Judger:
        # @player1: the player who will move first, its chessman will be 1
        # @player2: another player with a chessman -1
        def __init__(self, player1, player2):
            self.p1 = player1
            self.p2 = player2
            self.p1_symbol = 1
            self.p2_symbol = -1
            self.p1.set_symbol(self.p1_symbol)
            self.p2.set_symbol(self.p2_symbol)
            self.current_state = State()
    
        def reset(self):
            self.p1.reset()
            self.p2.reset()
    
        def alternate(self):
            while True:
                yield self.p1
                yield self.p2
    
        # @print_state: if True, print each board during the game
        def play(self, print_state=False):
            alternator = self.alternate()
            self.reset()
            current_state = self.current_state
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            while True:
                player = next(alternator)
                if print_state:
                    current_state.print()
                [i, j, symbol] = player.act()
                next_state_hash = current_state.next_state(i, j, symbol).hash()
                current_state, is_end = all_states[next_state_hash]
                self.p1.set_state(current_state)
                self.p2.set_state(current_state)
                if is_end:
                    if print_state:
                        current_state.print()
                    return current_state.winner
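
    Note that alternate() is an infinite generator, so next(alternator) inside play() keeps switching between the two players until the game ends. A minimal standalone sketch of that pattern (my illustration, not part of the original code):

    def take_turns():
        while True:
            yield 'player 1'
            yield 'player 2'

    g = take_turns()
    print([next(g) for _ in range(4)])  # ['player 1', 'player 2', 'player 1', 'player 2']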

    The AI player: estimations holds a value for every state and is used to choose the next state; the greedy list marks which moves were greedy, because random (exploratory) moves do not take part in the value updates.
    The key methods are set_symbol (initialize each state's value for this player), backup (update the state values: if the next state is worth more, raise the current state's value too, i.e. propagate long-term outcomes back to the present), and act (choose the coordinates of the next move).
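
    Concretely, backup performs the temporal-difference update V(S_t) <- V(S_t) + step_size * (V(S_t+1) - V(S_t)), applied only to greedy moves. A tiny numeric sketch of one update (my illustration, not from the original code):

    # a state currently valued 0.5 whose successor is valued 1.0
    step_size = 0.1
    v_current, v_next = 0.5, 1.0
    v_current += step_size * (v_next - v_current)
    print(v_current)  # 0.55: the estimate moves one step toward the successor's value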

    class Player:
        # @step_size: the step size to update estimations
        # @epsilon: the probability to explore
        def __init__(self, step_size=0.1, epsilon=0.1):
            self.estimations = dict()
            self.step_size = step_size
            self.epsilon = epsilon
            self.states = []
            self.greedy = []
    
        def reset(self):
            self.states = []
            self.greedy = []
    
        def set_state(self, state):
            self.states.append(state)
            self.greedy.append(True)
    
        def set_symbol(self, symbol):
            self.symbol = symbol
            # initialize the state values: a win is worth 1, a loss 0, a tie 0.5, and non-terminal states start at 0.5
            for hash_val in all_states:
                (state, is_end) = all_states[hash_val]
                if is_end:
                    if state.winner == self.symbol:
                        self.estimations[hash_val] = 1.0
                    elif state.winner == 0:
                        # we need to distinguish between a tie and a lose
                        self.estimations[hash_val] = 0.5
                    else:
                        self.estimations[hash_val] = 0
                else:
                    self.estimations[hash_val] = 0.5
    
        # update value estimation
        def backup(self):
            # for debug
            # print('player trajectory')
            # for state in self.states:
            #     state.print()
    
            self.states = [state.hash() for state in self.states]
            # update in reverse order over this game's trajectory
            for i in reversed(range(len(self.states) - 1)):
                state = self.states[i]
                td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
                self.estimations[state] += self.step_size * td_error
    
        # choose an action based on the state
        def act(self):
            # the current (most recent) state
            state = self.states[-1]
            # hash values of the possible next states
            next_states = []
            # coordinates of the possible next moves
            next_positions = []
            for i in range(BOARD_ROWS):
                for j in range(BOARD_COLS):
                    if state.data[i, j] == 0:
                        next_positions.append([i, j])
                        next_states.append(state.next_state(i, j, self.symbol).hash())
            # with small probability epsilon, explore with a random move
            if np.random.rand() < self.epsilon:
                action = next_positions[np.random.randint(len(next_positions))]
                action.append(self.symbol)
                # mark this move as non-greedy: random moves do not take part in value updates
                self.greedy[-1] = False
                return action
            # otherwise act greedily: pick the next state with the highest estimated value
            values = []
            for hash_val, pos in zip(next_states, next_positions):
                values.append((self.estimations[hash_val], pos))
            values.sort(key=lambda x: x[0], reverse=True)
            action = values[0][1]
            action.append(self.symbol)
            return action
    
        def save_policy(self):
            with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
                pickle.dump(self.estimations, f)
    
        def load_policy(self):
            with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
                self.estimations = pickle.load(f)

    The human player: the act method asks a human for the move.

    # human interface
    # input a number to put a chessman
    # | q | w | e |
    # | a | s | d |
    # | z | x | c |
    class HumanPlayer:
        def __init__(self, **kwargs):
            self.symbol = None
            self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
            self.state = None
            return
    
        def reset(self):
            return
    
        def set_state(self, state):
            self.state = state
    
        def set_symbol(self, symbol):
            self.symbol = symbol
            return
    
        def backup(self, _):
            return
    
        def act(self):
            self.state.print()
            key = input("Input your position:")
            data = self.keys.index(key)
            i = data // BOARD_COLS
            j = data % BOARD_COLS
            return (i, j, self.symbol)
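
    The key mapping follows the keyboard layout in the comment above: for example 's' has index 4, which maps to row 1, column 1, the center cell. A quick check (my illustration, not part of the original code):

    keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
    data = keys.index('s')        # 4
    print(data // 3, data % 3)    # 1 1 -> the center of the board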

    Training:

    def train(epochs):
        player1 = Player(epsilon=0.01)
        player2 = Player(epsilon=0.01)
        judger = Judger(player1, player2)
        player1_win = 0.0
        player2_win = 0.0
        for i in range(1, epochs + 1):
            winner = judger.play(print_state=False)
            if winner == 1:
                player1_win += 1
            if winner == -1:
                player2_win += 1
            # print both players' win rates; by the end the games are basically all ties
            if i % 100 == 0:
                print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
            player1.backup()
            player2.backup()
        # save the state values; what training really learns is each state's value for each player
        player1.save_policy()
        player2.save_policy()

    AI self-play evaluation:

    def compete(turns):
        # no random (exploratory) moves allowed
        player1 = Player(epsilon=0)
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player1.load_policy()
        player2.load_policy()
        player1_win = 0.0
        player2_win = 0.0
        for i in range(0, turns):
            winner = judger.play()
            if winner == 1:
                player1_win += 1
            if winner == -1:
                player2_win += 1
            #judger.reset()
        print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))

    Human vs. AI:

    def play():
        while True:
            player1 = HumanPlayer()
            player2 = Player(epsilon=0)
            judger = Judger(player1, player2)
            player2.load_policy()
            winner = judger.play()
            if winner == player2.symbol:
                print("You lose!")
            elif winner == player1.symbol:
                print("You win!")
            else:
                print("It is a tie!")

    Start!

    if __name__ == '__main__':
        train(int(1e4))
        compete(int(1e3))
        play()
    After training, the record is "Epoch 10000, player 1 win 0.08, player 2 win 0.03",
    because a small amount of exploration (epsilon = 0.01, i.e. 1%) still happens during training.
    The self-play test removes that randomness, and the result is "1000 turns, player 1 win 0.00, player 2 win 0.00":
    every game ends in a tie.
    Last comes the human-vs-AI match, and you simply cannot beat this AI.