• (6) Value Function Approximation - LSPI code (3)


    This post covers policy.py.

    # -*- coding: utf-8 -*-
    """LSPI Policy class used for learning and executing policy."""

    import random

    import numpy as np


    class Policy(object):

        r"""Represents LSPI policy. Used for sampling, learning, and executing.
        # The LSPI policy object is used for sampling, learning, and execution.
        The policy class includes an exploration value which controls the
        probability of performing a random action instead of the best action
        according to the policy. This can be useful during sampling.

        It also includes the discount factor :math:`gamma`, number of possible
        actions and the basis function used for this policy.
        # The policy holds an exploration value that controls the probability of
        # taking a random action instead of the best action under the policy,
        # so it is not purely greedy. This is useful when collecting samples.

        # It also holds a discount factor gamma, the number of possible actions,
        # and the basis function used by this policy.

        # Parameters
        Parameters
        ----------
        basis: BasisFunction  # basis function: produces phi, used to pick actions under the policy
            The basis function used to compute :math:`phi` which is used to select
            the best action according to the policy
        discount: float, optional  # discount factor
            The discount factor :math:`gamma`. Defaults to 1.0 which is valid
            for finite horizon problems.
        explore: float, optional  # exploration probability
            Probability of executing a random action instead of the best action
            according to the policy. Defaults to 0 which is no exploration.
        weights: numpy.array or None  # weight vector; dotted with phi to give the Q value
            The weight vector which is dotted with the :math:`phi` vector from
            basis to produce the approximate Q value. When None is passed in
            the weight vector is initialized with random weights.
        tie_breaking_strategy: Policy.TieBreakingStrategy value
            The strategy to use if a tie occurs when selecting the best action.
            See the :py:class:`lspi.policy.Policy.TieBreakingStrategy`
            class description for what the different options are.

        Raises  # the errors the constructor can raise
        ------
        ValueError
            If discount is < 0 or > 1
        ValueError
            If explore is < 0 or > 1
        ValueError
            If weights are not None and the number of dimensions does not match
            the size of the basis function.
        """

        class TieBreakingStrategy(object):  # a class defined inside the class (a nested class)

            """Strategy for breaking a tie between actions in the policy.
            # Strategy for resolving a tie, i.e. several actions sharing the same best Q value.
            FirstWins:  # return the first action encountered with that value
                In the event of a tie the first action encountered with that
                value is returned.
            LastWins:  # return the last action encountered with that value
                In the event of a tie the last action encountered with that
                value is returned.
            RandomWins:  # return a random action among those with that value
                In the event of a tie a random action encountered with that
                value is returned.

            """

            FirstWins, LastWins, RandomWins = range(3)  # range(3) unpacks to 0, 1, 2: each strategy is just an integer constant

        def __init__(self, basis, discount=1.0,
                     explore=0.0, weights=None,
                     tie_breaking_strategy=TieBreakingStrategy.RandomWins):
            """Initialize a Policy."""
            self.basis = basis  # store the basis function

            if discount < 0.0 or discount > 1.0:  # validate the discount factor
                raise ValueError('discount must be in range [0, 1]')

            self.discount = discount  # store the discount factor

            if explore < 0.0 or explore > 1.0:  # validate the exploration probability
                raise ValueError('explore must be in range [0, 1]')

            self.explore = explore  # store the exploration probability

            if weights is None:  # weight initialization
                self.weights = np.random.uniform(-1.0, 1.0, size=(basis.size(),))
            else:
                if weights.shape != (basis.size(), ):
                    raise ValueError('weights shape must equal (basis.size(), )')
                self.weights = weights

            self.tie_breaking_strategy = tie_breaking_strategy  # store the tie-breaking strategy

        def __copy__(self):  # copy the policy
            """Return a copy of this class with a deep copy of the weights."""
            return Policy(self.basis,
                          self.discount,
                          self.explore,
                          self.weights.copy(),
                          self.tie_breaking_strategy)

        def calc_q_value(self, state, action):  # compute the Q value from a state and an action
            """Calculate the Q function for the given state action pair.
            # Computes the Q value for the given state-action pair.
            Parameters  # parameters
            ----------
            state: numpy.array  # state vector
                State vector that Q value is being calculated for. This is
                the s in Q(s, a)  # the s in the Q function
            action: int  # the action taken
                Action index that Q value is being calculated for. This is
                the a in Q(s, a)  # the a in the Q function

            Return  # return value
            ------
            float  # the Q value
                The Q value for the state action pair

            Raises  # errors this method can raise
            ------
            ValueError
                If state's dimensions do not conform to basis function expectations
            ValueError
                If action is outside of the range of valid action indexes

            """
            if action < 0 or action >= self.basis.num_actions:  # validate the action index
                raise IndexError('action must be in range [0, num_actions)')

            return self.weights.dot(self.basis.evaluate(state, action))  # dot the weights with the feature vector returned by the basis

        def best_action(self, state):  # return the best action for a given state
            """Select the best action according to the policy.
            # Selects the best action under the current policy.
            This calculates argmax_a Q(state, a). In other words it returns
            the action that maximizes the Q value for this state.

            Parameters
            ----------  # inputs
            state: numpy.array  # state vector
                State vector.
            tie_breaking_strategy: TieBreakingStrategy value  # tie-breaking strategy
                In the event of a tie specifies which action the policy should
                return. (Defaults to random)  # specifies which action the policy returns on a tie

            Returns  # return value
            -------
            int  # action index
                Action index

            Raises  # errors this method can raise
            ------
            ValueError
                If state's dimensions do not match basis function's expectations.

            """
            # Loop over all actions: the Q value of each action is computed by a dot
            # product and collected into a list.
            q_values = [self.calc_q_value(state, action)
                        for action in range(self.basis.num_actions)]  # note the list-comprehension idiom

            best_q = float('-inf')  # initialize the best Q value
            best_actions = []  # initialize the list of best actions
            for action, q_value in enumerate(q_values):  # each index in q_values is an action!
                if q_value > best_q:  # the current q_value beats the best so far
                    best_actions = [action]  # reset the best action list and update the best Q value
                    best_q = q_value
                elif q_value == best_q:  # it ties with the current best, which also needs handling
                    best_actions.append(action)  # on a tie, append the action to best_actions

                # In other words, the tie-breaking strategy only matters when several
                # actions produce the same maximal Q value.
            if self.tie_breaking_strategy == Policy.TieBreakingStrategy.FirstWins:
                return best_actions[0]
            elif self.tie_breaking_strategy == Policy.TieBreakingStrategy.LastWins:
                return best_actions[-1]
            else:
                return random.choice(best_actions)

        def select_action(self, state):  # best_action is greedy; this is epsilon-greedy, taking a random action with some probability
            """With random probability select best action or random action.

            If the random number is below the explore value then pick a random
            value otherwise pick the best action according to the basis and
            policy weights.

            Parameters
            ----------
            state: numpy.array
                State vector

            Returns
            -------
            int
                Action index

            Raises
            ------
            ValueError
                If state's dimensions do not match basis function's expectations.

            """
            if random.random() < self.explore:
                return random.choice(range(self.basis.num_actions))
            else:
                return self.best_action(state)

        @property  # number of available actions
        def num_actions(self):
            r"""Return number of possible actions.

            This number should always match the value stored in basis.num_actions.

            Return
            ------
            int
                Number of possible actions. In range [1, :math:`infty`)

            """
            return self.basis.num_actions

        @num_actions.setter
        def num_actions(self, value):
            """Set the number of possible actions.

            This number should always match the value stored in basis.num_actions.

            Parameters
            ----------
            value: int
                Value to set num_actions to. Must be >= 1

            Raises
            ------
            ValueError
                If value is < 1

            """
            self.basis.num_actions = value
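
    To see how these pieces fit together, here is a minimal usage sketch (my own illustration, not part of the library). It assumes a hypothetical OneHotStateActionBasis that implements the small interface Policy relies on: size(), a num_actions attribute, and evaluate(state, action); Policy is imported from lspi.policy, the module path the docstrings above reference. The sketch shows that Q(s, a) is just the dot product of the weights with phi(s, a), that best_action takes the argmax over actions, and that select_action adds epsilon-greedy exploration on top.

        """Minimal sketch of using Policy with a hand-rolled basis (hypothetical)."""

        import numpy as np

        from lspi.policy import Policy


        class OneHotStateActionBasis(object):
            """Hypothetical basis: one indicator feature per (state, action) pair."""

            def __init__(self, num_states, num_actions):
                self.num_states = num_states
                self.num_actions = num_actions  # Policy reads this attribute

            def size(self):
                # Length of phi; Policy uses it to size the weight vector.
                return self.num_states * self.num_actions

            def evaluate(self, state, action):
                # state is a 1-element array holding the state index.
                phi = np.zeros(self.size())
                phi[state[0] * self.num_actions + action] = 1.0
                return phi


        basis = OneHotStateActionBasis(num_states=4, num_actions=2)
        policy = Policy(basis, discount=0.9, explore=0.1)  # weights start random

        state = np.array([2])
        print(policy.calc_q_value(state, 0))  # weights dotted with phi(s, 0)
        print(policy.best_action(state))      # argmax_a Q(s, a), ties broken randomly
        print(policy.select_action(state))    # random action with probability 0.1, else best_action

    In real use the weights would come out of the LSPI learning step rather than being left at their random initialization; the sketch only exercises the Policy interface quoted above.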
• Original post: https://www.cnblogs.com/lijiajun/p/5488482.html