This is policy.py.
# -*- coding: utf-8 -*-
"""LSPI Policy class used for learning and executing policy."""

import random

import numpy as np


class Policy(object):

    r"""Represents LSPI policy. Used for sampling, learning, and executing.
    # the LSPI policy is used for sampling, learning, and execution

    The policy class includes an exploration value which controls the
    probability of performing a random action instead of the best action
    according to the policy. This can be useful during sampling.

    It also includes the discount factor :math:`\gamma`, number of possible
    actions and the basis function used for this policy.
    # the Policy class holds an exploration value controlling the probability of
    # taking a random action instead of the best action under the policy
    # i.e. it is not purely greedy
    # this is very useful during sampling

    # it also holds a discount factor gamma, the number of possible actions,
    # and the basis function used by this policy

    # parameters
    Parameters
    ----------
    basis: BasisFunction  # basis function: produces phi, used to pick actions under the policy
        The basis function used to compute :math:`\phi` which is used to select
        the best action according to the policy
    discount: float, optional  # discount factor
        The discount factor :math:`\gamma`. Defaults to 1.0 which is valid
        for finite horizon problems.
    explore: float, optional  # exploration value
        Probability of executing a random action instead of the best action
        according to the policy. Defaults to 0 which is no exploration.
    weights: numpy.array or None  # weights: dotted with phi to produce the Q value
        The weight vector which is dotted with the :math:`\phi` vector from
        basis to produce the approximate Q value. When None is passed in
        the weight vector is initialized with random weights.
    tie_breaking_strategy: Policy.TieBreakingStrategy value
        The strategy to use if a tie occurs when selecting the best action.
        See the :py:class:`lspi.policy.Policy.TieBreakingStrategy`
        class description for what the different options are.

    Raises  # error conditions
    ------
    ValueError
        If discount is < 0 or > 1
    ValueError
        If explore is < 0 or > 1
    ValueError
        If weights are not None and the number of dimensions does not match
        the size of the basis function.
    """

    class TieBreakingStrategy(object):  # a class nested inside the class

        """Strategy for breaking a tie between actions in the policy.
        # strategy for breaking ties between actions

        FirstWins:  # return the first action encountered with the tied value
            In the event of a tie the first action encountered with that
            value is returned.
        LastWins:  # return the last action encountered with the tied value
            In the event of a tie the last action encountered with that
            value is returned.
        RandomWins  # return a random action among those with the tied value
            In the event of a tie a random action encountered with that
            value is returned.

        """

        FirstWins, LastWins, RandomWins = range(3)  # enum-style constants assigned 0, 1 and 2

    def __init__(self, basis, discount=1.0,
                 explore=0.0, weights=None,
                 tie_breaking_strategy=TieBreakingStrategy.RandomWins):
        """Initialize a Policy."""
        self.basis = basis  # store the basis function

        if discount < 0.0 or discount > 1.0:  # validate the discount factor
            raise ValueError('discount must be in range [0, 1]')

        self.discount = discount  # store the discount factor

        if explore < 0.0 or explore > 1.0:  # validate the exploration value
            raise ValueError('explore must be in range [0, 1]')

        self.explore = explore  # store the exploration value

        if weights is None:  # weight initialization
            self.weights = np.random.uniform(-1.0, 1.0, size=(basis.size(),))
        else:
            if weights.shape != (basis.size(), ):
                raise ValueError('weights shape must equal (basis.size(), )')
            self.weights = weights

        self.tie_breaking_strategy = tie_breaking_strategy  # store the tie-breaking strategy

    def __copy__(self):  # copy the policy
        """Return a copy of this class with a deep copy of the weights."""
        return Policy(self.basis,
                      self.discount,
                      self.explore,
                      self.weights.copy(),
                      self.tie_breaking_strategy)

    def calc_q_value(self, state, action):  # compute the Q value from a state and an action
        """Calculate the Q function for the given state action pair.
        # compute the Q value for the given state-action pair

        Parameters  # parameters
        ----------
        state: numpy.array  # state vector; the s in the Q function
            State vector that Q value is being calculated for. This is
            the s in Q(s, a)
        action: int  # the action taken; the a in the Q function
            Action index that Q value is being calculated for. This is
            the a in Q(s, a)

        Return  # return value
        ------
        float  # the Q value
            The Q value for the state action pair

        Raises  # error conditions
        ------
        ValueError
            If state's dimensions do not conform to basis function expectations
        IndexError
            If action is outside of the range of valid action indexes

        """
        if action < 0 or action >= self.basis.num_actions:  # validate the action index
            raise IndexError('action must be in range [0, num_actions)')

        # dot the weights with the feature vector returned by the basis function
        return self.weights.dot(self.basis.evaluate(state, action))

    def best_action(self, state):  # return the best action for the given state
        """Select the best action according to the policy.
        # pick the best action under the policy

        This calculates argmax_a Q(state, a). In other words it returns
        the action that maximizes the Q value for this state.

        Parameters  # inputs
        ----------
        state: numpy.array  # state vector
            State vector.
        tie_breaking_strategy: TieBreakingStrategy value  # tie-breaking strategy
            In the event of a tie specifies which action the policy should
            return. (Defaults to random)  # specifies which action the policy returns on a tie

        Returns  # return value
        -------
        int  # action index
            Action index

        Raises  # error conditions
        ------
        ValueError
            If state's dimensions do not match basis functions expectations.

        """
        # loop over all actions; each Q value is computed by a dot product
        # and the results are collected into a list
        q_values = [self.calc_q_value(state, action)
                    for action in range(self.basis.num_actions)]  # note this list-comprehension idiom

        best_q = float('-inf')  # initialize the best Q value
        best_actions = []  # initialize the list of best actions
        for action, q_value in enumerate(q_values):  # the position of each value in q_values is its action!
            if q_value > best_q:  # the current q_value beats the best so far
                best_actions = [action]  # update the best action and the best Q value
                best_q = q_value
            elif q_value == best_q:  # ties with the current best value
                best_actions.append(action)  # append the tied action to best_actions

        # i.e. the tie-breaking strategy only matters when there is a tie,
        # that is, when several best actions lead to the same Q value
        if self.tie_breaking_strategy == Policy.TieBreakingStrategy.FirstWins:
            return best_actions[0]
        elif self.tie_breaking_strategy == Policy.TieBreakingStrategy.LastWins:
            return best_actions[-1]
        else:
            return random.choice(best_actions)

    def select_action(self, state):  # best_action above is greedy; this is epsilon-greedy, picking a random action with some probability
        """With random probability select best action or random action.

        If the random number is below the explore value then pick a random
        action, otherwise pick the best action according to the basis and
        policy weights.

        Parameters
        ----------
        state: numpy.array
            State vector

        Returns
        -------
        int
            Action index

        Raises
        ------
        ValueError
            If state's dimensions do not match basis functions expectations.

        """
        if random.random() < self.explore:
            return random.choice(range(self.basis.num_actions))
        else:
            return self.best_action(state)

    @property  # number of selectable actions
    def num_actions(self):
        r"""Return number of possible actions.

        This number should always match the value stored in basis.num_actions.

        Return
        ------
        int
            Number of possible actions. In range [1, :math:`\infty`)

        """
        return self.basis.num_actions

    @num_actions.setter
    def num_actions(self, value):
        """Set the number of possible actions.

        This number should always match the value stored in basis.num_actions.

        Parameters
        ----------
        value: int
            Value to set num_actions to. Must be >= 1

        Raises
        ------
        ValueError
            If value is < 1

        """
        self.basis.num_actions = value
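To see how these pieces fit together, here is a minimal usage sketch, not part of policy.py. It assumes the class above is importable as lspi.policy.Policy (the path its own docstring references); DummyBasis is a hypothetical stand-in for a real BasisFunction and only provides the three things Policy actually uses: a num_actions attribute, size(), and evaluate(state, action).

# minimal usage sketch; DummyBasis is hypothetical, not part of lspi
import numpy as np

from lspi.policy import Policy


class DummyBasis(object):
    """Hypothetical basis: one indicator feature per action."""

    def __init__(self, num_actions):
        self.num_actions = num_actions

    def size(self):
        # length of the feature vector, and therefore of the weight vector
        return self.num_actions

    def evaluate(self, state, action):
        # phi(s, a): all zeros except a 1 in the slot of the chosen action
        phi = np.zeros(self.size())
        phi[action] = 1.0
        return phi


basis = DummyBasis(num_actions=3)
policy = Policy(basis, discount=0.9, explore=0.1,
                weights=np.array([0.0, 1.0, 1.0]))

state = np.array([0.0])
print(policy.calc_q_value(state, 1))   # 1.0, the dot of weights with phi(s, 1)
print(policy.best_action(state))       # 1 or 2: they tie, so the tie-breaking strategy decides
print(policy.select_action(state))     # best action ~90% of the time, random action ~10%

With weights [0.0, 1.0, 1.0] actions 1 and 2 tie at Q = 1.0, so best_action exercises the tie-breaking strategy (RandomWins by default), and select_action falls back to a uniformly random action about 10% of the time because explore=0.1.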