This is policy.py.
# -*- coding: utf-8 -*-
"""LSPI Policy class used for learning and executing policy."""

import random

import numpy as np


class Policy(object):

    r"""Represents LSPI policy. Used for sampling, learning, and executing.
    # the LSPI policy is used for sampling, learning, and execution

    The policy class includes an exploration value which controls the
    probability of performing a random action instead of the best action
    according to the policy. This can be useful during sampling.

    It also includes the discount factor :math:`\gamma`, number of possible
    actions and the basis function used for this policy.
    # the Policy class holds an exploration value controlling the probability of
    # taking a random action instead of the best action under the policy
    # i.e. it is not purely greedy
    # this is very useful during sampling

    # it also holds a discount factor gamma, the number of possible actions,
    # and the basis function used by this policy

    # parameters
    Parameters
    ----------
    basis: BasisFunction  # basis function: produces phi, used to pick actions under the policy
        The basis function used to compute :math:`\phi` which is used to select
        the best action according to the policy
    discount: float, optional  # discount factor
        The discount factor :math:`\gamma`. Defaults to 1.0 which is valid
        for finite horizon problems.
    explore: float, optional  # exploration value
        Probability of executing a random action instead of the best action
        according to the policy. Defaults to 0 which is no exploration.
    weights: numpy.array or None  # weights: dotted with phi to produce the Q value
        The weight vector which is dotted with the :math:`\phi` vector from
        basis to produce the approximate Q value. When None is passed in
        the weight vector is initialized with random weights.
    tie_breaking_strategy: Policy.TieBreakingStrategy value
        The strategy to use if a tie occurs when selecting the best action.
        See the :py:class:`lspi.policy.Policy.TieBreakingStrategy`
        class description for what the different options are.

    Raises  # error conditions
    ------
    ValueError
        If discount is < 0 or > 1
    ValueError
        If explore is < 0 or > 1
    ValueError
        If weights are not None and the number of dimensions does not match
        the size of the basis function.
    """

    class TieBreakingStrategy(object):  # a class nested inside the class

        """Strategy for breaking a tie between actions in the policy.
        # strategy for breaking ties between actions

        FirstWins:  # return the first action encountered with the tied value
            In the event of a tie the first action encountered with that
            value is returned.
        LastWins:  # return the last action encountered with the tied value
            In the event of a tie the last action encountered with that
            value is returned.
        RandomWins  # return a random action among those with the tied value
            In the event of a tie a random action encountered with that
            value is returned.

        """

        FirstWins, LastWins, RandomWins = range(3)  # enum-style constants assigned 0, 1 and 2

    def __init__(self, basis, discount=1.0,
                 explore=0.0, weights=None,
                 tie_breaking_strategy=TieBreakingStrategy.RandomWins):
        """Initialize a Policy."""
        self.basis = basis  # store the basis function

        if discount < 0.0 or discount > 1.0:  # validate the discount factor
            raise ValueError('discount must be in range [0, 1]')

        self.discount = discount  # store the discount factor

        if explore < 0.0 or explore > 1.0:  # validate the exploration value
            raise ValueError('explore must be in range [0, 1]')

        self.explore = explore  # store the exploration value

        if weights is None:  # weight initialization
            self.weights = np.random.uniform(-1.0, 1.0, size=(basis.size(),))
        else:
            if weights.shape != (basis.size(), ):
                raise ValueError('weights shape must equal (basis.size(), )')
            self.weights = weights

        self.tie_breaking_strategy = tie_breaking_strategy  # store the tie-breaking strategy

    def __copy__(self):  # copy the policy
        """Return a copy of this class with a deep copy of the weights."""
        return Policy(self.basis,
                      self.discount,
                      self.explore,
                      self.weights.copy(),
                      self.tie_breaking_strategy)

    def calc_q_value(self, state, action):  # compute the Q value from a state and an action
        """Calculate the Q function for the given state action pair.
        # compute the Q value for the given state-action pair

        Parameters  # parameters
        ----------
        state: numpy.array  # state vector; the s in the Q function
            State vector that Q value is being calculated for. This is
            the s in Q(s, a)
        action: int  # the action taken; the a in the Q function
            Action index that Q value is being calculated for. This is
            the a in Q(s, a)

        Return  # return value
        ------
        float  # the Q value
            The Q value for the state action pair

        Raises  # error conditions
        ------
        ValueError
            If state's dimensions do not conform to basis function expectations
        IndexError
            If action is outside of the range of valid action indexes

        """
        if action < 0 or action >= self.basis.num_actions:  # validate the action index
            raise IndexError('action must be in range [0, num_actions)')

        # dot the weights with the feature vector returned by the basis function
        return self.weights.dot(self.basis.evaluate(state, action))

    def best_action(self, state):  # return the best action for the given state
        """Select the best action according to the policy.
        # pick the best action under the policy

        This calculates argmax_a Q(state, a). In other words it returns
        the action that maximizes the Q value for this state.

        Parameters  # inputs
        ----------
        state: numpy.array  # state vector
            State vector.
        tie_breaking_strategy: TieBreakingStrategy value  # tie-breaking strategy
            In the event of a tie specifies which action the policy should
            return. (Defaults to random)  # specifies which action the policy returns on a tie

        Returns  # return value
        -------
        int  # action index
            Action index

        Raises  # error conditions
        ------
        ValueError
            If state's dimensions do not match basis functions expectations.

        """
        # loop over all actions; each Q value is computed by a dot product
        # and the results are collected into a list
        q_values = [self.calc_q_value(state, action)
                    for action in range(self.basis.num_actions)]  # note this list-comprehension idiom

        best_q = float('-inf')  # initialize the best Q value
        best_actions = []  # initialize the list of best actions
        for action, q_value in enumerate(q_values):  # the position of each value in q_values is its action!
            if q_value > best_q:  # the current q_value beats the best so far
                best_actions = [action]  # update the best action and the best Q value
                best_q = q_value
            elif q_value == best_q:  # ties with the current best value
                best_actions.append(action)  # append the tied action to best_actions

        # i.e. the tie-breaking strategy only matters when there is a tie,
        # that is, when several best actions lead to the same Q value
        if self.tie_breaking_strategy == Policy.TieBreakingStrategy.FirstWins:
            return best_actions[0]
        elif self.tie_breaking_strategy == Policy.TieBreakingStrategy.LastWins:
            return best_actions[-1]
        else:
            return random.choice(best_actions)

    def select_action(self, state):  # best_action above is greedy; this is epsilon-greedy, picking a random action with some probability
        """With random probability select best action or random action.

        If the random number is below the explore value then pick a random
        action, otherwise pick the best action according to the basis and
        policy weights.

        Parameters
        ----------
        state: numpy.array
            State vector

        Returns
        -------
        int
            Action index

        Raises
        ------
        ValueError
            If state's dimensions do not match basis functions expectations.

        """
        if random.random() < self.explore:
            return random.choice(range(self.basis.num_actions))
        else:
            return self.best_action(state)

    @property  # number of selectable actions
    def num_actions(self):
        r"""Return number of possible actions.

        This number should always match the value stored in basis.num_actions.

        Return
        ------
        int
            Number of possible actions. In range [1, :math:`\infty`)

        """
        return self.basis.num_actions

    @num_actions.setter
    def num_actions(self, value):
        """Set the number of possible actions.

        This number should always match the value stored in basis.num_actions.

        Parameters
        ----------
        value: int
            Value to set num_actions to. Must be >= 1

        Raises
        ------
        ValueError
            If value is < 1

        """
        self.basis.num_actions = value
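To see how these pieces fit together, here is a minimal usage sketch, not part of policy.py. It assumes the class above is importable as lspi.policy.Policy (the path its own docstring references); DummyBasis is a hypothetical stand-in for a real BasisFunction and only provides the three things Policy actually uses: a num_actions attribute, size(), and evaluate(state, action).

# minimal usage sketch; DummyBasis is hypothetical, not part of lspi
import numpy as np

from lspi.policy import Policy


class DummyBasis(object):
    """Hypothetical basis: one indicator feature per action."""

    def __init__(self, num_actions):
        self.num_actions = num_actions

    def size(self):
        # length of the feature vector, and therefore of the weight vector
        return self.num_actions

    def evaluate(self, state, action):
        # phi(s, a): all zeros except a 1 in the slot of the chosen action
        phi = np.zeros(self.size())
        phi[action] = 1.0
        return phi


basis = DummyBasis(num_actions=3)
policy = Policy(basis, discount=0.9, explore=0.1,
                weights=np.array([0.0, 1.0, 1.0]))

state = np.array([0.0])
print(policy.calc_q_value(state, 1))   # 1.0, the dot of weights with phi(s, 1)
print(policy.best_action(state))       # 1 or 2: they tie, so the tie-breaking strategy decides
print(policy.select_action(state))     # best action ~90% of the time, random action ~10%

With weights [0.0, 1.0, 1.0] actions 1 and 2 tie at Q = 1.0, so best_action exercises the tie-breaking strategy (RandomWins by default), and select_action falls back to a uniformly random action about 10% of the time because explore=0.1.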