This post covers domain.py.
# -*- coding: utf-8 -*-
"""Contains example domains that LSPI works on."""
# The domains that LSPI operates on.

import abc  # provides the abstract base class (ABC) machinery


from random import randint, random

import numpy as np

from sample import Sample


class Domain(object):

    r"""ABC for domains.

    # The Domain class is declared as an abstract base class.
    Minimum interface for a reinforcement learning domain.
    """

    __metaclass__ = abc.ABCMeta  # Python 2 style ABC declaration

    @abc.abstractmethod  # abstract method
    def num_actions(self):  # number of actions in the domain
        """Return number of possible actions for the given domain.

        Actions are indexed from 0 to num_actions - 1.

        Returns
        -------
        int
            Number of possible actions.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def current_state(self):  # current state of the domain
        """Return the current state of the domain.

        Returns
        -------
        numpy.array
            The current state of the environment expressed as a numpy array
            of the individual state variables.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def apply_action(self, action):  # apply an action
        """Apply action and return a sample.

        Parameters
        ----------
        action: int
            The action index to apply. This should be a number in the range
            [0, num_actions())

        Returns
        -------
        sample.Sample
            # Note: every sample contains the previous state, the action
            # applied, the resulting state and the reward received.
            # Note: this method is what advances the state of the system.
            Sample containing the previous state, the action applied, the
            received reward and the resulting state.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def reset(self, initial_state=None):  # reset the domain
        """Reset the simulator to initial conditions.

        Parameters
        ----------
        initial_state: numpy.array
            Optionally specify the state to reset to. If None then the domain
            should use its default initial set of states. The type will
            generally be a numpy.array, but a subclass may accept other types.

        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def action_name(self, action):  # human-readable name of an action
        """Return a string representation of the action.

        Parameters
        ----------
        action: int
            The action index to apply. This number should be in the range
            [0, num_actions())

        Returns
        -------
        str
            String representation of the action index.
        """
        pass  # pragma: no cover
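
# To make the Domain interface concrete, here is a minimal sketch of a
# subclass (my own illustration, not part of the original library): a
# hypothetical two-node "flip" domain with a single action that toggles the
# state and rewards landing on node 1. It only relies on the Sample class
# imported above.
class FlipDomain(Domain):

    def __init__(self):
        self._state = np.array([0])  # start at node 0

    def num_actions(self):
        return 1  # a single action: flip

    def current_state(self):
        return self._state

    def apply_action(self, action):
        if action != 0:
            raise ValueError('Action index outside of bounds [0, 1)')
        next_state = np.array([1 - self._state[0]])  # toggle between 0 and 1
        reward = 1 if next_state[0] == 1 else 0  # +1 for reaching node 1
        sample = Sample(self._state.copy(), action, reward, next_state.copy())
        self._state = next_state
        return sample

    def reset(self, initial_state=None):
        if initial_state is None:
            self._state = np.array([0])
        else:
            self._state = initial_state.astype(int)

    def action_name(self, action):
        return 'flip'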

class ChainDomain(Domain):
    # The chain domain. A "domain" is the environment the algorithm acts on:
    # here a chain of nodes, but it could equally be a cart, an inverted
    # pendulum, a complex robotic system, and so on.

    """Chain domain from the LSPI paper.

    Very simple MDP. Used to test LSPI methods and demonstrate the interface.
    The state space is a series of discrete nodes in a chain. There are two
    actions: Left and Right. These actions fail with a configurable
    probability. When an action fails, the opposite action is performed. In
    other words, if Left is the action applied but it fails, then the agent
    will actually move right (assuming it is not in the right-most state).

    The default reward for any action in a state is 0. There are 2 special
    states that give a +1 reward for entering. The two special states can be
    configured to appear at the ends of the chain, in the middle, or in the
    middle of each half of the state space.

    Parameters
    ----------
    num_states: int
        Number of states in the chain. Must be at least 4.
        Defaults to 10 states.
    reward_location: ChainDomain.RewardLocation
        Location of the states with +1 rewards.
    failure_probability: float
        The probability that the applied action will fail. Must be in range
        [0, 1]

    """

    class RewardLocation(object):
        # The reward location is represented by an enum-like inner class.

        """Location of states giving +1 reward in the chain.

        Ends:
            Rewards will be given at the ends of the chain.
        Middle:
            Rewards will be given at the middle two states of the chain.
        HalfMiddles:
            Rewards will be given at the middle two states of each half
            of the chain.

        """

        # Why these values? range(3) just assigns the integer codes
        # Ends=0, Middle=1, HalfMiddles=2.
        Ends, Middle, HalfMiddles = range(3)

    __action_names = ['left', 'right']  # name-mangled ("private") class attribute

    def __init__(self, num_states=10,
                 reward_location=RewardLocation.Ends,
                 failure_probability=.1):
        """Initialize ChainDomain."""
        if num_states < 4:  # validate the constructor arguments
            raise ValueError('num_states must be >= 4')
        if failure_probability < 0 or failure_probability > 1:
            raise ValueError('failure_probability must be in range [0, 1]')

        # member variables: number of states, reward location and
        # failure probability
        self.num_states = int(num_states)
        self.reward_location = reward_location
        self.failure_probability = failure_probability

        self._state = ChainDomain.__init_random_state(num_states)  # random initial state

    def num_actions(self):  # two actions: left and right
        """Return number of actions.

        Chain domain has 2 actions.

        Returns
        -------
        int
            Number of actions

        """
        return 2

    def current_state(self):  # current state
        """Return the current state of the domain.

        Returns
        -------
        numpy.array
            The current state as a 1D numpy vector of type int.

        """
        return self._state  # simply return the member variable

    def apply_action(self, action):  # apply an action
        """Apply the action to the chain.

        If left is applied then the occupied state index will decrease by 1.
        Unless the agent is already at 0, in which case the state will not
        change.

        If right is applied then the occupied state index will increase by 1.
        Unless the agent is already at num_states-1, in which case the state
        will not change.

        The reward function is determined by the reward location specified
        when constructing the domain.

        If failure_probability is > 0 then there is the chance for the left
        and right actions to fail. If the left action fails then the agent
        will move right. Similarly if the right action fails then the agent
        will move left.

        Parameters
        ----------
        action: int
            Action index. Must be in range [0, num_actions())

        Returns
        -------
        sample.Sample
            The sample for the applied action.

        Raises
        ------
        ValueError
            If the action index is outside of the range [0, num_actions())

        """
        if action < 0 or action >= 2:  # validate the action index
            raise ValueError('Action index outside of bounds [0, %d)' %
                             self.num_actions())

        action_failed = False  # initialize the failure flag
        if random() < self.failure_probability:
            # decide whether this action fails; the failure is part of the
            # system's own dynamics
            action_failed = True
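
        # Worked example (my annotation): with the default failure_probability
        # of 0.1, applying Left (action 0) from node 3 moves the agent to
        # node 2 with probability 0.9 and to node 4 with probability 0.1; the
        # max/min below clamp the movement at the ends of the chain.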
        # the state is a single integer index giving the agent's current node

        if (action == 0 and not action_failed) \
                or (action == 1 and action_failed):  # update the state
            new_location = max(0, self._state[0]-1)
        else:
            new_location = min(self.num_states-1, self._state[0]+1)

        next_state = np.array([new_location])

        # assign the reward by checking whether the new location is one of
        # the reward states
        reward = 0
        if self.reward_location == ChainDomain.RewardLocation.Ends:
            if new_location == 0 or new_location == self.num_states-1:
                reward = 1
        elif self.reward_location == ChainDomain.RewardLocation.Middle:
            if new_location == int(self.num_states/2) \
                    or new_location == int(self.num_states/2 + 1):
                reward = 1
        else:  # HalfMiddles case
            if new_location == int(self.num_states/4) \
                    or new_location == int(3*self.num_states/4):
                reward = 1

        # pack the computed values into a Sample
        sample = Sample(self._state.copy(), action, reward, next_state.copy())

        self._state = next_state

        return sample  # return the sample

    def reset(self, initial_state=None):  # reset the state
        """Reset the domain to initial state or specified state.

        If the state is unspecified then it will generate a random state, just
        like when constructing from scratch.

        The state must be a 1D numpy array with a single entry: the index of
        the node the agent occupies, in the range [0, num_states). Whatever
        the numpy array type used, it will be converted to an integer numpy
        array.

        Parameters
        ----------
        initial_state: numpy.array
            The state to set the simulator to. If None then set to a random
            state.

        Raises
        ------
        ValueError
            If the initial state's shape does not match (1, ). In other words
            the initial state must be a 1D numpy array with a single element.
        ValueError
            If the state value is outside the range [0, num_states).

        """
        if initial_state is None:
            self._state = ChainDomain.__init_random_state(self.num_states)
        else:
            if initial_state.shape != (1, ):
                raise ValueError('The specified state did not match the '
                                 + 'current state size')
            state = initial_state.astype(int)  # np.int is deprecated; use int
            if state[0] < 0 or state[0] >= self.num_states:
                raise ValueError('State value must be in range '
                                 + '[0, num_states)')
            self._state = state

    def action_name(self, action):  # return the action's name
        """Return string representation of actions.

        0:
            left
        1:
            right

        Returns
        -------
        str
            String representation of action.
        """
        return ChainDomain.__action_names[action]  # look up the name-mangled list

    @staticmethod  # static helper: randomly initialize the state
    def __init_random_state(num_states):
        """Return randomly initialized state of the specified size."""
        return np.array([randint(0, num_states-1)])
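
# A small usage sketch (my addition, not part of the original file): build a
# ChainDomain, take a few random actions and collect the resulting samples,
# which is roughly how a batch of training data for LSPI would be gathered.
if __name__ == '__main__':
    domain = ChainDomain(num_states=10,
                         reward_location=ChainDomain.RewardLocation.Ends,
                         failure_probability=.1)

    collected = []
    for _ in range(5):
        action = randint(0, domain.num_actions() - 1)
        collected.append(domain.apply_action(action))
        print('%s -> state %d' % (domain.action_name(action),
                                  domain.current_state()[0]))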