Value Function Approximation - LSPI Code (6)


This post covers domain.py.

# -*- coding: utf-8 -*-
"""Contains example domains that LSPI works on."""
# This module contains the domains that LSPI operates on.

import abc  # provides the abstract-base-class (ABC) machinery


from random import randint, random

import numpy as np

from sample import Sample


class Domain(object):

    r"""ABC for domains.

    Minimum interface for a reinforcement learning domain.
    """
    # Domain is an abstract base class; concrete domains subclass it.

    __metaclass__ = abc.ABCMeta  # registers the class with the abc machinery

    @abc.abstractmethod  # abstract method: number of actions
    def num_actions(self):
        """Return number of possible actions for the given domain.

        Actions are indexed from 0 to num_actions - 1.

        Returns
        -------
        int
            Number of possible actions.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method: current state
    def current_state(self):
        """Return the current state of the domain.

        Returns
        -------
        numpy.array
            The current state of the environment expressed as a numpy array
            of the individual state variables.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def apply_action(self, action):  # apply an action and return a Sample
        """Apply action and return a sample.

        Note: every Sample bundles the previous state, the applied action, the
        resulting state and the received reward; this method is what actually
        advances the system's state.

        Parameters
        ----------
        action: int
            The action index to apply. This should be a number in the range
            [0, num_actions())

        Returns
        -------
        sample.Sample
            Sample containing the previous state, the action applied, the
            received reward and the resulting state.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method: reset the domain
    def reset(self, initial_state=None):
        """Reset the simulator to initial conditions.

        Parameters
        ----------
        initial_state: numpy.array
            Optionally specify the state to reset to. If None then the domain
            should use its default initial set of states. The type will
            generally be a numpy.array, but a subclass may accept other types.

        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method: human-readable name of an action
    def action_name(self, action):
        """Return a string representation of the action.

        Parameters
        ----------
        action: int
            The action index to apply. This number should be in the range
            [0, num_actions())

        Returns
        -------
        str
            String representation of the action index.
        """
        pass  # pragma: no cover


# A "domain" is the environment the algorithm acts on. Besides this simple
# chain it could just as well be a cart on a plane, an inverted pendulum, or a
# complex robotic system.
class ChainDomain(Domain):

    """Chain domain from the LSPI paper.

    Very simple MDP. Used to test LSPI methods and demonstrate the interface.
    The state space is a series of discrete nodes in a chain. There are two
    actions: Left and Right. These actions fail with a configurable
    probability. When an action fails, the opposite action is performed. In
    other words, if left is the action applied, but it fails, then the agent
    will actually move right (assuming it is not in the right-most state).

    The default reward for any action in a state is 0. There are 2 special
    states that will give a +1 reward for entering. The two special states can
    be configured to appear at the ends of the chain, in the middle, or
    in the middle of each half of the state space.

    Parameters
    ----------
    num_states: int
        Number of states in the chain. Must be at least 4.
        Defaults to 10 states.
    reward_location: ChainDomain.RewardLocation
        Location of the states with +1 rewards
    failure_probability: float
        The probability that the applied action will fail. Must be in range
        [0, 1]

    """

    class RewardLocation(object):
        # The reward location is represented by a small class used as an enum.

        """Location of states giving +1 reward in the chain.

        Ends:
            Rewards will be given at the ends of the chain.
        Middle:
            Rewards will be given at the middle two states of the chain.
        HalfMiddles:
            Rewards will be given at the middle two states of each half
            of the chain.

        """

        # A lightweight enum pattern: Ends = 0, Middle = 1, HalfMiddles = 2.
        Ends, Middle, HalfMiddles = range(3)
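
    # Example: ChainDomain(reward_location=ChainDomain.RewardLocation.Middle)
    # places the two +1-reward states at the middle of the chain.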

    __action_names = ['left', 'right']  # name-mangled class attribute, intended as private

    # Constructor
    def __init__(self, num_states=10,
                 reward_location=RewardLocation.Ends,
                 failure_probability=.1):
        """Initialize ChainDomain."""
        # Validate the arguments
        if num_states < 4:
            raise ValueError('num_states must be >= 4')
        if failure_probability < 0 or failure_probability > 1:
            raise ValueError('failure_probability must be in range [0, 1]')

        # Member variables: number of states, reward location, failure probability
        self.num_states = int(num_states)
        self.reward_location = reward_location
        self.failure_probability = failure_probability

        self._state = ChainDomain.__init_random_state(num_states)  # random initial state

    def num_actions(self):  # two actions in total: left and right
        """Return number of actions.

        Chain domain has 2 actions.

        Returns
        -------
        int
            Number of actions

        """
        return 2

    def current_state(self):
        """Return the current state of the domain.

        Returns
        -------
        numpy.array
            The current state as a 1D numpy vector of type int.

        """
        return self._state  # simply return the member variable

    def apply_action(self, action):
        """Apply the action to the chain.

        If left is applied then the occupied state index will decrease by 1.
        Unless the agent is already at 0, in which case the state will not
        change.

        If right is applied then the occupied state index will increase by 1.
        Unless the agent is already at num_states-1, in which case the state
        will not change.

        The reward function is determined by the reward location specified when
        constructing the domain.

        If failure_probability is > 0 then there is the chance for the left
        and right actions to fail. If the left action fails then the agent
        will move right. Similarly if the right action fails then the agent
        will move left.

        Parameters
        ----------
        action: int
            Action index. Must be in range [0, num_actions())

        Returns
        -------
        sample.Sample
            The sample for the applied action.

        Raises
        ------
        ValueError
            If the action index is outside of the range [0, num_actions())

        """
        # Check that the action index is valid
        if action < 0 or action >= 2:
            raise ValueError('Action index outside of bounds [0, %d)' %
                             self.num_actions())

        # Decide whether this action fails; this is part of the system's own
        # (stochastic) dynamics.
        action_failed = False
        if random() < self.failure_probability:
            action_failed = True

        # this assumes that the state has one and only one occupied location

        # Update the state: move left if "left" succeeded or "right" failed,
        # otherwise move right, clamping to the ends of the chain.
        if ((action == 0 and not action_failed)
                or (action == 1 and action_failed)):
            new_location = max(0, self._state[0]-1)
        else:
            new_location = min(self.num_states-1, self._state[0]+1)

        next_state = np.array([new_location])

        # Assign the reward: +1 only if the new location is one of the
        # configured reward states, otherwise 0.
        reward = 0
        if self.reward_location == ChainDomain.RewardLocation.Ends:
            if new_location == 0 or new_location == self.num_states-1:
                reward = 1
        elif self.reward_location == ChainDomain.RewardLocation.Middle:
            if (new_location == int(self.num_states/2)
                    or new_location == int(self.num_states/2 + 1)):
                reward = 1
        else:  # HalfMiddles case
            if (new_location == int(self.num_states/4)
                    or new_location == int(3*self.num_states/4)):
                reward = 1
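
        # Illustration (for the default num_states=10):
        #   Ends        -> reward at states 0 and 9
        #   Middle      -> reward at states int(10/2) = 5 and int(10/2 + 1) = 6
        #   HalfMiddles -> reward at states int(10/4) = 2 and int(3*10/4) = 7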
        # Package the computed values into a Sample
        sample = Sample(self._state.copy(), action, reward, next_state.copy())

        self._state = next_state

        return sample  # return the collected sample

    def reset(self, initial_state=None):  # reset the domain's state
        """Reset the domain to a random initial state or to a specified state.

        If the state is unspecified then it will generate a random state, just
        like when constructing from scratch.

        The state must have the same shape as the existing state: a 1D numpy
        array with a single entry holding the index of the occupied node.
        Whatever numpy array type is used, it will be converted to an integer
        numpy array.

        Parameters
        ----------
        initial_state: numpy.array
            The state to set the simulator to. If None then set to a random
            state.

        Raises
        ------
        ValueError
            If the initial state's shape does not match (1, ). In other words
            the initial state must be a 1D numpy array with the same length
            as the existing state.
        ValueError
            If the state value is outside the range [0, num_states).

        """
        if initial_state is None:
            # No state given: pick a random one, as in the constructor
            self._state = ChainDomain.__init_random_state(self.num_states)
        else:
            if initial_state.shape != (1, ):
                raise ValueError('The specified state did not match the '
                                 + 'current state size')
            state = initial_state.astype(int)  # convert to an integer array
            if state[0] < 0 or state[0] >= self.num_states:
                raise ValueError('State value must be in range '
                                 + '[0, num_states)')
            self._state = state

    def action_name(self, action):  # return the action's name
        """Return string representation of actions.

        0:
            left
        1:
            right

        Returns
        -------
        str
            String representation of action.
        """
        return ChainDomain.__action_names[action]  # look up the private name table

    @staticmethod  # static helper: randomly initialize the state
    def __init_random_state(num_states):
        """Return randomly initialized state of the specified size."""
        return np.array([randint(0, num_states-1)])
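
To see how the interface is meant to be used, here is a minimal usage sketch. It is not part of domain.py; it assumes the file is importable as the module `domain`. It builds a ChainDomain, applies a few random actions, and collects the returned samples, which is exactly the kind of data an LSPI learner consumes.

from random import randint

from domain import ChainDomain

chain = ChainDomain(num_states=10,
                    reward_location=ChainDomain.RewardLocation.Ends,
                    failure_probability=0.1)

samples = []
for _ in range(5):
    action = randint(0, chain.num_actions() - 1)   # pick left (0) or right (1)
    sample = chain.apply_action(action)            # advance the chain by one step
    samples.append(sample)
    print(chain.action_name(action), chain.current_state())

# Each element of `samples` bundles the previous state, the action, the reward
# and the next state, as described in apply_action's docstring.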
Original post: https://www.cnblogs.com/lijiajun/p/5491100.html