• 《西瓜书》 (the "Watermelon Book", Zhou Zhihua's *Machine Learning*), Chapter 4, Decision Trees 2: continuous values, missing values, plotting


    ▶ Continuing to improve the decision-tree code: classify on continuous-valued attributes and on attributes with missing values, and add plotting code (my first time drawing a figure recursively; Matplotlib's plt interface can be called from anywhere without passing figure handles around, which is very convenient).

    ● Code; only the parts that differ from the simple decision tree are commented

import numpy as np
import matplotlib.pyplot as plt
import operator
import warnings

warnings.filterwarnings("ignore")
dataSize = 1000
trainRatio = 0.3
randomSeed = 107

def dataSplit(data, part):                                  # split into training and test parts
    return data[0:part], data[part:]

def kernel(x, i, n):                                                                    # piecewise function used to carve out the classes
    return np.select([x < 1-i/n, True], [i*x/(n-i), n*(x-1)/i-x+2])

def createData(dim, kind, len):                                                         # continuous attribute values, no 'option' parameter
    np.random.seed(randomSeed)
    temp = np.random.rand(len, dim)
    x = np.sum(temp[:,:-1],1) / (dim - 1)
    if kind == 2:
        f = temp[:,-1] > x * (32 / 3 * (x-1) * (x-1/2) + 1)                             # cubic curve through (0,0), (1/4,3/4), (1/2,1/2), (1,1)
    elif kind == 3:
        f = (temp[:,-1] > x ** 2).astype(int) + (temp[:,-1] > 1 - (1-x)**2).astype(int) # two parabolas that split the unit square into three parts
    else:
        f = np.zeros(len)
        fi = np.frompyfunc(kernel, 3, 1)                                                # let kernel accept vector input
        for i in range(1, kind):                                                        # take equally spaced points on the anti-diagonal of the unit square
            f += temp[:,-1] > fi(x,i,kind).astype(float)                                # connect each of them to (0,0) and (1,1) to partition the square

    output = [ temp[i].tolist() + [str(f[i])] for i in range(len) ]
    label = [ chr(i + 65) for i in range(dim) ]                                         # attribute names 'A', 'B', ...
    #for line in output:
    #    print(line)
    print("dim = %d, kind = %d, dataSize = %d, weightedMean = %4f"%(dim, kind, dataSize, np.sum(f.astype(int)) / (len *(kind-1))))
    return output, label

def plogp(x, isScalar):                                     # x * log2(x), with 0 * log(0) taken as 0
    output = x * np.log2(x)
    if isScalar:
        return [0 if np.isnan(output) else output][0]
    output[np.isnan(output)] = 0.0
    return output

def calculateGain(table, alpha = 0):                                                    # exactly the same formula as in the discrete case
    sumC = np.sum(table, 0)
    sumR = np.sum(table, 1)
    sumA = np.sum(sumC)
    temp = -( np.sum(plogp(sumC,False)) - plogp(sumA,True) - np.sum(plogp(table,False)) + np.sum(plogp(sumR,False)) ) / sumA
    if alpha == 0:
        return temp
    elif alpha == 1:
        return temp * (-1.0 / np.sum(plogp(sumR / sumA,False)))
    else:
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 0) / (sumR * sumR))) )

def chooseFeature(data,label):
    size, dim = np.shape(data)
    dim -= 1
    realMaxGain = 0
    realPoint = -1
    maxi = -1
    kindTable = list(set([ data[j][-1] for j in range(size) ]))
    for i in range(dim):
        valueTable = [ data[j][i] for j in range(size) ]
        sortedValueTable = sorted(valueTable)
        tSet = [ (sortedValueTable[i] + sortedValueTable[i+1])/2 for i in range(size-1)]# candidate split points (midpoints of consecutive sorted values)
        maxGain = 0
        point = -1
        for t in tSet:                                                                  # try every split point
            table = np.zeros([2,2],dtype=int)
            for j in range(size):
                table[int(data[j][-1] == kindTable[0]),int(data[j][i] < t)] += 1
            gain = calculateGain(table)
            if (gain > maxGain):                                                        # inner maximum over the split point t
                maxGain = gain
                point = t
        if (maxGain > realMaxGain):                                                     # outer maximum over the attribute i
            realMaxGain = maxGain
            maxi = i
            realPoint = point
    return (maxi, realPoint)                                                            # no longer returns the value table of the best attribute; returns the split point instead

def vote(kindList):                                         # majority vote among the labels
    kindCount = {}
    for i in kindList:
        if i not in kindCount.keys():
            kindCount[i] = 0
        kindCount[i] += 1
    output = sorted(kindCount.items(),key=operator.itemgetter(1),reverse = True)
    return output[0][0]

def createTree(data,label):
    #if data == []:
    #    return '?'
    if len(data[0]) == 1:
        return vote([ line[-1] for line in data ])
    classList = set([i[-1] for i in data])
    if len(classList) == 1:
        return list(classList)[0]

    bestFeature, point = chooseFeature(data, label)
    bestLabel = label[bestFeature]
    myTree = {(bestLabel,point):{}}                                  # attribute node of the tree, with the split point attached

    childData = []                                                   # recurse on the two halves: key 0 is the data not greater than the split point, key 1 the data greater than it
    for line in data:                                                # only two branches, so the branch loop is written out explicitly
        if line[bestFeature] <= point:
            childData.append(line)
    myTree[(bestLabel,point)][0] = createTree(childData, label)
    childData = []
    for line in data:
        if line[bestFeature] > point:
            childData.append(line)
    myTree[(bestLabel,point)][1] = createTree(childData, label)
    return myTree

def draw(xMin, xMax, yMin, yMax, nowTree, kindType):                        # recursively draw the split boundaries inside the rectangle [xMin,xMax] x [yMin,yMax]
    #plt.plot([xMin,xMax],[yMin,yMin],color=[1,1,1])
    #plt.plot([xMin,xMax],[yMax,yMax],color=[1,1,1])
    #plt.plot([xMin,xMin],[yMin,yMax],color=[1,1,1])
    #plt.plot([xMax,xMax],[yMin,yMax],color=[1,1,1])
    direction,value = list(nowTree)[0]
    if direction == 'A':                                                    # vertical split line
        plt.plot([value,value],[yMin,yMax],color=[0,0,0])
        branch0,branch1 = list(nowTree.values())[0].values()
        if type(branch0) == kindType:                                       # left branch is a leaf: print the first letter of its class
            plt.text((xMin+value)/2,(yMin+2*yMax)/3, str(branch0[0]),
                size = 9, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
        else:
            draw(xMin, value, yMin, yMax, branch0, kindType)
        if type(branch1) == kindType:                                       # right branch
            plt.text((xMax+value)/2,(2*yMin+yMax)/3, str(branch1[0]),
                size = 9, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
        else:
            draw(value, xMax, yMin, yMax, branch1, kindType)
    else:                                                                   # horizontal split line
        plt.plot([xMin,xMax],[value,value],color=[0,0,0])
        branch0,branch1 = list(nowTree.values())[0].values()
        if type(branch0) == kindType:                                       # lower branch
            plt.text((xMin+2*xMax)/3,(yMin+value)/2, str(branch0[0]),
                size = 9, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
        else:
            draw(xMin, xMax, yMin, value, branch0, kindType)
        if type(branch1) == kindType:                                       # upper branch
            plt.text((2*xMin+xMax)/3,(yMax+value)/2, str(branch1[0]),
                size = 9, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
        else:
            draw(xMin, xMax, value, yMax, branch1, kindType)

def test(dim, kind):
    allData, labelName = createData(dim, kind, dataSize)
    trainData, testData = dataSplit(allData, int(dataSize * trainRatio))
    outputTree = createTree(trainData, labelName)
    print(outputTree)

    myResult = []
    #count = 0
    for line in testData:
        #print(count)
        tempTree = outputTree
        while(True):
            judgeName = list(tempTree)[0]
            judgeValue = list(tempTree.values())[0]
            value = line[labelName.index(judgeName[0])]         # the sample's value of the attribute stored in this node
            resultNow = judgeValue[int(value > judgeName[1])]   # compare it against the node's split point
            if type(resultNow) == type(allData[0][-1]):
                myResult.append(resultNow)
                break
            tempTree = resultNow
        #count+=1

    fig = plt.figure(figsize=(10, 8))
    plt.xlim(0.0,1.0)
    plt.ylim(0.0,1.0)
    xT = []
    xF = []
    yT = []
    yF = []
    for i in range(len(testData)):
        if testData[i][-1] == 'True':
            xT.append(testData[i][0])
            yT.append(testData[i][1])
        else:
            xF.append(testData[i][0])
            yF.append(testData[i][1])
    plt.scatter(xT,yT,color=[1,0,0],label = "classT")
    plt.scatter(xF,yF,color=[0,0,1],label = "classF")
    plt.legend(loc=[0.85, 0.1], ncol=1, numpoints=1, framealpha = 1)
    draw(0.0,1.0,0.0,1.0,outputTree,type(allData[0][-1]))
    fig.savefig(r"R:\dim" + str(dim) + ".png")
    plt.close()
    print("errorRatio = %4f"%( sum(map(lambda x,y:int(x!=y[-1]), myResult, testData)) / (dataSize*(1-trainRatio)) ))

if __name__=='__main__':
    test(2, 2)
    #test(2, 2)
    #test(3, 2)
    #test(4, 2)
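
    The quantity computed by calculateGain and maximized in chooseFeature is the usual bi-partition gain for a continuous attribute from the book: with the sorted values a^1 ≤ … ≤ a^n of attribute a on D, the candidate split points are the midpoints of consecutive values (the list tSet above), and the attribute's gain is the best gain over those points:

    T_a = \left\{ \frac{a^{i} + a^{i+1}}{2} \;\middle|\; 1 \le i \le n-1 \right\},
    \qquad
    \mathrm{Gain}(D, a) = \max_{t \in T_a} \left( \mathrm{Ent}(D) - \sum_{\lambda \in \{-, +\}} \frac{|D_t^{\lambda}|}{|D|}\, \mathrm{Ent}(D_t^{\lambda}) \right)

    Here D_t^- and D_t^+ are the samples with value not above t and above t; they are exactly the two columns of the 2x2 table built for each t, so the entropy expression is literally the same as in the discrete case, as the comment in calculateGain notes.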

    ● Output (numeric precision truncated, otherwise it gets too long)

    {('B',0.3032): {0: {('A',0.0408): {0: {('A',0.0352): {0: 'False',
                                                          1: 'True'
                                                         }
                                          },
                                       1: 'False'
                                      }
                       },
                    1: {('B',0.6816): {0: {('A',0.5020): {0: {('A',0.0797): {0: {('B',0.3171): {0: 'False',
                                                                                                1: 'True'
                                                                                               }
                                                                                },
                                                                             1: {('B',0.6223): {0: {('A',0.4835): {0: 'False',
                                                                                                                   1: {('A',0.4932): {0: 'True',
                                                                                                                                      1: 'False'
                                                                                                                                     }
                                                                                                                      }
                                                                                                                  }
                                                                                                   },
                                                                                                1: {('A',0.3897): {0: {('A',0.1309): {0: 'True',
                                                                                                                                      1: 'False'
                                                                                                                                     }
                                                                                                                      },
                                                                                                                   1: 'True'
                                                                                                                  }
                                                                                                   }
                                                                                               }
                                                                                }
                                                                            }
                                                             },
                                                          1: {('A',0.9160): {0: {('A',0.5407): {0: {('A',0.5327): {0: 'True',
                                                                                                                   1: 'False'
                                                                                                                  }
                                                                                                   },
                                                                                                1: 'True'
                                                                                               }
                                                                                },
                                                                             1: 'False'
                                                                            }
                                                             }
                                                         }
                                          },
                                       1: {('A',0.9450): {0: {('B',0.7335): {0: {('B',0.7262): {0: 'True',
                                                                                                1: 'False'
                                                                                               }
                                                                                },
                                                                             1: 'True'
                                                                            }
                                                             },
                                                          1: 'False'
                                                         }
                                          }
                                      }
                       }
                   }
    }
     errorRatio = 0.054286

    ● Plot

     ● When there are missing values, add one extra row to the table built in chooseFeature that stores, for each class, the count of samples whose value of the current attribute is missing, and feed it into the following function to compute the gain (a sketch of how that table could be built follows the code below)

def calculateGain(table, alpha = 0):                        # missing-value case: table has one extra row holding, per class, the counts of samples with a missing value
    sumC = np.sum(table[:-1], 0)                            # row/column sums exclude the missing-value row
    sumR = np.sum(table[:-1], 1)
    sumA = np.sum(sumC)
    temp = -( np.sum(plogp(sumC,False)) - plogp(sumA,True) - np.sum(plogp(table[:-1],False)) + np.sum(plogp(sumR,False)) ) / (sumA + np.sum(table[-1]))  # the joint term likewise excludes the missing-value row; the overall denominator counts the missing samples too, which amounts to multiplying by ρ
    if alpha == 0:
        return temp
    elif alpha == 1:
        return temp * (-1.0 / np.sum(plogp(sumR / sumA,False)))
    else:
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 0) / (sumR * sumR))) )
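
    In the book's notation this is Gain(D, a) = ρ · Gain(D̃, a), where D̃ is the subset of samples whose value of a is not missing and ρ = |D̃| / |D|: the gain is computed on the non-missing rows only and then scaled down by enlarging the denominator. The post does not show the matching change inside chooseFeature, so the helper below is only a sketch of how the extended table could be filled for one candidate split point t; the function name buildTable and the convention that a missing entry is stored as None are assumptions for illustration, not the original code.

import numpy as np

def buildTable(data, i, t, kindTable):
    # Rows 0 and 1 count, per class, the samples whose (non-missing) value of
    # attribute i falls on either side of the split point t; the extra row 2
    # counts, per class, the samples whose value of attribute i is missing.
    # Assumption: a missing value is stored as None in the data rows.
    table = np.zeros([3, len(kindTable)], dtype=int)
    for line in data:
        k = kindTable.index(line[-1])          # column index = class label
        if line[i] is None:                    # value of attribute i is missing
            table[2, k] += 1
        else:
            table[int(line[i] > t), k] += 1    # row 0: value <= t, row 1: value > t
    return table

    Inside the loop over tSet in chooseFeature, gain = calculateGain(buildTable(data, i, t, kindTable)) would then replace the 2x2 construction, and tSet itself would be computed from the non-missing values only.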

     ● TODO: the plotting function for the case with more than two classes
