• Machine Learning Theory: Decision Trees


    References:

    Fundamentals

    Statistical Learning Methods (统计学习方法), Li Hang

    http://blog.csdn.net/ifruoxi/article/details/53081738

    http://blog.csdn.net/lsldd/article/details/41223147

    Decision trees (ID3, C4.5, CART, random forests)

    https://www.2cto.com/kf/201605/509184.html

    Machine learning algorithms in practice: tree regression

    Machine Learning in Action notes (Python implementation) - 09 - tree regression

    Well-written posts:

    http://blog.csdn.net/u014688145/article/details/53326910

    http://blog.csdn.net/u014688145/article/details/53212112

    Pruning algorithms

    http://blog.csdn.net/yujianmin1990/article/details/49864813

    CART algorithm

    CART (classification and regression trees) explained in detail (with clearly laid-out formulas)

    http://blog.csdn.net/zhihua_oba/article/details/72230427

    Code implementation (ID3 algorithm)

    http://blog.csdn.net/ifruoxi/article/details/53116427

    https://www.cnblogs.com/MrLJC/p/4099404.html
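    For reference, the quantities the ID3 code below computes are the empirical entropy of the class label and the per-feature information gain; a short LaTeX restatement of the standard definitions, in Li Hang's notation:

    % Empirical entropy of the class label over data set D,
    % where C_k is the set of samples belonging to class k
    H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}

    % Conditional entropy given feature A (D_i = samples taking the
    % i-th value of A), and the information gain that ID3 maximizes
    H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i), \qquad
    g(D, A) = H(D) - H(D \mid A)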

    # coding=utf-8
    from math import log

    # Adapted from https://www.cnblogs.com/MrLJC/p/4099404.html
    def createDataSet():
        dataSet = [[1, 1, 'yes'],
                   [1, 1, 'yes'],
                   [1, 0, 'no'],
                   [0, 1, 'no'],
                   [0, 1, 'no']]
        labels = ['no surfacing', 'flippers']
        return dataSet, labels
    
    
    # Compute the Shannon entropy of the class labels in the data set
    def calcShannonEnt(dataSet):
        numEntries = len(dataSet)
        labelCounts = {}
        for feaVec in dataSet:
            currentLabel = feaVec[-1]
            if currentLabel not in labelCounts:
                labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1
        shannonEnt = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / numEntries
            shannonEnt -= prob * log(prob, 2)
        return shannonEnt
    
    
    # Return the rows whose feature at index `axis` equals `value`,
    # with that feature column removed
    def splitDataSet(dataSet, axis, value):
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] == value:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
        return retDataSet
    
    
    def chooseBestFeatureToSplit(dataSet):
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = calcShannonEnt(dataSet)
        bestInfoGain = 0.0
        bestFeature = -1
        for i in range(numFeatures):
            featList = [example[i] for example in dataSet]
            uniqueVals = set(featList)
            newEntropy = 0.0
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature
    
    
    # The tree is built by consuming one feature per split, so the features can
    # run out before every subset is pure; in that case the node's class is
    # decided by majority vote.
    def majorityCnt(classList):
        classCount = {}
        for vote in classList:
            if vote not in classCount.keys():
                classCount[vote] = 0
            classCount[vote] += 1
        return max(classCount, key=classCount.get)  # class with the highest count
    
    
    def createTree(dataSet, labels):
        classList = [example[-1] for example in dataSet]
        if classList.count(classList[0]) == len(classList):  # stop splitting when all classes are identical
            return classList[0]
        if len(dataSet[0]) == 1:  # all features have been used up
            return majorityCnt(classList)
        bestFeat = chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}
        del (labels[bestFeat])
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            subLabels = labels[:]  # copy so the original label list is not modified
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet,
                                                                   bestFeat, value), subLabels)
        return myTree
    
    
    # Walk the tree with a test vector; featLabels maps feature names to indices
    def classify(inputTree, featLabels, testVec):
        classLabel = None  # stays None if no branch matches the test value
        firstStr = list(inputTree.keys())[0]
        secondDict = inputTree[firstStr]
        featIndex = featLabels.index(firstStr)
        for key in secondDict.keys():
            if testVec[featIndex] == key:
                if type(secondDict[key]).__name__ == 'dict':
                    classLabel = classify(secondDict[key], featLabels, testVec)
                else:
                    classLabel = secondDict[key]
        return classLabel
    
    
    def main():
    
        myDat, labels = createDataSet()
        myTree = createTree(myDat, labels)
        print(myTree)
        myDat, labels = createDataSet()
        print(classify(myTree, labels, [1, 0]))
        print(classify(myTree, labels, [1, 1]))
        print(classify(myTree, labels, [0, 1]))
    
    if __name__ == '__main__':
        main()

    {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    no
    yes
    no
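    As a quick sanity check on calcShannonEnt: the toy data set has 2 'yes' and 3 'no' labels, so the base entropy computed at the top of chooseBestFeatureToSplit is

    H(D) = -\frac{2}{5}\log_2\frac{2}{5} - \frac{3}{5}\log_2\frac{3}{5} \approx 0.971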

     Code implementation (ID3 & C4.5 algorithms, http://blog.csdn.net/u014688145/article/details/53212112)
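    The two split criteria below differ only in normalization: C4.5 divides the information gain by the entropy of the feature's own value distribution, penalizing features with many distinct values (such as the 工资/salary column that createDataSet adds):

    % Gain ratio used by chooseBestFeatureToSplitByC45; H_A(D) is the
    % entropy of feature A's own value distribution
    g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \qquad
    H_A(D) = -\sum_{i=1}^{n} \frac{|D_i|}{|D|} \log_2 \frac{|D_i|}{|D|}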

    main.py

    #coding=utf-8
    #http://blog.csdn.net/u014688145/article/details/53212112
    from math import log
    
    def createDataSet_NOID():
        """
        Loan-application data set from Li Hang, Statistical Learning Methods,
        Table 5.1 (是 = yes, 否 = no)
        """
        dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'青年', u'否', u'否', u'好', u'拒绝'],
                    [u'青年', u'是', u'否', u'好', u'同意'],
                    [u'青年', u'是', u'是', u'一般', u'同意'],
                    [u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'中年', u'否', u'否', u'一般', u'拒绝'],
                    [u'中年', u'否', u'否', u'好', u'拒绝'],
                    [u'中年', u'是', u'是', u'好', u'同意'],
                    [u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'老年', u'否', u'是', u'非常好', u'同意'],
                    [u'老年', u'否', u'是', u'好', u'同意'],
                    [u'老年', u'是', u'否', u'好', u'同意'],
                    [u'老年', u'是', u'否', u'非常好', u'同意'],
                    [u'老年', u'否', u'否', u'一般', u'拒绝'],
                    ]
        labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']  # age, has job, has house, credit rating
        # Return the data set and the name of each feature dimension
        return dataSet, labels
    
    def createDataSet():
        """
        The same loan-application data set with an extra, nearly unique
        工资 (salary) column, used to contrast ID3 and C4.5
        """
        dataSet = [[u'1000', u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'2000', u'青年', u'否', u'否', u'好', u'拒绝'],
                    [u'7000', u'青年', u'是', u'否', u'好', u'同意'],
                    [u'7100', u'青年', u'是', u'是', u'一般', u'同意'],
                    [u'3000', u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'3500', u'中年', u'否', u'否', u'一般', u'拒绝'],
                    [u'3600', u'中年', u'否', u'否', u'好', u'拒绝'],
                    [u'8000', u'中年', u'是', u'是', u'好', u'同意'],
                    [u'9000', u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'9200', u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'8600', u'老年', u'否', u'是', u'非常好', u'同意'],
                    [u'7800', u'老年', u'否', u'是', u'好', u'同意'],
                    [u'10000', u'老年', u'是', u'否', u'好', u'同意'],
                    [u'6500', u'老年', u'是', u'否', u'非常好', u'同意'],
                    [u'3000', u'老年', u'否', u'否', u'一般', u'拒绝'],
                    ]
        labels = [u'工资', u'年龄', u'有工作', u'有房子', u'信贷情况']  # salary, age, has job, has house, credit rating
        # Return the data set and the name of each feature dimension
        return dataSet, labels
    
    def calcInformationGainRate(dataSet, baseEntropy, i):
        """
        Compute the information gain ratio (C4.5)
        :param dataSet: data set
        :param baseEntropy: entropy of Y in the data set
        :param i: feature dimension i
        :return: gain ratio g_R(dataSet, X_i) of feature i
        """
        numEntries = len(dataSet)
        labelCounts = {}
        for featVec in dataSet:
            currentLabel = featVec[i]
            if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1
        shannonEnt = 0.0  # entropy of feature i's own value distribution, H_A(D)
        for key in labelCounts:
            prob = float(labelCounts[key]) / numEntries
            shannonEnt -= prob * log(prob, 2)
        if shannonEnt == 0.0:  # feature has a single value: no split information
            return 0.0
        return calcInformationGain(dataSet, baseEntropy, i) / shannonEnt
    
    
    def chooseBestFeatureToSplitByC45(dataSet):
        """
        Choose the best feature to split on, by information gain ratio (C4.5)
        :param dataSet:
        :return:
        """
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = calcShannonEnt(dataSet)
        bestInfoGainRate = 0.0
        bestFeature = -1
        for i in range(numFeatures):
            infoGainRate = calcInformationGainRate(dataSet, baseEntropy, i)
            if (infoGainRate > bestInfoGainRate):
                bestInfoGainRate = infoGainRate
                bestFeature = i
        return bestFeature
    
    def splitDataSet(dataSet, axis, value):
        """
        Split the data set on a given feature
        :param axis: dimension of the feature to split on
        :param value: value of the feature
        :return: all instances matching that value (with this feature dimension removed)
        """

        # Iterate over every row of dataSet
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] == value:
                reduceFeatVec = featVec[:axis]  # drop this feature dimension
                reduceFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reduceFeatVec)
        return retDataSet
    
    # This always measures the uncertainty of the class label
    def calcShannonEnt(dataSet):
        """
        Compute the Shannon entropy of the random variable Y in the training data
        :param dataSet:
        :return:
        """
        numEntries = len(dataSet)  # number of instances
        labelCounts = {}
        for featVec in dataSet:  # count the frequency of each label
            currentLabel = featVec[-1]  # the last column
            # If the label is not yet in labelCounts, add it
            if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1

        shannonEnt = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / numEntries
            shannonEnt -= prob * log(prob, 2)  # log base 2
        return shannonEnt
    
    def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
        """
        Compute the conditional entropy of Y given feature x_i
        :param dataSet: data set
        :param i: dimension i
        :param featList: list of feature values (unused here, kept for the caller)
        :param uniqueVals: set of distinct feature values
        :return: conditional entropy
        """
        ce = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))  # maximum-likelihood probability estimate
            ce += prob * calcShannonEnt(subDataSet)  # conditional entropy: sum of p * H(Y|X=x_i)
        return ce
    
    def calcInformationGain(dataSet, baseEntropy, i):
        """
        Compute the information gain
        :param dataSet: data set
        :param baseEntropy: entropy of Y in the data set
        :param i: feature dimension i
        :return: information gain g(dataSet | X_i) of feature i
        """
        featList = [example[i] for example in dataSet]  # values of the i-th feature
        uniqueVals = set(featList)  # as a set: each element appears once
        newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)
        infoGain = baseEntropy - newEntropy  # information gain
        return infoGain
    
    def chooseBestFeatureToSplitByID3(dataSet):
        """
        Choose the best feature to split on, by information gain (ID3)
        :param dataSet:
        :return:
        """
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = calcShannonEnt(dataSet)
        bestInfoGain = 0.0
        bestFeature = -1
        for i in range(numFeatures):  # iterate over all feature dimensions
            infoGain = calcInformationGain(dataSet, baseEntropy, i)
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature  # dimension of the best feature
    
    # The tree is built by consuming one feature per split, so the features can
    # run out before every subset is pure; in that case the node's class is
    # decided by majority vote.
    def majorityCnt(classList):
        classCount = {}
        for vote in classList:
            if vote not in classCount.keys():
                classCount[vote] = 0
            classCount[vote] += 1
        return max(classCount, key=classCount.get)  # class with the highest count
    
    def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
        """
        Create a decision tree
        :param dataSet: data set
        :param labels: name of each dimension of the data set
        :return: decision tree
        """
        classList = [example[-1] for example in dataSet]  # list of class labels
        if classList.count(classList[0]) == len(classList):  # stop splitting when all classes are identical
            return classList[0]
        if len(dataSet[0]) == 1:  # only the label column is left: return the majority class
            return majorityCnt(classList)
        bestFeat = chooseBestFeatureToSplitFunc(dataSet)
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature label
        del (labels[bestFeat])
        # Build the subtree for each value of the chosen feature
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            subLabels = labels[:]  # copy so the original label list is not modified
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, chooseBestFeatureToSplitFunc)
        return myTree
    
    
    # Test tree construction (C4.5; pass chooseBestFeatureToSplitByID3 to compare)
    dataSet, labels = createDataSet()
    myTree = createTree(dataSet, labels, chooseBestFeatureToSplitByC45)
    
    from pylab import *
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
    mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box

    # Plot the decision tree
    import treePlotter
    treePlotter.createPlot(myTree)
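    The script above only builds and plots the tree. A minimal prediction sketch, assuming the classify() helper from the ID3 section is available in the same scope (note that createTree consumes labels, so the data set is re-created before classifying):

    dataSet, labels = createDataSet_NOID()
    tree = createTree(dataSet, labels)      # ID3 by default
    dataSet, labels = createDataSet_NOID()  # re-create: createTree mutated labels
    # Elderly applicant, no job, owns a house, good credit -> expect u'同意'
    print(classify(tree, labels, [u'老年', u'否', u'是', u'好']))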
    treePlotter.py
    #coding=utf-8
    import matplotlib.pyplot as plt
    
    # Define box and arrow styles
    decisionNode = dict(boxstyle="round4", color='#3366FF')  # internal (decision) node style
    leafNode = dict(boxstyle="circle", color='#FF6633')  # leaf node style
    arrow_args = dict(arrowstyle="<-", color='g')  # arrow style
    
    # Draw a node with an arrow annotation pointing from its parent
    def plotNode(nodeTxt, centerPt, parentPt, nodeType):
        createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                                xytext=centerPt, textcoords='axes fraction',
                                va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)
    
    
    # Count the leaf nodes
    def getNumLeafs(myTree):
        numLeafs = 0
        firstStr = list(myTree.keys())[0]
        secondDict = myTree[firstStr]
        for key in secondDict.keys():
            if type(secondDict[key]).__name__ == 'dict':
                numLeafs += getNumLeafs(secondDict[key])
            else:
                numLeafs += 1
        return numLeafs
    
    
    # Compute the depth (number of levels) of the tree
    def getTreeDepth(myTree):
        maxDepth = 0
        firstStr = list(myTree.keys())[0]
        secondDict = myTree[firstStr]
        for key in secondDict.keys():
            if type(secondDict[key]).__name__ == 'dict':
                thisDepth = 1 + getTreeDepth(secondDict[key])
            else:
                thisDepth = 1
            if thisDepth > maxDepth:
                maxDepth = thisDepth
        return maxDepth
    
    
    # Fill in text between a parent and child node (the branch's feature value)
    def plotMidText(cntrPt, parentPt, txtString):
        xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
        yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
        createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)
    
    
    def plotTree(myTree, parentPt, nodeTxt):
        numLeafs = getNumLeafs(myTree)
        depth = getTreeDepth(myTree)
        firstStr = list(myTree.keys())[0]
        cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
        plotMidText(cntrPt, parentPt, nodeTxt)  # label the branch from the parent
        plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
        secondDict = myTree[firstStr]
        plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
        for key in secondDict.keys():
            if type(secondDict[key]).__name__ == 'dict':
                plotTree(secondDict[key], cntrPt, str(key))
            else:
                plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
                plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
                plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
        plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
    
    
    def createPlot(inTree):
        fig = plt.figure(1, facecolor='white')
        fig.clf()
        axprops = dict(xticks=[], yticks=[])
        createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
        plotTree.totalW = float(getNumLeafs(inTree))
        plotTree.totalD = float(getTreeDepth(inTree))
        plotTree.xOff = -0.5 / plotTree.totalW
        plotTree.yOff = 1.0
        plotTree(inTree, (0.5, 1.0), '')
        plt.show()

    Code implementation (CART algorithm, classification tree)
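    CART replaces entropy with the Gini index: for each feature A and candidate value a, chooseBestFeatureToSplitByCART below evaluates the weighted Gini index of the two-way split D_1 = {A = a}, D_2 = {A ≠ a} and keeps the lowest:

    % Gini index of a data set (C_k = samples in class k), and the
    % weighted Gini index of a binary split on A = a
    \mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} \left( \frac{|C_k|}{|D|} \right)^2, \qquad
    \mathrm{Gini}(D, A{=}a) = \frac{|D_1|}{|D|}\,\mathrm{Gini}(D_1) + \frac{|D_2|}{|D|}\,\mathrm{Gini}(D_2)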

    #coding=utf-8
    #http://blog.csdn.net/u014688145/article/details/53212112
    import operator  # used by majorityCnt
    from math import log
    
    def createDataSet():
        """
        Loan-application data set from Li Hang, Statistical Learning Methods,
        Table 5.1 (是 = yes, 否 = no)
        """
        dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'青年', u'否', u'否', u'好', u'拒绝'],
                    [u'青年', u'是', u'否', u'好', u'同意'],
                    [u'青年', u'是', u'是', u'一般', u'同意'],
                    [u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'中年', u'否', u'否', u'一般', u'拒绝'],
                    [u'中年', u'否', u'否', u'好', u'拒绝'],
                    [u'中年', u'是', u'是', u'好', u'同意'],
                    [u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'老年', u'否', u'是', u'非常好', u'同意'],
                    [u'老年', u'否', u'是', u'好', u'同意'],
                    [u'老年', u'是', u'否', u'好', u'同意'],
                    [u'老年', u'是', u'否', u'非常好', u'同意'],
                    [u'老年', u'否', u'否', u'一般', u'拒绝'],
                    ]
        labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']  # age, has job, has house, credit rating
        # Return the data set and the name of each feature dimension
        return dataSet, labels
    
    def createDataSet_ID():
        """
        The same loan-application data set with an extra, nearly unique
        工资 (salary) column
        """
        dataSet = [[u'1000', u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'2000', u'青年', u'否', u'否', u'好', u'拒绝'],
                    [u'7000', u'青年', u'是', u'否', u'好', u'同意'],
                    [u'7100', u'青年', u'是', u'是', u'一般', u'同意'],
                    [u'3000', u'青年', u'否', u'否', u'一般', u'拒绝'],
                    [u'3500', u'中年', u'否', u'否', u'一般', u'拒绝'],
                    [u'3600', u'中年', u'否', u'否', u'好', u'拒绝'],
                    [u'8000', u'中年', u'是', u'是', u'好', u'同意'],
                    [u'9000', u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'9200', u'中年', u'否', u'是', u'非常好', u'同意'],
                    [u'8600', u'老年', u'否', u'是', u'非常好', u'同意'],
                    [u'7800', u'老年', u'否', u'是', u'好', u'同意'],
                    [u'10000', u'老年', u'是', u'否', u'好', u'同意'],
                    [u'6500', u'老年', u'是', u'否', u'非常好', u'同意'],
                    [u'3000', u'老年', u'否', u'否', u'一般', u'拒绝'],
                    ]
        labels = [u'工资', u'年龄', u'有工作', u'有房子', u'信贷情况']  # salary, age, has job, has house, credit rating
        # Return the data set and the name of each feature dimension
        return dataSet, labels
    
    # Compute the Gini index of the data set
    def calcGini(dataSet):
        numEntries = len(dataSet)
        labelCounts = {}
        # Build a dict with one entry per class
        for featVec in dataSet:
            currentLabel = featVec[-1]
            if currentLabel not in labelCounts.keys():
                labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1
        Gini = 1.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / numEntries
            Gini -= prob * prob
        return Gini
    
    
    def splitOtherDataSetByValue(dataSet, axis, value):
        """
        Split the data set on a given feature
        :param axis: dimension of the feature to split on
        :param value: value of the feature
        :return: all instances NOT matching that value (with this feature dimension removed)
        """
        # Iterate over every row of dataSet
        retDataSet = []
        # Collect the complement of the subset with this feature value
        for featVec in dataSet:
            if featVec[axis] != value:
                reduceFeatVec = featVec[:axis]  # drop this feature dimension
                reduceFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reduceFeatVec)
        return retDataSet
    
    def splitDataSet(dataSet, axis, value):
        """
        Split the data set on a given feature
        :param axis: dimension of the feature to split on
        :param value: value of the feature
        :return: all instances matching that value (with this feature dimension removed)
        """

        # Iterate over every row of dataSet
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] == value:
                reduceFeatVec = featVec[:axis]  # drop this feature dimension
                reduceFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reduceFeatVec)
        return retDataSet
    
    def chooseBestFeatureToSplitByCART(dataSet):
        numFeatures = len(dataSet[0]) - 1
        bestGiniIndex = float('inf')
        bestSplitValue = None
        bestFeature = -1
        # Evaluate the Gini index of every candidate split
        for i in range(numFeatures):
            featList = [example[i] for example in dataSet]
            # Discrete-valued features only
            uniqueVals = set(featList)
            bestGiniCut = float('inf')
            bestGiniCutValue = None
            # For each value, compute the Gini index of the binary split
            # (value vs. everything else) and remember the best cut point
            for value in uniqueVals:
                # Gini index of the subset with this value
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                Gini_value = prob * calcGini(subDataSet)
                # Gini index of the complementary subset
                otherDataSet = splitOtherDataSetByValue(dataSet, i, value)
                prob = len(otherDataSet) / float(len(dataSet))
                Gini_value = Gini_value + prob * calcGini(otherDataSet)
                # Keep the best cut point for this feature
                if Gini_value < bestGiniCut:
                    bestGiniCut = Gini_value
                    bestGiniCutValue = value

            # Keep the best feature overall
            GiniIndex = bestGiniCut
            if GiniIndex < bestGiniIndex:
                bestGiniIndex = GiniIndex
                bestSplitValue = bestGiniCutValue
                bestFeature = i
                print(bestFeature, bestSplitValue)  # trace the current best split

        # If the chosen feature has several values, binarize it around the
        # recorded best split value
        binaryZationDataSet(bestFeature, bestSplitValue, dataSet)
        return bestFeature
    
    # The tree is built by consuming one feature per split, so the features can
    # run out before every subset is pure; in that case the node's class is
    # decided by majority vote.
    def majorityCnt(classList):
        """
        Return the most frequent class name
        :param classList: list of class labels
        :return: the most frequent class name
        """

        classCount = {}
        for vote in classList:
            if vote not in classCount.keys(): classCount[vote] = 0
            classCount[vote] += 1
        sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]
    
    def binaryZationDataSet(bestFeature, bestSplitValue, dataSet):
        # Count the distinct values of the chosen feature
        featList = [example[bestFeature] for example in dataSet]
        uniqueValues = set(featList)

        # Binarize the data set in place when the feature has two or more values
        # (two-valued features are also rewritten, to make the binary construction visible)
        if len(uniqueValues) >= 2:
            for i in range(len(dataSet)):
                if dataSet[i][bestFeature] == bestSplitValue:  # keep the best split value
                    pass
                else:
                    dataSet[i][bestFeature] = u'其他'  # "other"
    
    def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByCART):
        """
        Create a decision tree
        :param dataSet: data set
        :param labels: name of each dimension of the data set
        :return: decision tree
        """
        classList = [example[-1] for example in dataSet]  # list of class labels
        if classList.count(classList[0]) == len(classList):  # stop splitting when all classes are identical
            return classList[0]
        if len(dataSet[0]) == 1:  # only the label column is left: return the majority class
            return majorityCnt(classList)
        bestFeat = chooseBestFeatureToSplitFunc(dataSet)
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature label
        del (labels[bestFeat])
        # Build the subtree for each value of the chosen feature
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            subLabels = labels[:]  # copy so the original label list is not modified
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, chooseBestFeatureToSplitFunc)
        return myTree
    
    
    # Test tree construction (CART)
    dataSet, labels = createDataSet()
    myTree = createTree(dataSet, labels, chooseBestFeatureToSplitByCART)
    
    from pylab import *
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
    mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box

    # Plot the decision tree
    import treePlotter
    treePlotter.createPlot(myTree)
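    One subtlety worth noticing: chooseBestFeatureToSplitByCART calls binaryZationDataSet, which mutates the data set in place, rewriting every value of the chosen feature other than the best split value to u'其他' ("other"). A minimal sketch of that side effect on a fresh copy of the data (a hypothetical demo, not part of the original post):

    # Hypothetical demo of the in-place binarization side effect
    demo, demoLabels = createDataSet()
    best = chooseBestFeatureToSplitByCART(demo)  # mutates `demo`
    print(demoLabels[best])
    # The chosen column now holds at most two values:
    # the best split value and u'其他'
    print(set(row[best] for row in demo))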