• 决策树(挖坑待填)


    决策树

    ID3 和 C4.5

    from math import log
    import operator
    
    def createDataSet():
        """Return the watermelon sample set and its feature names.

        Each row holds six categorical feature values followed by the class
        label ('好瓜' / '坏瓜'); the first 8 rows are positive samples, the
        remaining 9 are negative.
        """
        samples = [
            ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
            ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
            ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
            ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
            ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
            ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
            ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
            ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
            ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ]
        # Names of the six feature columns, in column order.
        featureNames = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
        return samples, featureNames
    
    
    # Shannon entropy of one column of the data set.
    def calcShannonEnt(dataSet, id):
        """Compute the Shannon entropy (base 2) of column `id` of `dataSet`.

        Pass id=-1 to measure the class column; any other index measures
        the value distribution of that feature (used as IV for gain ratio).
        """
        total = len(dataSet)
        counts = {}
        for row in dataSet:
            key = row[id]
            counts[key] = counts.get(key, 0) + 1
        entropy = 0.0
        for cnt in counts.values():
            p = cnt / float(total)
            entropy -= p * log(p, 2)
        return entropy
        
    # Select the rows matching one feature value, dropping that feature column.
    def splitDataSet(dataSet, axis, value):
        """Return the rows of `dataSet` whose column `axis` equals `value`,
        with that column removed from each returned row."""
        return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
    
    def chooseBestFeatureToSplit(dataSet, mode):
        """Pick the index of the best feature to split on.

        mode = '信息增益' (information gain, ID3) or '信息增益率'
        (gain ratio, C4.5).

        :param dataSet: rows of feature values with the class in the last column
        :param mode: split criterion, see above
        :return: index of the best feature, or -1 if no feature improves
        """
        numFeatures = len(dataSet[0]) - 1
        # Entropy of the class column before any split.
        baseEntropy = calcShannonEnt(dataSet, -1)
        bestInfoGain = 0.0
        bestFeature = -1
        for i in range(numFeatures):
            uniqueVals = set(example[i] for example in dataSet)
            # Weighted entropy of the partitions induced by feature i.
            newEntropy = 0.0
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet, -1)
            infoGain = baseEntropy - newEntropy  # information gain of this split
            if mode == '信息增益率':
                # Intrinsic value of feature i (entropy of its own distribution).
                IV = calcShannonEnt(dataSet, i)
                if IV == 0:
                    # Single-valued feature: gain ratio is undefined (and the
                    # split is useless anyway) — skip instead of dividing by 0.
                    continue
                infoGain = infoGain / IV
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i  # remember the best feature so far
        return bestFeature
    
    
    # Majority vote: the class that occurs most often wins.
    def majorityCnt(classList):
        """Return the most frequent label in `classList`
        (on a tie, the label seen first wins)."""
        tally = {}
        for label in classList:
            tally[label] = tally.get(label, 0) + 1
        return max(tally.items(), key=operator.itemgetter(1))[0]
    
    def createTree(dataSet, labels, mode):
        """Recursively build an ID3/C4.5 decision tree as nested dicts.

        mode = '信息增益' or '信息增益率' — forwarded to
        chooseBestFeatureToSplit to select the split criterion.
        NOTE: mutates `labels` (the chosen feature's name is removed).
        """
        classList = [row[-1] for row in dataSet]
        # All samples share one class: this node is a pure leaf.
        if classList.count(classList[0]) == len(classList):
            return classList[0]
        # No features left to split on: fall back to majority vote.
        if len(dataSet[0]) == 1:
            return majorityCnt(classList)
        bestFeat = chooseBestFeatureToSplit(dataSet, mode)
        bestFeatLabel = labels[bestFeat]
        tree = {bestFeatLabel: {}}  # the tree is stored as nested dicts
        del labels[bestFeat]  # this feature is consumed
        for value in set(row[bestFeat] for row in dataSet):
            # Each branch recurses on the matching subset with its own
            # copy of the remaining feature names.
            subset = splitDataSet(dataSet, bestFeat, value)
            tree[bestFeatLabel][value] = createTree(subset, labels[:], mode)
        return tree
    
    
    if __name__ == '__main__':
        # Build and print a tree under each split criterion. A fresh copy of
        # the data is fetched per run because createTree mutates `labels`.
        # (A redundant initial createDataSet() call, whose result was
        # immediately overwritten, was removed.)
        print('信息增益')
        dataSet, labels = createDataSet()
        print(createTree(dataSet, labels, '信息增益'))
        print('信息增益率')
        dataSet, labels = createDataSet()
        print(createTree(dataSet, labels, '信息增益率'))
    

    CART回归树

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jan  6 13:54:43 2021
    
    @author: koneko
    """
    
    import numpy as np
    
    def loadDataSet(fileName):
        """Read a tab-separated numeric text file into a list of float rows.

        :param fileName: path to the data file (one sample per line,
                         values separated by tabs)
        :return: list of rows, each a list of floats
        """
        dataMat = []
        # `with` guarantees the handle is closed (the original leaked it).
        with open(fileName) as fr:
            for line in fr:
                curLine = line.strip().split('\t')
                # Python 3: map returns an iterator, so materialize it.
                dataMat.append(list(map(float, curLine)))
        return dataMat
    
    def binSplitDataSet(dataSet, feature, value):
        """Split the rows of matrix `dataSet` on column `feature`:
        rows strictly greater than `value` go to the first matrix,
        rows less than or equal go to the second."""
        above = np.nonzero(dataSet[:, feature] > value)[0]
        below = np.nonzero(dataSet[:, feature] <= value)[0]
        return dataSet[above, :], dataSet[below, :]
    
    def regLeaf(dataSet):
        """Leaf model for regression: the mean of the target (last) column."""
        return dataSet[:, -1].mean()
    
    def regErr(dataSet):
        """Total squared error of the target column:
        population variance multiplied by the number of samples."""
        target = dataSet[:, -1]
        return target.var() * np.shape(dataSet)[0]
    
    def chooseBestSplit(dataSet, leafType=regLeaf,errType=regErr,ops=(1,4)):
        """Search for the best (feature, value) binary split of `dataSet`.

        :param dataSet: np.mat with the regression target in the last column
        :param leafType: builds a leaf model from a data subset
        :param errType: measures a subset's error
        :param ops: (tolS, tolN) stopping thresholds
        :return: (featureIndex, splitValue), or (None, leafValue) when the
                 stopping criteria say this node should become a leaf
        """
        tolS = ops[0]  # minimum error reduction a split must achieve
        tolN = ops[1]  # minimum number of samples allowed in a branch
        # All target values identical -> nothing to gain from splitting; make a leaf.
        if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
            return None, leafType(dataSet)
        m,n = np.shape(dataSet)
        S = errType(dataSet)  # error of the whole node: baseline for the tolS test
        bestS = np.inf
        bestIndex = 0
        bestValue = 0
        # Try every feature...
        for featIndex in range(n-1):
            # ...and every distinct value of that feature as a candidate threshold.
            # .T.A turns the matrix column into a plain array (Python 3 friendly).
            for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):
                # Binary split at this candidate value.
                mat0, mat1 = binSplitDataSet(dataSet,featIndex,splitVal)
                # Reject splits that leave a branch with fewer than tolN samples.
                if (np.shape(mat0)[0]<tolN) or (np.shape(mat1)[0]<tolN):
                    continue
                # Combined error of the two branches.
                newS = errType(mat0) + errType(mat1)
                # Keep the lowest-error split seen so far.
                if newS < bestS:
                    bestIndex = featIndex
                    bestValue = splitVal
                    bestS = newS
        # If the best split does not cut the error by at least tolS, give up.
        if (S - bestS) < tolS:
            return None, leafType(dataSet)
        mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
        # Final guard: the chosen split must also respect the tolN minimum.
        if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
            return None, leafType(dataSet)
        return bestIndex, bestValue
    
    def createTree(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
        """Recursively grow a CART regression tree.

        :param dataSet: np.mat with the regression target in the last column
        :param leafType: builds a leaf model from a data subset
        :param errType: measures a subset's error
        :param ops: (tolS, tolN) stopping thresholds, passed through
        :return: a leaf value, or a dict with keys
                 'spInd', 'spVal', 'left', 'right'
        """
        feat, val = chooseBestSplit(dataSet,leafType,errType,ops)
        # Stopping condition met: chooseBestSplit returned a leaf value.
        # `is None`, not `== None`: identity test is the correct idiom and
        # cannot be fooled by feat == 0.
        if feat is None:
            return val
        retTree = {}
        retTree['spInd'] = feat
        retTree['spVal'] = val
        lSet, rSet = binSplitDataSet(dataSet, feat, val)
        retTree['left'] = createTree(lSet,leafType,errType,ops)
        retTree['right'] = createTree(rSet,leafType,errType,ops)
        return retTree
        
    # Demo: load the sample data, convert it to a matrix, fit and print a tree.
    # Requires 'ex00.txt' (tab-separated numeric rows) in the working directory.
    myData = loadDataSet('ex00.txt')
    myMat = np.mat(myData)
    tree = createTree(myMat)
    print(tree)
    
    

    CART分类树

    from math import log
    import operator
    
    def createDataSet():
        """Return the watermelon sample set and its feature names.

        Each row: six categorical feature values followed by the class
        label ('好瓜' / '坏瓜'). Rows 1-8 are positive, rows 9-17 negative.
        """
        samples = [
            # positive samples (1-8)
            ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
            ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
            ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
            ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
            # negative samples (9-17)
            ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
            ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
            ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
            ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
            ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
            ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ]
        # Names of the six feature columns, in column order.
        featureNames = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
        return samples, featureNames
    
    def calcGini(dataSet):
        """Gini impurity of the class labels (last column) of `dataSet`:
        1 - sum(p_k^2) over the class proportions p_k."""
        total = len(dataSet)
        counts = {}
        for row in dataSet:
            cls = row[-1]  # class label lives in the last column
            counts[cls] = counts.get(cls, 0) + 1
        gini = 1.0
        for cnt in counts.values():
            p = cnt / float(total)
            gini -= p * p
        return gini
    
    
    def createDataSet1():
        """Return a tiny toy data set: two categorical features
        (hair length, voice) and a gender class label."""
        samples = [
            ['长', '粗', '男'],
            ['短', '粗', '男'],
            ['短', '粗', '男'],
            ['长', '细', '女'],
            ['短', '细', '女'],
            ['短', '粗', '女'],
            ['长', '粗', '女'],
            ['长', '粗', '女'],
        ]
        featureNames = ['头发', '声音']  # two features
        return samples, featureNames
    
    def createDataSet2():
        """Return the loan-approval toy data set.

        Each row: age group, has-job, has-house, credit rating, followed by
        the decision class ('同意' / '不同意').
        @return (dataSet, labels): samples and feature names
        """
        samples = [
            ['青年', '否', '否', '一般', '不同意'],
            ['青年', '否', '否', '好', '不同意'],
            ['青年', '是', '否', '好', '同意'],
            ['青年', '是', '是', '一般', '同意'],
            ['青年', '否', '否', '一般', '不同意'],
            ['中年', '否', '否', '一般', '不同意'],
            ['中年', '否', '否', '好', '不同意'],
            ['中年', '是', '是', '好', '同意'],
            ['中年', '否', '是', '非常好', '同意'],
            ['中年', '否', '是', '非常好', '同意'],
            ['老年', '否', '是', '非常好', '同意'],
            ['老年', '否', '是', '好', '同意'],
            ['老年', '是', '否', '好', '同意'],
            ['老年', '是', '否', '非常好', '同意'],
            ['老年', '否', '否', '一般', '不同意'],
        ]
        featureNames = ['年龄', '有工作', '有房子', '信贷情况']
        return samples, featureNames
    
    #对某一个特征列,按照某个其是否等于value,划分成两个类
    def binSplitDataSet(dataSet,index,value):
        set1=[]
        set2=[]
        for featVec in dataSet:
            reducedFeatVec = featVec[:index]
            reducedFeatVec.extend(featVec[index+1:])
            if featVec[index] == value:
                set1.append(reducedFeatVec)
            else:
                set2.append(reducedFeatVec)
        return set1,set2
    
              
    def chooseBestFeatureToSplit(dataSet):
        """Return (featureIndex, value) whose binary split yields the lowest
        weighted Gini impurity over the two resulting subsets."""
        numFeatures = len(dataSet[0]) - 1
        nD = len(dataSet)
        bestGini_feat = 100  # any real split scores below this sentinel
        bestFeature = -1
        for feat in range(numFeatures):
            # Every distinct value of this feature is a candidate split point.
            for val in set(example[feat] for example in dataSet):
                set0, set1 = binSplitDataSet(dataSet, feat, val)
                # Gini of each half, weighted by its share of the samples.
                newGini_feat = (len(set0) / float(nD)) * calcGini(set0)
                newGini_feat += (len(set1) / float(nD)) * calcGini(set1)
                if newGini_feat < bestGini_feat:
                    bestGini_feat = newGini_feat
                    bestFeature = feat
                    bestVal = val
        return bestFeature, bestVal
                     
    
    def majorityCnt(classList):
        """Return the most frequent label in `classList`
        (on a tie, the label seen first wins)."""
        counts = {}
        for label in classList:
            counts[label] = counts.get(label, 0) + 1
        winner, best = None, -1
        for label, cnt in counts.items():
            if cnt > best:
                winner, best = label, cnt
        return winner
          
    
    def createTree(dataSet, labels):
        """Recursively build a CART classification tree as nested dicts.

        Each internal node is {featureName: {value: leftSubtree,
        (otherValues...): rightSubtree}} — the left branch matches the chosen
        split value, the right branch (keyed by a tuple) covers every other
        observed value of that feature.
        NOTE: mutates `labels` (the chosen feature's name is removed).
        """
        classList = [a[-1] for a in dataSet]
        # Pure node: only one class remains.
        if classList.count(classList[0]) == len(classList):
            return classList[0]
        # No features left to split on: fall back to majority vote.
        if len(dataSet[0]) == 1:
            return majorityCnt(classList)
        # Pick the best (feature, value) binary split.
        bestFeat, bestVal = chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}  # the tree is stored as nested dicts
        del(labels[bestFeat])
        mat0, mat1 = binSplitDataSet(dataSet, bestFeat, bestVal)
        left = bestVal
        right = set([a[bestFeat] for a in dataSet])
        right.remove(bestVal)
        right = tuple(right)
        # BUG FIX: each recursive call must receive its OWN copy of `labels`.
        # The original shared one `subLabels` list, so deletions made while
        # building the left branch corrupted the feature names seen by the
        # right branch. (A leftover debug `print(right)` was also removed.)
        myTree[bestFeatLabel][left] = createTree(mat0, labels[:])
        myTree[bestFeatLabel][right] = createTree(mat1, labels[:])
        return myTree
        
    # Demo: build the tree from the loan-approval data and print it.
    # NOTE: createTree mutates `labels` in place.
    dataSet, labels = createDataSet2()
    
    myTree = createTree(dataSet,labels)
    print(myTree)
    
  • 相关阅读:
    通用人工智能离我们还有多远?
    自动化机器上的物联网网关的目的是什么?
    提效降本,您不可不知道的云架构秘诀
    “影子物联网”:日益增长的企业安全盲区
    查找练习 hash——出现过的数字
    查找练习 hash——出现过的数字
    数据结构上机实验之二分查找
    数据结构上机实验之二分查找
    简单字符串比较
    简单字符串比较
  • 原文地址:https://www.cnblogs.com/urahyou/p/14245850.html
Copyright © 2020-2023  润新知