决策树
ID3 和 C4.5
from math import log
import operator
def createDataSet():
    """
    Create the watermelon test dataset.

    :return: (dataSet, labels) -- each row of dataSet holds six feature
             values followed by the class label ('好瓜' good / '坏瓜' bad);
             labels names the six feature columns in order.
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'] ]
    # Names of the feature columns (the class column has no label here)
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    return dataSet,labels
# Compute the Shannon entropy of one column of the dataset
# (id = -1 selects the class-label column).
def calcShannonEnt(dataSet, id):
    """Return the Shannon entropy (base 2) of column `id` of dataSet.

    :param dataSet: list of rows; each row is a list of values with the
                    class label in the last position.
    :param id: index of the column whose value distribution is measured.
    :return: entropy in bits; 0.0 for an empty dataset.
    """
    numEntries = len(dataSet)
    if numEntries == 0:
        # Empty input: no distribution, entropy is zero by convention.
        return 0.0
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[id]
        # dict.get replaces the separate membership test + init
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
# Select the rows matching one feature value, dropping that feature column.
def splitDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose column `axis` equals `value`,
    with that column removed from every returned row.
    """
    subset = []
    for row in dataSet:
        if row[axis] != value:
            continue
        subset.append(row[:axis] + row[axis + 1:])
    return subset
'''
mode = '信息增益' or '信息增益率'
分别对应不同的模式
'''
def chooseBestFeatureToSplit(dataSet, mode):
    """Pick the index of the best feature to split on.

    :param dataSet: rows with the class label in the last column.
    :param mode: '信息增益' (information gain, ID3) or
                 '信息增益率' (gain ratio, C4.5).
    :return: index of the best feature, or -1 if no split improves.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet, -1)  # entropy before splitting
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet, -1)
        infoGain = baseEntropy - newEntropy  # information gain of this split
        if mode == '信息增益率':
            # Intrinsic value of the feature. A single-valued feature has
            # IV == 0, which used to raise ZeroDivisionError; such a split
            # is useless anyway, so skip it.
            IV = calcShannonEnt(dataSet, i)
            if IV == 0:
                continue
            infoGain = infoGain / IV
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i  # remember the best feature so far
    return bestFeature
# Majority vote: return the class that occurs most often in the list.
def majorityCnt(classList):
    """Return the most frequent class label in classList."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() is stable for ties in the same way stable descending sort is:
    # the first-inserted label among equals wins.
    return max(tally.items(), key=operator.itemgetter(1))[0]
'''
mode = '信息增益' or '信息增益率'
分别对应不同的模式
'''
def createTree(dataSet, labels, mode):
    """Recursively build an ID3/C4.5 decision tree as nested dicts.

    :param dataSet: rows with the class label in the last column.
    :param labels: feature names; NOTE: mutated (chosen label is removed).
    :param mode: '信息增益' or '信息增益率'.
    :return: a class label (leaf) or {feature label: {value: subtree}}.
    """
    classList = [row[-1] for row in dataSet]
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No feature columns left: fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet, mode)
    bestFeatLabel = labels[bestFeat]
    tree = {bestFeatLabel: {}}  # the tree is stored as a nested dict
    del labels[bestFeat]  # this feature is consumed
    for value in set(row[bestFeat] for row in dataSet):
        branch = splitDataSet(dataSet, bestFeat, value)
        tree[bestFeatLabel][value] = createTree(branch, labels[:], mode)
    return tree
if __name__ == '__main__':
    # Build and print a tree with each splitting criterion.
    # createTree mutates `labels`, so fetch a fresh copy per run.
    # (The original called createDataSet() a third time before the first
    # print and immediately discarded the result -- removed.)
    print('信息增益')
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels, '信息增益'))
    print('信息增益率')
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels, '信息增益率'))
CART回归树
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 6 13:54:43 2021
@author: koneko
"""
import numpy as np
def loadDataSet(fileName):
    """Parse a text file into a list of rows of floats.

    Each line holds space-separated numeric fields.

    :param fileName: path of the file to read.
    :return: list of rows, each a list of floats.
    """
    dataMat = []
    # `with` guarantees the handle is closed (the original leaked it)
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split(' ')
            # In Python 3 map() returns an iterator, so materialise it
            dataMat.append(list(map(float, curLine)))
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Split matrix rows on whether column `feature` exceeds `value`.

    :return: (rows with feature > value, rows with feature <= value)
    """
    over = np.nonzero(dataSet[:, feature] > value)[0]
    notOver = np.nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[over, :], dataSet[notOver, :]
def regLeaf(dataSet):
    """Leaf model for a regression tree: mean of the target (last) column."""
    targets = dataSet[:, -1]
    return np.mean(targets)
def regErr(dataSet):
    """Total squared error of the target column (variance times sample count)."""
    sampleCount = np.shape(dataSet)[0]
    return np.var(dataSet[:, -1]) * sampleCount
def chooseBestSplit(dataSet, leafType=regLeaf,errType=regErr,ops=(1,4)):
    """Find the best binary split of dataSet for a regression tree.

    :param dataSet: np.matrix whose last column is the regression target.
    :param leafType: builds a leaf value from a dataset (default: mean).
    :param errType: measures a dataset's error (default: total squared error).
    :param ops: (tolS, tolN) -- minimum error reduction to accept a split,
                and minimum sample count allowed in each branch.
    :return: (feature index, split value), or (None, leaf value) when no
             split satisfies the tolerances.
    """
    tolS = ops[0]  # minimum error reduction the user accepts
    tolN = ops[1]  # minimum number of samples per branch
    # If all target values are equal there is only one "class";
    # no split is needed, so return a leaf.
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m,n = np.shape(dataSet)
    S = errType(dataSet)  # error of the unsplit dataset
    bestS = np.inf
    bestIndex = 0
    bestValue = 0
    # Iterate over every feature in the set
    for featIndex in range(n-1):
        # ... and over every value that feature takes in the set
        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):  # note: this form is needed for Python 3 compatibility
            # Binary split on this value
            mat0, mat1 = binSplitDataSet(dataSet,featIndex,splitVal)
            # Skip the split if either branch has fewer than tolN samples
            if (np.shape(mat0)[0]<tolN) or (np.shape(mat1)[0]<tolN):
                continue
            # Error S of the data after this split
            newS = errType(mat0) + errType(mat1)
            # Keep the split with the smallest error so far
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # If the error reduction is too small, do not split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # If the resulting subsets are too small, do not split
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a CART regression tree as nested dicts.

    :param dataSet: np.matrix whose last column is the regression target.
    :param leafType: leaf-building function (default: mean of targets).
    :param errType: error function (default: total squared error).
    :param ops: (tolS, tolN) pre-pruning tolerances for chooseBestSplit.
    :return: a leaf value, or {'spInd', 'spVal', 'left', 'right'} subtree.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:  # stopping condition met: `val` is a leaf value
        # (use `is None`, not `== None`: identity check, and robust
        # against operands that overload __eq__)
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
# Demo: load the sample data, fit a regression tree, and show it.
# NOTE(review): requires 'ex00.txt' (space-separated numeric columns)
# in the working directory.
myData = loadDataSet('ex00.txt')
myMat = np.mat(myData)
tree = createTree(myMat)
print(tree)
CART分类树
from math import log
import operator
def createDataSet():
    """
    Create the watermelon test dataset.

    :return: (dataSet, labels) -- each row of dataSet holds six feature
             values followed by the class label ('好瓜' good / '坏瓜' bad);
             labels names the six feature columns in order.
    """
    dataSet = [
        # 1
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 4
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 5
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        # 7
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        # 8
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        # ----------------------------------------------------
        # 9
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        # 10
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        # 11
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        # 12
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        # 13
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 14
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 15
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        # 16
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        # 17
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]
    # Names of the feature columns (the class column has no label here)
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    return dataSet,labels
def calcGini(dataSet):
    """Gini impurity of the class labels (last column) of dataSet."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        label = row[-1]  # the class label lives in the last column
        counts[label] = counts.get(label, 0) + 1
    # Gini = 1 - sum of squared class probabilities
    return 1 - sum((c / float(total)) ** 2 for c in counts.values())
def createDataSet1(): # Create a small demo dataset
    """Return a toy dataset: two features ('头发' hair, '声音' voice)
    plus the gender class label in the last column."""
    dataSet = [['长', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['长', '细', '女'],
               ['短', '细', '女'],
               ['短', '粗', '女'],
               ['长', '粗', '女'],
               ['长', '粗', '女']]
    labels = ['头发','声音']  # the two feature names
    return dataSet,labels
def createDataSet2():
    """
    Create the loan-approval demo dataset.

    :return: (dataSet, labels) -- each row holds four feature values
             (age group, has job, owns house, credit rating) followed by
             the decision class ('同意' approve / '不同意' reject);
             labels names the four feature columns.
    """
    # The dataset
    dataSet = [['青年', '否', '否', '一般', '不同意'],
               ['青年', '否', '否', '好', '不同意'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '好', '不同意'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '不同意']]
    labels = ['年龄', '有工作', '有房子', '信贷情况']
    return dataSet,labels
# Binary partition on one feature column: rows equal to `value` vs the rest;
# the column itself is removed from every returned row.
def binSplitDataSet(dataSet, index, value):
    """Partition rows by whether column `index` equals `value`.

    :return: (rows where the column equals value, all other rows),
             each row with column `index` removed.
    """
    matches, others = [], []
    for row in dataSet:
        reduced = row[:index] + row[index + 1:]
        (matches if row[index] == value else others).append(reduced)
    return matches, others
def chooseBestFeatureToSplit(dataSet):
    """Find the (feature, value) pair whose binary split minimises the
    weighted Gini impurity.

    :param dataSet: rows with the class label in the last column.
    :return: (best feature index, best split value)
    """
    numFeatures = len(dataSet[0]) - 1
    total = float(len(dataSet))
    bestGini = 100  # larger than any real Gini value, so the first split wins
    bestFeature = -1
    for featIndex in range(numFeatures):
        for candidate in set(row[featIndex] for row in dataSet):
            part0, part1 = binSplitDataSet(dataSet, featIndex, candidate)
            # Gini of the split, weighted by branch size
            gini = ((len(part0) / total) * calcGini(part0)
                    + (len(part1) / total) * calcGini(part1))
            if gini < bestGini:
                bestGini = gini
                bestFeature = featIndex
                bestVal = candidate
    return bestFeature, bestVal
def majorityCnt(classList):
    """Return the class label that occurs most often in classList."""
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    # Stable descending sort: the first-inserted label wins ties.
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def createTree(dataSet, labels):
    """Recursively build a CART classification tree as nested dicts.

    The tree has the form {feature label: {branch: subtree}}, where the
    left branch key is the chosen split value and the right branch key is
    a tuple of the feature's remaining values.

    :param dataSet: rows with the class label in the last column.
    :param labels: feature names; NOTE: mutated (chosen label is removed).
    :return: a class label (leaf) or a nested-dict subtree.
    """
    classList = [row[-1] for row in dataSet]
    # Only one class left: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left to split on: majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Pick the best feature and value for a binary split.
    bestFeat, bestVal = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # the tree is stored as a nested dict
    del labels[bestFeat]
    mat0, mat1 = binSplitDataSet(dataSet, bestFeat, bestVal)
    left = bestVal
    right = set([row[bestFeat] for row in dataSet])
    right.remove(bestVal)
    right = tuple(right)
    # BUG FIX: the original passed ONE shared `subLabels` list to both
    # recursive calls; the first call deletes its chosen label from it,
    # misaligning labels for the second branch. Each branch now gets its
    # own copy. (A leftover debug print of `right` was also removed.)
    myTree[bestFeatLabel][left] = createTree(mat0, labels[:])
    myTree[bestFeatLabel][right] = createTree(mat1, labels[:])
    return myTree
# Demo: build a CART classification tree on the loan-approval data
# and print the resulting nested-dict tree.
dataSet, labels = createDataSet2()
myTree = createTree(dataSet,labels)
print(myTree)