1. Information Entropy
Some concepts
p(x): the probability of class x, i.e. (number of samples with class x) / (total number of samples)
Information of x: l(x) = -log2(p(x))
Entropy: the expected value of the information, H = p(x1)l(x1) + p(x2)l(x2) + … = -(p(x1)log2(p(x1)) + p(x2)log2(p(x2)) + …). (Note that l(x) already carries the minus sign, so it is not negated again.) For example, with two equally likely classes, H = -(0.5·log2(0.5) + 0.5·log2(0.5)) = 1 bit.
Computing the entropy
from math import log

def calcShannonEnt(dataset):
    numEntries = len(dataset)
    # Count how many rows fall under each class label (the last column)
    labelCounts = {}
    for featVec in dataset:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    # H = -sum(p * log2(p)) over all class labels
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
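A quick sanity check on a toy two-feature dataset (the rows here are made up for illustration; each row ends with its class label):

dataset = [[1, 1, 'yes'],
           [1, 1, 'yes'],
           [1, 0, 'no'],
           [0, 1, 'no'],
           [0, 1, 'no']]
print(calcShannonEnt(dataset))  # 2 'yes' vs 3 'no' -> about 0.971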
2. Splitting the Dataset by a Given Feature
# axis: index of the feature to split on; value: the feature value to keep
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Drop the feature at position axis from the row
            reducedFeatVec = featVec[:axis]
            '''
            b = [1, 2]
            a = [1, 2]
            b.append(a) adds a to b as a single element:
                result: b = [1, 2, [1, 2]]
            b.extend(a) extends b with the elements of a:
                result: b = [1, 2, 1, 2]
            '''
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
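Reusing the toy dataset above, splitting on feature 0 with value 1 keeps the matching rows and removes that column:

print(splitDataSet(dataset, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]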
3. Finding the Best Feature to Split the Dataset
def chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    numDatas = len(dataset)
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]  # column i
        uniqueVals = set(featList)  # all possible values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataset, i, value)
            prob = float(len(subDataSet)) / numDatas
            newEntropy += prob * calcShannonEnt(subDataSet)  # expected entropy after the split
        # Entropy measures disorder, so the smaller the expected entropy after
        # the split, the better; equivalently, pick the largest information gain.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
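On the toy dataset, splitting on feature 0 leaves an expected entropy of 3/5 · 0.918 + 2/5 · 0 ≈ 0.551 (gain ≈ 0.420), while feature 1 leaves 4/5 · 1.0 + 1/5 · 0 = 0.8 (gain ≈ 0.171), so feature 0 wins:

print(chooseBestFeatureToSplit(dataset))  # 0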
4. The Decision Tree Algorithm
Algorithm description
The tree-building algorithm createTree():
    if every item in the dataset has the same class label:
        return that class label
    if all features have been used up:
        return the most frequent class label
    find the best feature to split the dataset on
    create a branch node
    split the dataset
    for each subset of the split:
        call createTree() recursively and add the result to the branch node
    return the branch node
The most-frequent-class function
import operator

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Sort the labels by count in descending order and return the most frequent one
    # (classCount.iteritems() is Python 2; use items() in Python 3)
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
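For example:

print(majorityCnt(['yes', 'no', 'no']))  # 'no'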
Building the tree
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # if every item in the dataset has the same class label: return that label
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # if all features have been used up: return the most frequent class label
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # find the best feature to split the dataset on
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # create a branch node
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    # split the dataset
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    # for each subset of the split:
    for value in uniqueVals:
        # call createTree() recursively and add the result to the branch node
        sublabels = labels[:]  # copy, so recursive calls don't mutate this level's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), sublabels)
    # return the branch node
    return myTree
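Putting it all together on the toy dataset (the feature names below are made up for illustration; note that createTree deletes entries from labels, so pass in a copy if you need the list afterwards):

labels = ['no surfacing', 'flippers']  # hypothetical feature names
myTree = createTree(dataset, labels[:])
print(myTree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

The nested dictionaries mirror the tree: each top-level key of a sub-dict is a feature name, and each value of that feature maps either to a class label (a leaf) or to another sub-dict (a further split).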