▶ Using a decision tree to classify scatter points. Much of the code on blogs is copied straight from Machine Learning in Action, e.g. 【https://www.cnblogs.com/jishengyu/p/8274991.html】: no test function, no test of how the tree generalizes, no default values on nodes, and the tree itself is even wrong. Once you increase the number of attributes, attribute values, or classes, some samples simply cannot be classified.
● A simple decision tree. This is old code: it gets the job done, but there is no pre- or post-pruning, no handling of continuous or missing values, X and Y are not separated (one list mixes data types), the plogp function is inefficient, plus assorted problems that are not quite bugs. I also have no idea why the comments are gone. Pretty terrible.
import numpy as np
import operator
import warnings

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def dataSplit(data, part):
    return data[0:part], data[part:]

def createData(dim, option, kind, len):
    np.random.seed(103)
    temp = np.zeros([len, dim + 1])

    for i in range(dim):
        temp[:, i] = np.random.randint(option, size = len)
    if kind == 2:
        temp[:, dim] = list(map(lambda x: int(x > 1), (3 - 2 * dim) * temp[:, 0] + 2 * np.sum(temp[:, 1:dim], 1)))
    else:
        temp[:, dim] = list(map(lambda x: int(x > 1), (3 - 2 * dim) * temp[:, 0] + 2 * np.sum(temp[:, 1:dim], 1)))

    output = [ line[0:dim].tolist() + [str(line[-1] == 1.0)] for line in temp ]
    label = [ chr(i + 65) for i in range(dim) ]

    print("dim = %d, option = %d, kind = %d, dataSize = %d, class1Ratio = %4f"%(dim, option, kind, dataSize, (np.sum(temp[:,-1]) / len)))
    return output, label

def plogp(x, isScalar):
    output = x * np.log2(x)
    if isScalar:
        return [0 if np.isnan(output) else output][0]
    output[np.isnan(output)] = 0.0
    return output

def calculateGain(table, alpha = 0):
    sumC = np.sum(table, 0)
    sumR = np.sum(table, 1)
    sumA = np.sum(sumC)
    temp = -( np.sum(plogp(sumC,False)) - plogp(sumA,True) - np.sum(plogp(table,False)) + np.sum(plogp(sumR,False)) ) / sumA
    if alpha == 0:
        return temp
    elif alpha == 1:
        return temp * (-1.0 / np.sum(plogp(sumR / sumA,False)))
    else:
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 0) / (sumR * sumR))) )

def chooseFeature(data, label):
    size, dim = np.shape(data)
    dim -= 1
    maxGain = 0
    maxi = -1
    kindTable = list(set([ data[j][-1] for j in range(size) ]))
    for i in range(dim):
        valueTable = list(set([ data[j][i] for j in range(size) ]))
        table = np.zeros([len(valueTable), len(kindTable)])
        for line in data:
            table[valueTable.index(line[i]), kindTable.index(line[-1])] += 1
        gain = calculateGain(table)
        if (gain > maxGain):
            maxGain = gain
            maxi = i
    valueTable = list(set([ data[j][maxi] for j in range(size) ]))
    return (maxi, valueTable)

def vote(kindList):
    kindCount = {}
    for i in kindList:
        if i not in kindCount.keys():
            kindCount[i] = 0
        kindCount[i] += 1
    output = sorted(kindCount.items(), key = operator.itemgetter(1), reverse = True)
    return output[0][0]

def createTree(data, label):
    if data == []:
        return '?'
    if len(data[0]) == 1:
        return vote([ line[-1] for line in data ])
    classList = set([ i[-1] for i in data ])
    if len(classList) == 1:
        return list(classList)[0]

    bestFeature, valueTable = chooseFeature(data, label)
    bestLabel = label[bestFeature]
    myTree = {bestLabel:{}}
    myTree[bestLabel]['Fuck'] = vote([ line[-1] for line in data ])

    subLabel = label[:bestFeature] + label[bestFeature + 1:]
    for value in valueTable:
        childData = []
        for line in data:
            if line[bestFeature] == value:
                childData.append(line[:bestFeature] + line[bestFeature + 1:])
        myTree[bestLabel][value] = createTree(childData, subLabel)
    return myTree

def test(dim, option, kind):
    allData, labelName = createData(dim, option, kind, dataSize)
    trainData, testData = dataSplit(allData, int(dataSize * trainRatio))
    outputTree = createTree(trainData, labelName)

    myResult = []

    for line in testData:
        tempTree = outputTree
        while(True):
            judgeName = list(tempTree)[0]
            judgeValue = list(tempTree.values())[0]
            value = line[labelName.index(judgeName)]
            if value not in judgeValue:
                myResult.append(judgeValue['Fuck'])
                break
            resultNow = judgeValue[value]
            if type(resultNow) == type(allData[0][-1]):
                myResult.append(resultNow)
                break
            tempTree = resultNow

    print("errorRatio = %4f"%( sum(map(lambda x,y: int(x != y[-1]), myResult, testData)) / (dataSize * (1 - trainRatio)) ))

if __name__ == '__main__':
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)
● Refactored code, usable as a template from here on.
import numpy as np
import operator
import warnings

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def dataSplit(x, y, part):  # split the dataset in two at the given index
    return x[:part], y[:part], x[part:], y[part:]

def myColor(x):  # color function, used to color the scatter points
    r = np.select([x < 1/2, x < 3/4, x <= 1, True], [0, 4 * x - 2, 1, 0])
    g = np.select([x < 1/4, x < 3/4, x <= 1, True], [4 * x, 1, 4 - 4 * x, 0])
    b = np.select([x < 1/4, x < 1/2, x <= 1, True], [1, 2 - 4 * x, 0, 0])
    return [r, g, b]

def createData(dim, option, kind, count = dataSize):  # build the dataset: attribute dimension, values per attribute, number of classes, number of samples
    np.random.seed(103)
    X = np.random.randint(option, size = [count, dim])
    if kind == 2:
        Y = ((3 - 2 * dim) * X[:,0] + 2 * np.sum(X[:,1:], 1) > 0.5).astype(int)  # kept separate for easier inspection; could be merged into the else branch
    else:
        randomVector = np.random.rand(dim)
        randomVector /= np.sum(randomVector)
        Y = (np.sum(X * randomVector, 1) * kind / option).astype(int)  # the classes are not very balanced
    label = [ chr(i + 65) for i in range(dim) ]  # attribute names: 'A', 'B', 'C', ...

    #print(output)
    print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count))
    kindCount = np.zeros(kind, dtype = int)  # proportion of each class
    for i in range(count):
        kindCount[Y[i]] += 1
    for i in range(kind):
        print("kind %d -> %4f"%(i, kindCount[i] / count))
    return X, Y, label

def plogp(x):  # entropy terms, turning nan into 0
    return np.nan_to_num(x * np.log2(x))

def calculateGain(table, alpha = 0):  # compute the gain
    sumC = np.sum(table, 0)
    sumR = np.sum(table, 1)
    sumA = np.sum(sumC)
    if alpha == 0:  # information gain, simplified formula
        return -( np.sum(plogp(sumC)) + np.sum(plogp(sumR)) - np.sum(plogp(table)) - plogp(sumA) ) / sumA
    elif alpha == 1:  # gain ratio
        return (np.sum(plogp(sumC)) - plogp(sumA)) / (np.sum(plogp(sumR)) - plogp(sumA)) - 1
    else:  # reciprocal of the Gini index (smaller Gini is better)
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 0) / (sumR * sumR))) )

def chooseFeature(dataX, dataY, label):  # choose the best attribute
    count, dim = np.shape(dataX)
    maxGain = 0
    maxi = -1
    kindTable = list(set(dataY))  # table of classes
    for i in range(dim):
        valueTable = list(set([ dataX[j][i] for j in range(count) ]))  # table of attribute values
        table = np.zeros([len(valueTable), len(kindTable)])  # table used to compute entropy
        for j in range(count):
            table[valueTable.index(dataX[j][i]), kindTable.index(dataY[j])] += 1
        gain = calculateGain(table)  # compute the gain, track the attribute with the largest one
        if (gain > maxGain):
            maxGain = gain
            maxi = i
    valueTable = list(set([ dataX[j][maxi] for j in range(count) ]))
    return (maxi, valueTable)

def vote(kindList):  # vote according to the class list
    kindCount = {}
    for i in kindList:
        if i not in kindCount.keys():
            kindCount[i] = 0
        kindCount[i] += 1
    output = sorted(kindCount.items(), key = operator.itemgetter(1), reverse = True)
    return output[0][0]

def createTree(dataX, dataY, label):  # build a decision tree from the current data
    #if dataX == []:  # empty-data case; value and valueTable below prevent entering an empty subtree, so this branch is never taken
    #    return '?'
    if len(dataX[0]) == 0:  # attributes exhausted, vote on the remaining data
        return vote(dataY)
    if len(set(dataY)) == 1:  # all elements belong to one class, return that class
        return dataY[0]

    bestFeature, valueTable = chooseFeature(dataX, dataY, label)  # get the best attribute's index and its value table
    bestLabel = label[bestFeature]
    myTree = {bestLabel:{}}  # current node
    myTree[bestLabel]['Fuck'] = vote(dataY)  # the node's default value, used when a sample's value falls in no subtree

    subLabel = label[:bestFeature] + label[bestFeature + 1:]  # remove the chosen attribute from the attribute table
    for value in valueTable:
        childX = []
        childY = []
        for i in range(len(dataX)):
            line = dataX[i]
            if line[bestFeature] == value:  # partition the data by the best attribute's values and build each subtree
                childX.append(np.concatenate([line[:bestFeature], line[bestFeature + 1:]]))
                childY.append(dataY[i])
        myTree[bestLabel][value] = createTree(childX, childY, subLabel)
    return myTree  # return the current node

def test(dim, option, kind):
    allX, allY, labelName = createData(dim, option, kind)
    trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio))
    outputTree = createTree(trainX, trainY, labelName)

    myResult = []  # holds the test results

    for line in testX:
        tempTree = outputTree  # walk down the decision tree
        while(True):
            judgeName = list(tempTree)[0]  # attribute name at the current node
            judgeValue = list(tempTree.values())[0]  # value table of the current node
            value = line[labelName.index(judgeName)]  # the sample's value for this attribute
            if value not in judgeValue:  # value not in the table, use the node's default value
                myResult.append(judgeValue['Fuck'])
                break
            resultNow = judgeValue[value]
            if type(resultNow) == type(testY[0]):  # reached a leaf, return its class as the result
                myResult.append(resultNow)
                break
            tempTree = resultNow

    errorRatio = np.sum((np.array(myResult) != testY).astype(int)) / (dataSize * (1 - trainRatio))  # classification error rate
    print("errorRatio = %4f"%errorRatio)

if __name__ == '__main__':
    test(1, 2, 2)
    test(1, 3, 2)
    test(2, 2, 2)
    test(2, 3, 2)
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)
    test(3, 3, 3)
    test(4, 4, 3)
    test(5, 4, 4)
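● Neither version handles the continuous values mentioned above. For the record, one standard approach is the C4.5-style binary split: sort the attribute, try the midpoints between adjacent distinct values, and keep the threshold with minimal weighted entropy. A minimal self-contained sketch of mine (bestThreshold is not part of the template):

import numpy as np

def bestThreshold(x, y):  # x: one continuous attribute, y: class labels
    def entropy(labels):
        _, counts = np.unique(labels, return_counts = True)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p))
    order = np.argsort(x)
    x, y = x[order], y[order]
    best, bestH = None, np.inf
    for i in range(1, len(x)):
        if x[i] == x[i - 1]:       # only split between distinct values
            continue
        t = (x[i] + x[i - 1]) / 2  # candidate threshold: the midpoint
        h = (i * entropy(y[:i]) + (len(y) - i) * entropy(y[i:])) / len(y)
        if h < bestH:              # keep the split with minimal weighted entropy
            best, bestH = t, h
    return best

x = np.random.rand(200)
y = (x > 0.6).astype(int)
print(bestThreshold(x, y))         # should land near 0.6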
● The table used to compute entropy: the row index is the attribute value, the column index is the class, and each entry n is the sample frequency for that attribute value and class; sumC and sumR are the column sums and row sums, and sumA is the sum over all entries.
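For reference (my derivation, but it follows directly from the code): write $f(x) = x\log_2 x$ (the plogp above), let $n_{ij}$ be the count for attribute value $i$ and class $j$, and let $R_i=\sum_j n_{ij}$ (sumR), $C_j=\sum_i n_{ij}$ (sumC), $N=\sum_{i,j}n_{ij}$ (sumA). Expanding $H(Y)-H(Y\mid A)$ and collecting the logarithms gives the simplified alpha = 0 expression in calculateGain:

$$ \mathrm{Gain} = H(Y) - H(Y\mid A) = -\frac{1}{N}\Bigl(\sum_j f(C_j) + \sum_i f(R_i) - \sum_{i,j} f(n_{ij}) - f(N)\Bigr) $$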
● The decision tree output with a total data size of 100 (indentation adjusted by hand for readability):
{'0': {'Fuck': 'False',
       0.0: {'1': {'Fuck': 'True',
                   0.0: {'2': {'Fuck': 'True',
                               0.0: {'3': {'Fuck': 'True',
                                           0.0: 'False',
                                           2.0: 'True' } },
                               1.0: 'True' } },
                   1.0: 'True',
                   2.0: 'True' } },
       1.0: {'1': {'Fuck': 'False',
                   0.0: 'False',
                   2.0: 'True' } },
       2.0: 'False' } }
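Rather than re-indenting by hand, a small recursive printer would do the job; a minimal sketch (printTree is my own helper, not part of the code above):

def printTree(node, indent = 0):
    # recursively pretty-print the nested dict produced by createTree
    if not isinstance(node, dict):       # leaf: a class label
        print(" " * indent + str(node))
        return
    for key, value in node.items():      # internal node: attribute name or value branch
        print(" " * indent + str(key) + ":")
        printTree(value, indent + 4)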
● Output from running the code directly. As the dimension rises, the error rate rises with it. The plots are just scatter plots of integer grid points, so they are omitted.
dim = 1, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.498600
kind 1 -> 0.501400
errorRatio = 0.000000
dim = 1, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.339500
kind 1 -> 0.660500
errorRatio = 0.000000
dim = 2, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.497400
kind 1 -> 0.502600
errorRatio = 0.000000
dim = 2, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.446700
kind 1 -> 0.553300
errorRatio = 0.000000
dim = 3, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.444400
kind 1 -> 0.555600
errorRatio = 0.000000
dim = 4, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.452500
kind 1 -> 0.547500
errorRatio = 0.000000
dim = 5, option = 4, kind = 2, dataSize = 10000
kind 0 -> 0.468000
kind 1 -> 0.532000
errorRatio = 0.005857
dim = 6, option = 5, kind = 2, dataSize = 10000
kind 0 -> 0.464400
kind 1 -> 0.535600
errorRatio = 0.075286
dim = 3, option = 3, kind = 3, dataSize = 10000
kind 0 -> 0.485300
kind 1 -> 0.476600
kind 2 -> 0.038100
errorRatio = 0.000000
dim = 4, option = 4, kind = 3, dataSize = 10000
kind 0 -> 0.405300
kind 1 -> 0.568100
kind 2 -> 0.026600
errorRatio = 0.000000
dim = 5, option = 4, kind = 4, dataSize = 10000
kind 0 -> 0.207800
kind 1 -> 0.584400
kind 2 -> 0.207200
kind 3 -> 0.000600
errorRatio = 0.007571
● Using a library instead: sklearn's tree module, wrapped so heavily that it's unrecognizable. The dataset is exactly the same as in the code above.
import numpy as np
import warnings
from sklearn import tree
from sklearn.metrics import classification_report

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def test(dim, option, kind, count = dataSize):
    np.random.seed(103)
    X = np.random.randint(option, size = [count, dim])
    randomVector = np.random.rand(dim)
    randomVector /= np.sum(randomVector)
    Y = (np.sum(X * randomVector, 1) * kind / option).astype(int)  # the classes are not very balanced
    print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count))
    kindCount = np.zeros(kind, dtype = int)  # proportion of each class
    for i in range(count):
        kindCount[Y[i]] += 1
    for i in range(kind):
        print("kind %d -> %4f"%(i, kindCount[i] / count))

    trainSize = int(dataSize * trainRatio)
    testSize = dataSize - trainSize

    trainX = X[:trainSize]
    trainY = Y[:trainSize]
    clf = tree.DecisionTreeClassifier()  # create the decision tree
    clf = clf.fit(trainX, trainY)        # train

    testX = X[trainSize:]
    testY = Y[trainSize:]
    myResult = clf.predict(testX)        # classify the test data

    print("errorRatio = %4f, accuracy = %4f"%( sum(map(lambda x,y: int(x != y), myResult, testY)) / testSize, clf.score(testX, testY) ))
    # my error rate and sklearn's built-in accuracy; they always sum to 1
    print(classification_report(myResult, testY, target_names = [ chr(i + 65) for i in range(kind) ]))
    # sklearn's built-in evaluation report

if __name__ == '__main__':
    test(1, 2, 2)
    test(1, 3, 2)
    test(2, 2, 2)
    test(2, 3, 2)
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)
    test(3, 3, 3)
    test(4, 4, 3)
    test(5, 4, 4)
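● Even wrapped this heavily, sklearn will still show you what it learned: tree.export_text (available in newer sklearn versions) renders the fitted tree as indented if/else rules, roughly the readable counterpart of the dict tree printed earlier. A minimal sketch on a made-up toy dataset of the same shape:

import numpy as np
from sklearn import tree

np.random.seed(103)
X = np.random.randint(3, size = [100, 3])
Y = (X[:, 0] + X[:, 1] > 2).astype(int)

clf = tree.DecisionTreeClassifier().fit(X, Y)
# print the learned tree as indented threshold rules
print(tree.export_text(clf, feature_names = ['A', 'B', 'C']))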
● Output. It feels faster than my implementation, and the accuracy is a bit higher too.
dim = 1, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.498600
kind 1 -> 0.501400
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3496
           B       1.00      1.00      1.00      3504

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 1, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.672700
kind 1 -> 0.327300
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      4739
           B       1.00      1.00      1.00      2261

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 2, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.747900
kind 1 -> 0.252100
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5207
           B       1.00      1.00      1.00      1793

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 2, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.666800
kind 1 -> 0.333200
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      4652
           B       1.00      1.00      1.00      2348

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 3, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.776400
kind 1 -> 0.223600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5448
           B       1.00      1.00      1.00      1552

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 4, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.853400
kind 1 -> 0.146600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5950
           B       1.00      1.00      1.00      1050

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 5, option = 4, kind = 2, dataSize = 10000
kind 0 -> 0.792200
kind 1 -> 0.207800
errorRatio = 0.003429, accuracy = 0.996571
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5560
           B       0.99      0.99      0.99      1440

    accuracy                           1.00      7000
   macro avg       0.99      1.00      0.99      7000
weighted avg       1.00      1.00      1.00      7000

dim = 6, option = 5, kind = 2, dataSize = 10000
kind 0 -> 0.775900
kind 1 -> 0.224100
errorRatio = 0.065000, accuracy = 0.935000
              precision    recall  f1-score   support

           A       0.96      0.96      0.96      5447
           B       0.85      0.86      0.85      1553

    accuracy                           0.94      7000
   macro avg       0.90      0.91      0.91      7000
weighted avg       0.94      0.94      0.94      7000

dim = 3, option = 3, kind = 3, dataSize = 10000
kind 0 -> 0.485300
kind 1 -> 0.476600
kind 2 -> 0.038100
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3403
           B       1.00      1.00      1.00      3329
           C       1.00      1.00      1.00       268

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 4, option = 4, kind = 3, dataSize = 10000
kind 0 -> 0.405300
kind 1 -> 0.568100
kind 2 -> 0.026600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      2840
           B       1.00      1.00      1.00      3967
           C       1.00      1.00      1.00       193

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 5, option = 4, kind = 4, dataSize = 10000
kind 0 -> 0.207800
kind 1 -> 0.584400
kind 2 -> 0.207200
kind 3 -> 0.000600
errorRatio = 0.005714, accuracy = 0.994286
              precision    recall  f1-score   support

           A       0.99      1.00      0.99      1447
           B       1.00      0.99      1.00      4113
           C       0.99      0.99      0.99      1438
           D       1.00      1.00      1.00         2

    accuracy                           0.99      7000
   macro avg       0.99      1.00      1.00      7000
weighted avg       0.99      0.99      0.99      7000