▶ 朴素贝叶斯方法来进行分类
● 代码
1 import numpy as np 2 import matplotlib.pyplot as plt 3 from mpl_toolkits.mplot3d import Axes3D 4 from mpl_toolkits.mplot3d.art3d import Poly3DCollection 5 from matplotlib.patches import Rectangle 6 import operator 7 import warnings 8 9 warnings.filterwarnings("ignore") 10 dataSize = 10000 11 trainRatio = 0.3 12 13 def dataSplit(x, y, part): # 将数据集按给定索引分为两段 14 return x[:part], y[:part],x[part:],y[part:] 15 16 def myColor(x): # 颜色函数,用于对散点染色 17 r = np.select([x < 1/2, x < 3/4, x <= 1, True],[0, 4 * x - 2, 1, 0]) 18 g = np.select([x < 1/4, x < 3/4, x <= 1, True],[4 * x, 1, 4 - 4 * x, 0]) 19 b = np.select([x < 1/4, x < 1/2, x <= 1, True],[1, 2 - 4 * x, 0, 0]) 20 return [r**2,g**2,b**2] 21 22 def createData(dim, option, kind, count = dataSize): # 创建数据集,给定属性维度,每属性取值数,类别数,样本数 23 np.random.seed(103) 24 X = np.random.randint(option, size = [count, dim]) 25 if kind == 2: 26 Y = ((3 - 2 * dim) * X[:,0] + 2 * np.sum(X[:,1:], 1) > 0.5).astype(int) 27 else: 28 randomVector = np.random.rand(dim) 29 randomVector /= np.sum(randomVector) 30 Y = (np.sum(X * randomVector,1) * kind / option).astype(int) # 各类别不够均匀 31 #print(output) 32 print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count)) 33 kindCount = np.zeros(kind ,dtype = int) # 各类别的占比 34 for i in range(count): 35 kindCount[Y[i]] += 1 36 for i in range(kind): 37 print("kind %d -> %4f"%(i, kindCount[i]/count)) 38 return X, Y 39 40 def naïveBayes(dataX, dataY , λ = 1): # λ 为平滑系数,0 时为极大似然估计,1 时为拉普拉斯平滑 41 count, dim = np.shape(dataX) 42 option = len(set(dataX[:,0])) # 这里默认所有属性的取值个数相同 43 kind = len(set(dataY)) # 实际可以改进为一个列表来记录 44 pY = np.zeros(kind) # 先验概率表计数 45 for i in dataY: 46 pY[i] += 1 47 pX = np.zeros([kind, dim, option]) # 条件概率表,kind 行 dim 列(和 pY 能直接相乘)option 格,小格内为该属性各取值的计数 48 for i in range(count): # 计数 49 for col in range(dim): 50 pX[dataY[i], col, dataX[i, col]] += 1 51 for row in range(kind): # 逐小格归一化,用到了 pY 计数 52 for col in range(dim): 53 pX[row,col,:] = (pX[row,col,:] + λ)/(pY[row] + option * 
λ) 54 55 pY = (pY + λ) / (np.sum(pY) + kind * λ) # 先验概率表归一化 56 return pX, pY 57 58 def judge(x, para): # 计算样本落入每种类别的后验概率,选择概率最大的那项 59 table = para[1].copy() # 艹,必须用浅拷贝不能用等号,否则 para[1] 会随着 table 变化 60 for i in range(len(x)): 61 table *= para[0][:, i, x[i]] 62 return np.argmax(table) 63 64 def test(dim, option, kind): 65 allX, allY = createData(dim, option, kind) 66 trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio)) 67 para = naïveBayes(trainX, trainY) 68 myResult = [ judge(testX[i], para) for i in range(len(testX)) ] # 存放测试结果 69 70 errorRatio = np.sum((np.array(myResult) != testY).astype(int)) / (dataSize * (1 - trainRatio)) # 计算分类错误率 71 print("errorRatio = %4f"%errorRatio) # 离散取值,不画图了 72 73 if __name__ == '__main__': 74 test(1, 2, 2) # 1 属性 2 取值,分 2 类 75 76 test(2, 3, 2) 77 test(2, 3, 4) 78 79 test(3, 3, 2) 80 test(3, 4, 5) 81 82 test(4, 3, 2) 83 test(4, 5, 6) 84 85 test(5, 3, 6)
● 输出结果。速度非常快。可见随着属性数和每个属性取值数的递增,分类错误率总体呈上升趋势(但并非严格单调,例如 dim = 5 时的错误率反而低于 dim = 4 的情形)。
dim = 1, option = 2, kind = 2, dataSize = 10000 kind 0 -> 0.498600 kind 1 -> 0.501400 errorRatio = 0.000000 dim = 2, option = 3, kind = 2, dataSize = 10000 kind 0 -> 0.446700 kind 1 -> 0.553300 errorRatio = 0.000000 dim = 2, option = 3, kind = 4, dataSize = 10000 kind 0 -> 0.336700 kind 1 -> 0.330100 kind 2 -> 0.333200 kind 3 -> 0.000000 errorRatio = 0.000000 dim = 3, option = 3, kind = 2, dataSize = 10000 kind 0 -> 0.444400 kind 1 -> 0.555600 errorRatio = 0.071714 dim = 3, option = 4, kind = 5, dataSize = 10000 kind 0 -> 0.144300 kind 1 -> 0.430700 kind 2 -> 0.315600 kind 3 -> 0.109400 kind 4 -> 0.000000 errorRatio = 0.205000 dim = 4, option = 3, kind = 2, dataSize = 10000 kind 0 -> 0.452500 kind 1 -> 0.547500 errorRatio = 0.070429 dim = 4, option = 5, kind = 6, dataSize = 10000 kind 0 -> 0.056700 kind 1 -> 0.269100 kind 2 -> 0.415800 kind 3 -> 0.229900 kind 4 -> 0.028500 kind 5 -> 0.000000 errorRatio = 0.310714 dim = 5, option = 3, kind = 6, dataSize = 10000 kind 0 -> 0.096400 kind 1 -> 0.399900 kind 2 -> 0.408000 kind 3 -> 0.092300 kind 4 -> 0.003400 kind 5 -> 0.000000 errorRatio = 0.190857