• Statistical Learning Methods (《统计学习方法》), Chapter 4: Naïve Bayes


    ▶ Classification with the naïve Bayes method
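
    ● For quick reference, the decision rule and the smoothed estimates that the code below computes can be written as follows. The shorthand is mine: N is the number of training samples, N_k the count of class c_k, N_{k,j,a} the count of attribute j taking value a within class c_k, S_j the number of values attribute j can take, and K the number of classes (these correspond to count, the pY counts, the pX counts, option, and kind in the code); λ = 0 gives maximum likelihood estimation, λ = 1 gives Laplace smoothing.

    y = \arg\max_{c_k} \; P_\lambda(Y = c_k) \prod_{j=1}^{n} P_\lambda\!\left(X^{(j)} = x^{(j)} \mid Y = c_k\right)

    P_\lambda(Y = c_k) = \frac{N_k + \lambda}{N + K\lambda},
    \qquad
    P_\lambda\!\left(X^{(j)} = a \mid Y = c_k\right) = \frac{N_{k,j,a} + \lambda}{N_k + S_j\,\lambda}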

    ● Code

    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    from matplotlib.patches import Rectangle
    import operator
    import warnings

    warnings.filterwarnings("ignore")
    dataSize = 10000
    trainRatio = 0.3

    def dataSplit(x, y, part):                                                          # split the data set into two parts at the given index
        return x[:part], y[:part], x[part:], y[part:]

    def myColor(x):                                                                     # color function for scatter points (unused here, since no plot is drawn)
        r = np.select([x < 1/2, x < 3/4, x <= 1, True], [0, 4 * x - 2, 1, 0])
        g = np.select([x < 1/4, x < 3/4, x <= 1, True], [4 * x, 1, 4 - 4 * x, 0])
        b = np.select([x < 1/4, x < 1/2, x <= 1, True], [1, 2 - 4 * x, 0, 0])
        return [r**2, g**2, b**2]

    def createData(dim, option, kind, count = dataSize):                                # create a data set given attribute dimension, values per attribute, number of classes, and sample count
        np.random.seed(103)
        X = np.random.randint(option, size = [count, dim])
        if kind == 2:
            Y = ((3 - 2 * dim) * X[:,0] + 2 * np.sum(X[:,1:], 1) > 0.5).astype(int)
        else:
            randomVector = np.random.rand(dim)
            randomVector /= np.sum(randomVector)
            Y = (np.sum(X * randomVector, 1) * kind / option).astype(int)               # the classes are not very balanced
        print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count))
        kindCount = np.zeros(kind, dtype = int)                                         # count of each class, for printing proportions
        for i in range(count):
            kindCount[Y[i]] += 1
        for i in range(kind):
            print("kind %d -> %4f"%(i, kindCount[i]/count))
        return X, Y

    def naïveBayes(dataX, dataY, λ = 1):                # λ is the smoothing coefficient: 0 gives maximum likelihood estimation, 1 gives Laplace smoothing
        count, dim = np.shape(dataX)
        option = len(set(dataX[:,0]))                   # assumes every attribute takes the same number of values
        kind = len(set(dataY))                          # could be improved by keeping a per-attribute list instead
        pY = np.zeros(kind)                             # prior probability table, first used to hold counts
        for i in dataY:
            pY[i] += 1
        pX = np.zeros([kind, dim, option])              # conditional probability table: kind rows, dim columns (so it multiplies directly with pY), option cells per column, each cell holding the count of that attribute value
        for i in range(count):                          # count occurrences
            for col in range(dim):
                pX[dataY[i], col, dataX[i, col]] += 1
        for row in range(kind):                         # normalize cell by cell, using the pY counts
            for col in range(dim):
                pX[row,col,:] = (pX[row,col,:] + λ) / (pY[row] + option * λ)

        pY = (pY + λ) / (np.sum(pY) + kind * λ)         # normalize the prior probability table
        return pX, pY

    def judge(x, para):                                 # compute the posterior probability of each class for a sample and pick the largest
        table = para[1].copy()                          # must use a copy rather than plain assignment, otherwise para[1] changes along with table
        for i in range(len(x)):
            table *= para[0][:, i, x[i]]
        return np.argmax(table)

    def test(dim, option, kind):
        allX, allY = createData(dim, option, kind)
        trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio))
        para = naïveBayes(trainX, trainY)
        myResult = [ judge(testX[i], para) for i in range(len(testX)) ]                 # predicted labels on the test set

        errorRatio = np.sum((np.array(myResult) != testY).astype(int)) / (dataSize * (1 - trainRatio)) # classification error rate
        print("errorRatio = %4f"%errorRatio)                                            # discrete attribute values, so no plot is drawn

    if __name__ == '__main__':
        test(1, 2, 2)                                   # 1 attribute with 2 values, 2 classes

        test(2, 3, 2)
        test(2, 3, 4)

        test(3, 3, 2)
        test(3, 4, 5)

        test(4, 3, 2)
        test(4, 5, 6)

        test(5, 3, 6)
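
    ● A caveat not in the original post: judge multiplies the per-attribute conditional probabilities directly, which can underflow once dim grows large. A minimal log-space sketch, reusing the numpy import above and assuming the same (pX, pY) pair returned by naïveBayes (with λ ≥ 1 no entry is zero, so the logarithm is safe):

    def judgeLog(x, para):                              # hypothetical log-space variant of judge
        pX, pY = para
        logTable = np.log(pY)                           # start from the log prior
        for i in range(len(x)):
            logTable += np.log(pX[:, i, x[i]])          # add the log conditional probability of attribute i
        return np.argmax(logTable)                      # argmax is unchanged by the monotone log transform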

    ● Output. It runs very fast; as the number of attributes and the number of values per attribute increase, the classification error rate rises quickly.

    dim = 1, option = 2, kind = 2, dataSize = 10000
    kind 0 -> 0.498600
    kind 1 -> 0.501400
    errorRatio = 0.000000
    dim = 2, option = 3, kind = 2, dataSize = 10000
    kind 0 -> 0.446700
    kind 1 -> 0.553300
    errorRatio = 0.000000
    dim = 2, option = 3, kind = 4, dataSize = 10000
    kind 0 -> 0.336700
    kind 1 -> 0.330100
    kind 2 -> 0.333200
    kind 3 -> 0.000000
    errorRatio = 0.000000
    dim = 3, option = 3, kind = 2, dataSize = 10000
    kind 0 -> 0.444400
    kind 1 -> 0.555600
    errorRatio = 0.071714
    dim = 3, option = 4, kind = 5, dataSize = 10000
    kind 0 -> 0.144300
    kind 1 -> 0.430700
    kind 2 -> 0.315600
    kind 3 -> 0.109400
    kind 4 -> 0.000000
    errorRatio = 0.205000
    dim = 4, option = 3, kind = 2, dataSize = 10000
    kind 0 -> 0.452500
    kind 1 -> 0.547500
    errorRatio = 0.070429
    dim = 4, option = 5, kind = 6, dataSize = 10000
    kind 0 -> 0.056700
    kind 1 -> 0.269100
    kind 2 -> 0.415800
    kind 3 -> 0.229900
    kind 4 -> 0.028500
    kind 5 -> 0.000000
    errorRatio = 0.310714
    dim = 5, option = 3, kind = 6, dataSize = 10000
    kind 0 -> 0.096400
    kind 1 -> 0.399900
    kind 2 -> 0.408000
    kind 3 -> 0.092300
    kind 4 -> 0.003400
    kind 5 -> 0.000000
    errorRatio = 0.190857
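
    ● As a sanity check (not part of the original post), the same data can be run through scikit-learn's CategoricalNB, whose alpha parameter plays the role of λ. The sketch below is a hypothetical helper that assumes scikit-learn is installed and reuses numpy, createData, dataSplit, dataSize, and trainRatio from the code above:

    from sklearn.naive_bayes import CategoricalNB

    def testSklearn(dim, option, kind):                 # hypothetical cross-check helper
        allX, allY = createData(dim, option, kind)
        trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio))
        clf = CategoricalNB(alpha = 1.0)                # alpha = 1.0 corresponds to Laplace smoothing (λ = 1)
        clf.fit(trainX, trainY)
        errorRatio = np.mean(clf.predict(testX) != testY)
        print("sklearn errorRatio = %4f" % errorRatio)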
  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/11166919.html