• 【机器学习实战】--第五章Logistic回归完整代码及注释


    可参考博客:

    https://blog.csdn.net/rujin_shi/article/details/78997271?utm_medium=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase&depth_1-utm_source=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase

    完整代码如下:

      1 import numpy as np
      2 import matplotlib.pyplot as plt
      3 # 参考https://blog.csdn.net/rujin_shi/article/details/78997271?utm_medium=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase&depth_1-utm_source=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase
      4 
      def loadDataSet(filename='../machinelearninginaction/Ch05/testSet.txt'):
          """Load the two-feature demo data set used for logistic regression.

          Each non-blank line of the file must hold three whitespace-separated
          numbers: feature X1, feature X2 and the class label.

          Args:
              filename: Path of the text file to read. Defaults to the location
                  the original script used.

          Returns:
              A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
              ``[1.0, X1, X2]`` rows (the leading 1.0 is the constant X0 term
              that multiplies the intercept weight); ``labelMat`` is the list
              of labels converted to float.

          Raises:
              OSError: If the file cannot be opened.
              ValueError, IndexError: If a non-blank line is malformed.
          """
          dataMat = []
          labelMat = []
          # The context manager closes the file even if parsing raises.
          with open(filename) as fr:
              for line in fr:
                  lineArr = line.split()
                  if not lineArr:
                      continue  # tolerate blank lines such as a trailing newline
                  dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
                  labelMat.append(float(lineArr[2]))
          return dataMat, labelMat
     14 
     15 
     def sigmoid(inX):
         """Return the logistic function 1 / (1 + exp(-inX)), element-wise.

         The value is computed as exp(-log(1 + exp(-inX))) via ``np.logaddexp``,
         which is mathematically identical to the textbook formula but does not
         overflow (and therefore emits no RuntimeWarning) when ``inX`` is a
         large negative number.

         Args:
             inX: A scalar, ``numpy.ndarray`` or ``numpy.matrix``.

         Returns:
             The sigmoid of ``inX`` with the same shape/container type NumPy
             ufuncs produce for the input (NumPy scalar, ndarray or matrix).
         """
         return np.exp(-np.logaddexp(0, -inX))
     18 
     19 
     def gradAscent(dataMatIn, classLabels, alpha=0.01, maxCycles=500):
         """Fit logistic-regression weights with batch gradient ascent.

         Args:
             dataMatIn: 2-D sequence of samples, one row per sample.
             classLabels: Sequence with one numeric label per sample.
             alpha: Step size of each update (default 0.01).
             maxCycles: Number of full-batch updates to perform (default 500).

         Returns:
             An ``n x 1`` ``numpy.matrix`` of weights, where ``n`` is the number
             of columns of ``dataMatIn``. Call ``.getA()`` on it to obtain a
             plain ndarray.
         """
         # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
         dataMatrix = np.asmatrix(dataMatIn)
         labelMat = np.asmatrix(classLabels).transpose()  # column vector
         n = np.shape(dataMatrix)[1]
         weights = np.asmatrix(np.ones((n, 1)))  # start every weight at 1
         for _ in range(maxCycles):
             h = sigmoid(dataMatrix * weights)  # predictions for all samples
             # The gradient of the log-likelihood with respect to the weights
             # is X^T (y - h), so the residual drives the ascent direction.
             error = labelMat - h
             weights = weights + alpha * dataMatrix.transpose() * error
         return weights
     32 
     33 
     def stocGradAscent0(dataMatrix, classLabels, alpha=0.01):
         """Fit logistic-regression weights with one pass of stochastic ascent.

         The samples are visited once, in file order, and the weights are
         updated after every single sample.

         Args:
             dataMatrix: 2-D array-like of samples, one row per sample.
             classLabels: Sequence with one numeric label per sample.
             alpha: Step size of each update (default 0.01).

         Returns:
             A 1-D ``numpy.ndarray`` holding one weight per column.
         """
         dataMatrix = np.asarray(dataMatrix, dtype=float)  # also accept lists
         n = dataMatrix.shape[1]
         weights = np.ones(n)
         for i, sample in enumerate(dataMatrix):
             # Dot product = sum of the element-wise products of the original.
             h = sigmoid(np.dot(sample, weights))
             error = classLabels[i] - h
             weights = weights + alpha * error * sample
         return weights
     45 
     46 
     def stocGradAscent1(dataMatrix, classLabels, numIter=150):
         """Fit logistic-regression weights with improved stochastic ascent.

         Two refinements over a single in-order pass: the step size decays as
         training progresses, and within every pass the samples are visited in
         a fresh random order, each exactly once.

         Args:
             dataMatrix: 2-D array-like of samples, one row per sample.
             classLabels: Sequence with one numeric label per sample.
             numIter: Number of passes over the data (default 150).

         Returns:
             A 1-D ``numpy.ndarray`` holding one weight per column.
         """
         dataMatrix = np.asarray(dataMatrix, dtype=float)  # also accept lists
         m, n = dataMatrix.shape
         weights = np.ones(n)
         for j in range(numIter):
             # The previous version drew a position in a shrinking index list
             # but then used that position itself as the row number, so rows
             # were not sampled without replacement and high-index rows were
             # under-represented. A permutation visits every row once per pass.
             for i, sampleIdx in enumerate(np.random.permutation(m)):
                 # Decaying step size; the +0.01 keeps it from reaching zero.
                 alpha = 4 / (1.0 + j + i) + 0.01
                 h = sigmoid(np.dot(dataMatrix[sampleIdx], weights))
                 error = classLabels[sampleIdx] - h
                 weights = weights + alpha * error * dataMatrix[sampleIdx]
         return weights
     60 
     61 
     def classifyVector(inX, weights):
         """Classify one sample with a fitted logistic-regression model.

         The predicted class is 1 when sigmoid(inX . weights) exceeds 0.5.
         Because the sigmoid is strictly increasing and equals 0.5 at zero,
         that is the same as the linear score being positive; testing the sign
         directly avoids evaluating exp() on scores of large magnitude.

         Args:
             inX: 1-D array-like of feature values.
             weights: 1-D array-like of weights, same length as ``inX``.

         Returns:
             ``1.0`` if the sample is assigned to class 1, otherwise ``0.0``.
         """
         score = np.dot(inX, weights)
         return 1.0 if score > 0 else 0.0
     68 
     69 
     def _loadColicFile(path, numFeatures=21):
         """Parse a tab-separated horse-colic file into (features, labels)."""
         features = []
         labels = []
         # The context manager closes the file even if a line fails to parse.
         with open(path) as fr:
             for line in fr:
                 currLine = line.strip().split('\t')
                 if currLine == ['']:
                     continue  # skip blank lines
                 features.append([float(v) for v in currLine[:numFeatures]])
                 # The label sits in the column right after the features.
                 labels.append(float(currLine[numFeatures]))
         return features, labels


     def colicTest(trainFile="../machinelearninginaction/Ch05/horseColicTraining.txt",
                   testFile="../machinelearninginaction/Ch05/horseColicTest.txt",
                   numIter=500):
         """Train on the horse-colic data and report the test error rate.

         Args:
             trainFile: Path of the tab-separated training file (21 feature
                 columns followed by the label column).
             testFile: Path of the test file, same layout.
             numIter: Number of passes given to ``stocGradAscent1``.

         Returns:
             The fraction of test samples that were misclassified; the value is
             also printed.

         Raises:
             OSError: If either file cannot be opened.
             ZeroDivisionError: If the test file contains no samples.
         """
         # Read both files up front so a missing test file fails before the
         # (comparatively slow) training step rather than after it.
         trainingSet, trainingLabels = _loadColicFile(trainFile)
         testSet, testLabels = _loadColicFile(testFile)
         trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, numIter)
         errorCount = 0
         for lineArr, label in zip(testSet, testLabels):
             # Labels are parsed as floats, so "1" and "1.000000" both work.
             if int(classifyVector(np.array(lineArr), trainWeights)) != int(label):
                 errorCount += 1
         errorRate = float(errorCount) / float(len(testSet))
         print('the error rate of this is: %f' % errorRate)
         return errorRate
     95 
     96 
     def multiTest(numTests=10):
         """Run ``colicTest`` repeatedly and print the average error rate.

         Training picks samples at random, so a single run is noisy; averaging
         several runs gives a steadier estimate.

         Args:
             numTests: How many times to train and evaluate (default 10).

         Raises:
             ValueError: If ``numTests`` is smaller than 1.
         """
         if numTests < 1:
             raise ValueError('numTests must be at least 1')
         errorSum = 0.0
         for _ in range(numTests):
             errorSum += colicTest()
         print('after %d iterations the average error rate is: %f' % (numTests, errorSum / float(numTests)))
    102 
    103 
    def plotBestFit(dataArr, weights, labels=None):
        """Scatter-plot both classes and draw the fitted decision boundary.

        Args:
            dataArr: 2-D array-like of ``[X0, X1, X2]`` rows; column 1 is drawn
                on the horizontal axis and column 2 on the vertical axis.
            weights: The three fitted weights. Any array-like that flattens to
                three values is accepted, including an ``n x 1`` matrix.
            labels: One class label per row of ``dataArr``. When omitted, the
                module-level ``labelMat`` is used, which is what this function
                always read before the parameter existed.
        """
        if labels is None:
            # Backward compatibility with callers that rely on the global.
            labels = labelMat
        dataArr = np.array(dataArr)
        w = np.asarray(weights, dtype=float).ravel()
        xcord1 = []; ycord1 = []  # samples of class 1
        xcord2 = []; ycord2 = []  # samples of every other class
        for i, row in enumerate(dataArr):
            if int(labels[i]) == 1:
                xcord1.append(row[1]); ycord1.append(row[2])
            else:
                xcord2.append(row[1]); ycord2.append(row[2])
        fig = plt.figure()
        ax = fig.add_subplot(111)  # 1x1 grid, first (only) subplot
        ax.scatter(xcord1, ycord1, s=30, c='red', marker='^')
        ax.scatter(xcord2, ycord2, s=30, c='green')
        x = np.arange(-3.0, 3.0, 0.1)
        # The boundary is where z = w0*X0 + w1*X1 + w2*X2 equals 0 (sigmoid(z)
        # is then 0.5). With X0 = 1 this gives X2 = (-w0 - w1*X1) / w2.
        y = (-w[0] - w[1] * x) / w[2]
        ax.plot(x, y)
        plt.xlabel('X1'); plt.ylabel('X2')
        plt.show()
    126 
    127 
    if __name__ == '__main__':
        # Demo driver: fit the small two-feature data set with each optimiser,
        # then evaluate the horse-colic classifier.
        dataArr, labelMat = loadDataSet()
        weights_0 = gradAscent(dataArr, labelMat)  # batch gradient ascent
        # Optional visualisation (getA() converts the matrix to an ndarray):
        # plotBestFit(dataArr, weights_0.getA())
        weights_1 = stocGradAscent0(np.array(dataArr), labelMat)  # stochastic ascent
        # plotBestFit(dataArr, weights_1)
        weights_2 = stocGradAscent1(np.array(dataArr), labelMat)  # improved stochastic ascent
        # plotBestFit(dataArr, weights_2)
        multiTest()

    运行结果如下(训练时随机选取样本,因此每次运行得到的错误率会略有不同):

     1 the error rate of this is: 0.402985
     2 the error rate of this is: 0.432836
     3 the error rate of this is: 0.328358
     4 the error rate of this is: 0.373134
     5 the error rate of this is: 0.268657
     6 the error rate of this is: 0.358209
     7 the error rate of this is: 0.343284
     8 the error rate of this is: 0.477612
     9 the error rate of this is: 0.268657
    10 the error rate of this is: 0.298507
    11 after 10 iterations the average error rate is: 0.355224
  • 相关阅读:
    HDU 1257 最少拦截系统(最长递减子序列的条数)
    POJ 2063 Investment 滚动数组+完全背包
    POJ 2392 Space Elevator 贪心+dp
    CodeForces 154A Hometask dp
    CodeForces 57C Array 组合计数+逆元
    hdu 4398 Template Library Management(贪心+stl)
    优先队列详解(转载)
    hdu 4393 Throw nails(优先队列)
    hdu 4022 Bombing(map,multiset)
    hdu 1027 Ignatius and the Princess II(产生第m大的排列,next_permutation函数)
  • 原文地址:https://www.cnblogs.com/DJames23/p/13081692.html
Copyright © 2020-2023  润新知