• 读书笔记:机器学习实战(4)——第五章的逻辑回归代码和个人理解


    个人理解,Logistic回归最根本的理论基础就是利用了Sigmoid函数(或者tanh等),能够较平滑地(相对于单位阶跃函数)表示非0即1,或者-1和1,以及其中间值。这样给样本特征向量的每一个维度(或者说每一种属性、参数)都赋予一个权重系数,所有属性值与其权重的乘积之和作为该样本最终的分类参考值,这个值更偏向于哪一边,样本就被划分为哪一类。
    而Sigmoid的这种非0即1的特点,可以使被错误分为1的样本,在用其标签0减去预测值后,得到一个负数;相对地,标签为1的训练样本被错分为0,也会得到一个正向的纠正参数,以这个参数乘上步长来逐渐地使每个样本的最终分类参考值向其正确的标签类别靠近,最终使通过训练样本训练出的权重系数,能使越来越多的样本被正确分类,误差达到最小,就可以简单地用来预测测试样本了。
    回归预测时,有时实际数据集受相邻点影响较大,可以采用加权回归,按距离远近(时间等)对邻近点的作用进行加权。初学时,建议温习下最小二乘法。
    代码为书中代码,没改动。

    from numpy import *
    import matplotlib.pyplot as plt
    def loadDataSet():
        """Load the sample set stored in ``testSet.txt`` (current directory).

        Every non-blank line must hold three whitespace-separated fields:
        two feature values followed by an integer class label.

        Returns:
            A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
            ``[1.0, x1, x2]`` rows (a constant 1.0 is prepended to each sample)
            and ``labelMat`` is the list of integer labels in file order.

        Raises:
            IOError/OSError: if ``testSet.txt`` cannot be opened.
            ValueError, IndexError: if a non-blank line is malformed.
        """
        dataMat = []
        labelMat = []
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open('testSet.txt') as fr:
            for line in fr:
                lineArr = line.split()
                if not lineArr:
                    # Blank (e.g. trailing) lines carry no sample; skip them.
                    continue
                dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
                labelMat.append(int(lineArr[2]))
        return dataMat, labelMat
    
    def sigmoid(inX):
        """Return the logistic function 1 / (1 + e**(-inX)) of ``inX``."""
        decay = exp(-inX)
        return 1.0 / (1 + decay)
    
    def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
        """Fit logistic-regression weights with batch gradient ascent.

        Args:
            dataMatIn: 2-D sequence of samples, one row per sample.
            classLabels: flat sequence with one label per sample.
            alpha: step size applied to every update (default 0.001).
            maxCycles: number of full-batch updates to run (default 500).

        Returns:
            An (n, 1) NumPy matrix of weights, where n is the number of
            columns in ``dataMatIn``.
        """
        dataMatrix = mat(dataMatIn)              # m x n sample matrix
        labelMat = mat(classLabels).transpose()  # m x 1 column of labels
        n = shape(dataMatrix)[1]
        # Start from a matrix so the return type does not depend on maxCycles.
        weights = mat(ones((n, 1)))
        for _ in range(maxCycles):
            # ``*`` on NumPy matrices is matrix multiplication.
            h = sigmoid(dataMatrix * weights)
            error = labelMat - h
            # Move the weights along data^T * error, scaled by alpha.
            weights = weights + alpha * dataMatrix.transpose() * error
        return weights
    
    def plotBestFit(weights):
        """Plot the samples from loadDataSet() together with the decision line.

        Samples labelled 1 are drawn as red squares, all others as green dots.
        The line drawn is ``w0 + w1*x1 + w2*x2 = 0`` for x1 in [-3.0, 3.0).

        Args:
            weights: the three coefficients (w0, w1, w2). May be a flat
                sequence, a (3, 1) array or a NumPy matrix; it is flattened
                before use.
        """
        # A NumPy matrix (or column vector) would make ``y`` below 2-D, which
        # ax.plot() rejects because its shape no longer matches ``x``.
        weights = array(weights).flatten()
        dataMat, labelMat = loadDataSet()
        dataArr = array(dataMat)
        n = shape(dataArr)[0]
        xcord1 = []; ycord1 = []    # samples with label 1
        xcord2 = []; ycord2 = []    # every other sample
        for i in range(n):
            if int(labelMat[i]) == 1:
                xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
            else:
                xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
        ax.scatter(xcord2, ycord2, s=30, c='green')
        x = arange(-3.0, 3.0, 0.1)
        # Solve w0 + w1*x + w2*y = 0 for y.
        y = (-weights[0] - weights[1] * x) / weights[2]
        ax.plot(x, y)
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.show()
    
    def stocGradAscent0(dataMatrix, classLabels):
        """Run one pass of stochastic gradient ascent over the samples in order.

        The weights start at all ones and are updated once per sample with a
        fixed step size of 0.01.

        Args:
            dataMatrix: 2-D array of samples, one row per sample.
            classLabels: sequence with one label per sample.

        Returns:
            The weight vector after the single pass (length = number of columns).
        """
        step = 0.01
        numSamples, numFeatures = shape(dataMatrix)
        coeffs = ones(numFeatures)
        for row in range(numSamples):
            sample = dataMatrix[row]
            predicted = sigmoid(sum(sample * coeffs))
            residual = classLabels[row] - predicted
            coeffs = coeffs + step * residual * sample
        return coeffs
    
    def stocGradAscent1(dataMatrix, classLabels, numIter=150):
        """Stochastic gradient ascent with a decaying step and random sampling.

        On every pass each sample is visited exactly once, in random order.

        Args:
            dataMatrix: 2-D array of samples, one row per sample.
            classLabels: sequence with one label per sample.
            numIter: number of passes over the whole sample set.

        Returns:
            The weight vector (length = number of columns of ``dataMatrix``).
        """
        m, n = shape(dataMatrix)
        weights = ones(n)   # initialize to all ones
        for j in range(numIter):
            # Must be a real list: entries are deleted as samples are consumed
            # (a Python 3 ``range`` object does not support ``del``).
            dataIndex = list(range(m))
            for i in range(m):
                # alpha decreases with iteration but never reaches 0 because
                # of the constant term.
                alpha = 4 / (1.0 + j + i) + 0.0001
                randIndex = int(random.uniform(0, len(dataIndex)))
                # Map the random position to the still-unused sample it refers
                # to; indexing dataMatrix with randIndex directly would revisit
                # some samples and never reach others within a pass.
                sampleIdx = dataIndex[randIndex]
                h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
                error = classLabels[sampleIdx] - h
                weights = weights + alpha * error * dataMatrix[sampleIdx]
                del dataIndex[randIndex]
        return weights
    
    def classifyVector(inX, weights):
        """Classify one sample: 1.0 if sigmoid(sum(inX*weights)) > 0.5, else 0.0."""
        score = sum(inX * weights)
        return 1.0 if sigmoid(score) > 0.5 else 0.0
    
    def colicTest():
        """Train on horseColicTraining.txt and evaluate on horseColicTest.txt.

        Both files are tab-separated; the first 21 columns of each line are
        read as features and column 21 as the class label. Weights are trained
        with stocGradAscent1 (1000 passes) and every test line is classified
        with classifyVector.

        Returns:
            The fraction of test lines that were misclassified (also printed).

        Raises:
            IOError/OSError: if either data file cannot be opened.
            ZeroDivisionError: if the test file contains no samples.
        """
        numFeatures = 21
        # Open both files up front (a missing file fails before the slow
        # training step) and let ``with`` close them.
        with open('horseColicTraining.txt') as frTrain, \
                open('horseColicTest.txt') as frTest:
            trainLines = frTrain.readlines()
            testLines = frTest.readlines()

        trainingSet = []; trainingLabels = []
        for line in trainLines:
            if not line.strip():
                continue    # ignore blank lines
            currLine = line.strip().split('\t')
            trainingSet.append([float(currLine[i]) for i in range(numFeatures)])
            trainingLabels.append(float(currLine[numFeatures]))
        trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)

        errorCount = 0; numTestVec = 0.0
        for line in testLines:
            if not line.strip():
                continue    # ignore blank lines
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(numFeatures)]
            # int(float(...)) accepts labels written as "1" as well as "1.000000".
            expected = int(float(currLine[numFeatures]))
            if int(classifyVector(array(lineArr), trainWeights)) != expected:
                errorCount += 1
        errorRate = float(errorCount) / numTestVec
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("the error rate of this test is: %f" % errorRate)
        return errorRate
    
    def multiTest(numTests=10):
        """Run colicTest() ``numTests`` times and print the average error rate.

        Args:
            numTests: how many independent train/test rounds to average over
                (default 10).
        """
        errorSum = 0.0
        for _ in range(numTests):
            errorSum += colicTest()
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests)))
    
    if __name__ == "__main__":
        # Demo: load testSet.txt, fit weights with batch gradient ascent, then
        # plot the samples and the resulting decision line.
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        (dataMat, labelMat) = loadDataSet()
        print("dataMat:")
        print(dataMat)
        print("labeMat:")
        print(labelMat)
        weights = gradAscent(dataMat, labelMat)
        print("weights:")
        print(weights)
        # gradAscent returns a NumPy matrix; getA() converts it to a plain
        # ndarray so the line computed in plotBestFit has a plottable shape.
        plotBestFit(weights.getA())
        print("end")
  • 相关阅读:
    JDOJ 2197: 校门外的树
    简单线段树知识点详解
    求GCD(最大公约数)的两种方式
    USACO Buying Feed, II
    USACO Dueling GPS's
    USACO Milking Cows
    NOIP 2014 比例简化
    USACO Clumsy Cows
    JDOJ 1140: 完数
    NOIP 2008 火柴棒等式
  • 原文地址:https://www.cnblogs.com/zhangdebin/p/5567919.html
Copyright © 2020-2023  润新知