• logistic回归


    logistic回归的基本思想  

      logistic回归是一种分类方法,用于二分类问题。其基本思想为:

      a. 寻找合适的假设函数,即分类函数,用以预测输入数据的判断结果;

      b. 构造代价函数,即损失函数,用以表示预测的输出结果与训练数据的实际类别之间的偏差;

      c. 最小化代价函数,从而获取最优的模型参数。

     

      1 import numpy
      2 from numpy import  *
      3 import matplotlib.pyplot as plt
      4 import random
      5 def loadDataSet(filename):
      6     fr = open(filename)
      7     dataMat = []
      8     labelMat = []
      9     for line in fr.readlines():
     10         lineArr = line.strip().split()
     11         dataMat.append( [1.0,float(lineArr[0]),float(lineArr[1])] )
     12         labelMat.append(int(lineArr[2]))
     13     return dataMat,labelMat
     14 
     15 #阶跃函数
     16 def sigmoid(inX):
     17     return 1.0/(1 + numpy.exp(-inX))
     18 
     19 #基于梯度上升法的logistic回归分类器
     20 def gradAscent(dataMatIn,classLabels):
     21     dataMatrix = mat(dataMatIn)
     22     labelMatrix = mat(classLabels).transpose()
     23     m , n = shape(dataMatrix)
     24     alpha = 0.001#步长
     25     maxCycles = 500
     26     weights = ones((n,1))
     27     #对回归系数进行maxCycles次梯度上升
     28     for i in range(maxCycles):
     29         h = sigmoid(dataMatrix * weights)
     30         error = labelMatrix - h
     31         weights = weights + alpha * dataMatrix.transpose() * error
     32     return weights
     33 
     34 #分析数据:画出决策边界
     35 def plotBestFit(weights):
     36     dataMat,labelMat = loadDataSet('test.txt')
     37     dataArr = array(dataMat)
     38     n = list(shape(dataArr))[0]
     39     xcord1 = [] ; ycord1 = []
     40     xcord2 = [] ; ycord2 = []
     41     for i in range(n):
     42         if int(labelMat[i]) == 1:
     43             xcord1.append(dataArr[i,1])
     44             ycord1.append(dataArr[i,2])
     45         else:
     46             xcord2.append(dataArr[i,1])
     47             ycord2.append(dataArr[i,2])
     48     fig = plt.figure()
     49     ax = fig.add_subplot(111)
     50     ax.scatter(xcord1,ycord1,s=30,c='red',marker='s')
     51     ax.scatter(xcord2,ycord2,s=30,c='green')
     52 
     53     #最佳拟合直线
     54     x = arange(-3.0, 3.0, 0.1)
     55     print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',shape(x))
     56 
     57     y = (-weights[0] - weights[1] * x) / weights[2]
     58     print('-----------------------------------------',shape(y))
     59     ax.plot(x,y)
     60     plt.xlabel('X1')
     61     plt.ylabel('X2')
     62     plt.show()
     63 
     64 #随机梯度上升
     65 def stocGradAscent0(dataMatrix,classLabels):
     66     m , n = numpy.shape(dataMatrix)
     67     alpha = 0.01#步长
     68     weights = numpy.ones((n))
     69     for i in range(m):
     70         h = sigmoid(sum(dataMatrix[i] * weights))
     71         error = classLabels[i] - h
     72         weights = weights + alpha * error * dataMatrix[i]
     73     return weights
     74 
     75 #改进的随机梯度上升
     76 def stocGradAscent1(dataMatrix,classLabels,numIter=150):
     77     m , n = shape(dataMatrix)
     78     weights = ones(n)
     79     dataIndex = list(range(m))
     80     print (dataIndex)
     81     for j in range(numIter):
     82         for i in range(m):
     83             alpha = 4/(1.0+j+i) + 0.1   #alpha每次迭代都要调整
     84             randIndex = int(random.uniform(0,len(dataIndex)))
     85             h = sigmoid (sum(dataMatrix[randIndex] * weights))
     86             error = classLabels[randIndex] - h
     87             weights = weights + alpha * error * dataMatrix[randIndex]
     88             del dataIndex[randIndex]
     89             print("randIndex",randIndex)
     90             print("dataIndex",dataIndex)
     91             if randIndex==0:
     92                 return weights
     93 
     94 
     95 if __name__ == '__main__':
     96     dataArr,labelMat = loadDataSet('test.txt')
     97     weights = stocGradAscent1(array(dataArr),labelMat)
     98     # weights = gradAscent(dataArr,labelMat)
     99     # print(shape(weights))
    100     plotBestFit(weights)

     应用:从疝气病预测病马的死亡率

    import numpy
    from numpy import  *
    import matplotlib.pyplot as plt
    import random
    
    #阶跃函数
    def sigmoid(inX):
        return 1.0/(1 + numpy.exp(-inX))
    
    #分类回归函数
    def classifyVector(inX,weights):
        prob = sigmoid(sum(inX * weights))
        if prob > 0.5:
            return 1.0
        else:
            return 0.0
    
    #改进的随机梯度上升算法
    def stocGradAscent1(dataMatrix, classLabels, numIter=150):
        m, n = shape (dataMatrix)
        weights = ones (n)
        dataIndex = list (range (m))
        for j in range (numIter):
            for i in range (m):
                alpha = 4 / (1.0 + j + i) + 0.1  # alpha每次迭代都要调整
                randIndex = int (random.uniform (0, len (dataIndex)))
                h = sigmoid (sum (dataMatrix[randIndex] * weights))
                error = classLabels[randIndex] - h
                weights = weights + alpha * error * dataMatrix[randIndex]
                del dataIndex[randIndex]
                if randIndex == 0:
                    return weights
    
    #测试,返回错误率
    def colicTest():
        frTrain = open('horseColicTraining.txt')
        frTest = open('horseColicTest.txt')
        trainingSet = []
        trainingLabels = []
        for line in frTrain.readlines():
            curLine = line.strip().split('	')
            lineArr = []
            for i in range(21):
                lineArr.append(float(curLine[i]))
            trainingSet.append(lineArr)
            trainingLabels.append(float(curLine[21]))
        trainWeights =  stocGradAscent1(array(trainingSet),trainingLabels,500)
    
        errorCount = 0
        numTestVec = 0
        for line in frTest.readlines():
            numTestVec += 1.0
            curLine = line.strip().split('	')
            lineArr = []
            for i in range(21):
                lineArr.append(float(curLine[i]))
            if int(classifyVector(array(lineArr),trainWeights)) != int(curLine[21]):
                errorCount += 1
        errorRate = (float(errorCount)/numTestVec)
        print("错误率",errorRate)
        return errorRate
    
    
    def multiTest():
        numTests = 10
        errorSum = 0.0
        for i in range(numTests):
            errorSum += colicTest()
        print("%d 次迭代之后,平均错误率为%f"%(numTests,errorSum/float(numTests)))
    
    multiTest()
    

      

  • 相关阅读:
    [数学]如何旋转曲线
    19_04_25校内训练[最小割]
    第二类斯特林数总结
    19_04_19校内训练[Game]
    kd-tree题目总结
    [HNOI2019]校园旅行
    LCT模板(无讲解)
    min_25筛题目总结
    Miller Robbin测试模板(无讲解)
    19_04_02校内训练[图染色]
  • 原文地址:https://www.cnblogs.com/nxf-rabbit75/p/8977621.html
Copyright © 2020-2023  润新知