logistic回归的基本思想
logistic回归是一种分类方法,常用于二分类问题。其基本思想为:
a. 寻找合适的假设函数,即分类函数,用以预测输入数据的判断结果;
b. 构造代价函数,即损失函数,用以表示预测的输出结果与训练数据的实际类别之间的偏差;
c. 最小化代价函数,从而获取最优的模型参数。
import random

import numpy
from numpy import *


def loadDataSet(filename):
    """Load a whitespace-separated dataset.

    Each line holds two feature values followed by an integer class label.

    Returns:
        (dataMat, labelMat): dataMat rows are [1.0, x1, x2] — the constant
        1.0 lets the bias be learned as weights[0]; labelMat is a list of
        int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file is closed (original leaked the handle).
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Logistic (sigmoid) function: maps any real input into (0, 1)."""
    return 1.0 / (1 + numpy.exp(-inX))


def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn:   m rows of [1.0, x1, x2].
        classLabels: m labels (0 or 1).

    Returns:
        (n, 1) numpy matrix of regression weights.
    """
    dataMatrix = mat(dataMatIn)
    labelMatrix = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    alpha = 0.001      # learning rate (step size)
    maxCycles = 500    # number of full-batch updates
    weights = ones((n, 1))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)   # (m, 1) predicted probabilities
        error = labelMatrix - h             # gradient direction of the log-likelihood
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def plotBestFit(weights, filename='test.txt'):
    """Plot the data points and the learned decision boundary.

    Args:
        weights:  regression coefficients; accepts either the (n, 1) matrix
                  from gradAscent or the 1-D array from the stochastic
                  variants (flattened before use — the original crashed on
                  the matrix form because y came out with matrix shape).
        filename: dataset file to display (default keeps old behavior).
    """
    # Imported lazily: only this plotting helper needs matplotlib.
    import matplotlib.pyplot as plt

    weights = asarray(weights).flatten()
    dataMat, labelMat = loadDataSet(filename)
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Decision boundary: w0 + w1*x + w2*y = 0  =>  y = -(w0 + w1*x) / w2
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def stocGradAscent0(dataMatrix, classLabels):
    """One-pass stochastic gradient ascent: one weight update per sample.

    Args:
        dataMatrix:  (m, n) numpy array of samples.
        classLabels: m labels (0 or 1).

    Returns:
        1-D numpy array of n regression weights.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01   # fixed learning rate
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))   # scalar prediction
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Improvements over stocGradAscent0: alpha decays as training proceeds
    (the +0.1 constant keeps it from reaching 0), damping oscillation, and
    within each pass samples are drawn without replacement.

    Bug fixes vs. the original: the index pool is rebuilt for every pass
    (the old code emptied it once, then crashed on the next pass), the
    random draw is mapped through dataIndex so each sample is used exactly
    once per pass, debug prints are removed, and the early
    `if randIndex == 0: return` that aborted training mid-pass is gone.

    Args:
        dataMatrix:  (m, n) numpy array of samples.
        classLabels: m labels (0 or 1).
        numIter:     number of passes over the data.

    Returns:
        1-D numpy array of n regression weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))   # fresh pool of sample indices per pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.1   # decaying learning rate
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]     # map draw -> actual sample index
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * dataMatrix[sample]
            del dataIndex[randIndex]          # without replacement
    return weights


if __name__ == '__main__':
    dataArr, labelMat = loadDataSet('test.txt')
    weights = stocGradAscent1(array(dataArr), labelMat)
    # weights = gradAscent(dataArr, labelMat)   # batch alternative
    plotBestFit(weights)
应用:根据疝气病症状预测病马的死亡率
import random

import numpy
from numpy import *


def sigmoid(inX):
    """Logistic (sigmoid) function: maps any real input into (0, 1)."""
    return 1.0 / (1 + numpy.exp(-inX))


def classifyVector(inX, weights):
    """Classify one feature vector: 1.0 if sigmoid(w . x) > 0.5, else 0.0."""
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    alpha decays with iteration (the +0.1 constant keeps it positive) and
    samples are drawn without replacement within each pass.

    Bug fixes vs. the original: the index pool is rebuilt for every pass
    (the old code emptied it once, then crashed on the next pass), the
    random draw is mapped through dataIndex so each sample is used exactly
    once per pass, and the early `if randIndex == 0: return` that aborted
    training mid-pass is gone.

    Args:
        dataMatrix:  (m, n) numpy array of samples.
        classLabels: m labels (0 or 1).
        numIter:     number of passes over the data.

    Returns:
        1-D numpy array of n regression weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))   # fresh pool of sample indices per pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.1   # decaying learning rate
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]     # map draw -> actual sample index
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * dataMatrix[sample]
            del dataIndex[randIndex]          # without replacement
    return weights


def colicTest():
    """Train on the horse-colic training file and return the error rate on
    the test file.

    Both files hold 21 feature values plus a class label per line. Fields
    are split on any whitespace — the original split(' ') broke on the
    tab-separated horse-colic dataset. Files are closed via 'with'
    (originally leaked).
    """
    trainingSet = []
    trainingLabels = []
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            curLine = line.strip().split()
            trainingSet.append([float(curLine[i]) for i in range(21)])
            trainingLabels.append(float(curLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            curLine = line.strip().split()
            lineArr = [float(curLine[i]) for i in range(21)]
            # int(float(...)) tolerates labels written as "1" or "1.0".
            if int(classifyVector(array(lineArr), trainWeights)) != int(float(curLine[21])):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("错误率", errorRate)
    return errorRate


def multiTest(numTests=10):
    """Run colicTest numTests times and print the average error rate.

    numTests is now a parameter (default 10 preserves old behavior).
    """
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    print("%d 次迭代之后,平均错误率为%f"%(numTests,errorSum/float(numTests)))


# Guarded so importing this module no longer kicks off training / file I/O.
if __name__ == '__main__':
    multiTest()