《统计学习方法》第六章，逻辑回归

▶ 使用逻辑回归来进行二分类，其中用到了梯度下降来进行数值优化

● 代码

  1 import numpy as np
  2 import matplotlib.pyplot as plt
  3 from mpl_toolkits.mplot3d import Axes3D
  4 from mpl_toolkits.mplot3d.art3d import Poly3DCollection
  5 from matplotlib.patches import Rectangle
  6 
  7 dataSize = 10000
  8 trainRatio = 0.3
  9 maxTurn = 500
 10 ita = 0.1
 11 epsilon = 0.005
 12 
 13 colors = [[0.5,0.25,0],[1,0,0],[0,0.5,0],[0,0,1],[1,0.5,0]] # 棕红绿蓝橙
 14 trans = 0.5
 15 
 16 def dataSplit(x, y, part):                                                         
 17     return x[:part], y[:part],x[part:],y[part:]
 18 
 19 def sigmod(x):                                              # 逻辑回归函数
 20     return 1.0 / (1 + np.exp(-x))
 21 
 22 def function(x,para):                                       # 连续回归函数
 23     return np.sum(x * para[0]) + para[1]
 24 
 25 def judge(x, para):                                         # 分类函数，由乘加部分和阈值部分组成
 26     return sigmod(function(x, para) - 0.5)
 27 
 28 def createData(dim, kind, count = dataSize):                                    # 创建数据集，给定属性维度，每属性取值数，类别数，样本数
 29     np.random.seed(103)        
 30     X = np.random.rand(count, dim)    
 31     Y = ((3 - 2 * dim)*X[:,0] + 2 * np.sum(X[:,1:], 1) > 0.5).astype(int) 
 32     #print(output)
 33     print("dim = %d, kind = %d, dataSize = %d"%(dim, kind, count))
 34     kindCount = np.zeros(kind ,dtype = int)                                     # 各类别的占比
 35     for i in range(count):
 36         kindCount[Y[i]] += 1
 37     for i in range(kind):
 38         print("kind %d -> %4f"%(i, kindCount[i]/count))                                      
 39     return X, Y
 40 
 41 def logisticRegression(dataX, dataY):                                           # 分类函数
 42     count, dim = np.shape(dataX)
 43     xE = np.concatenate((dataX, np.ones(count)[:,np.newaxis]), axis = 1)        # x 增加一列
 44     w = np.zeros(dim + 1)                                             
 45     finishFlag = False    
 46     turn = 0
 47     while finishFlag == False and turn < maxTurn:                               # 计算似然函数
 48         temp = sigmod(np.sum(xE * w, 1))
 49         error = dataY - temp
 50         grad = np.sum(xE.T * error, 1)        
 51         w += ita * grad                                                         # 使用梯度下降法优化 w
 52         turn += 1        
 53         #print("turn = ", turn, ", w = ", w, ", error = ", np.sum(np.abs(error)) / count)
 54         if np.sum(np.abs(error)) < count * epsilon:
 55             finishFlag = True
 56             break            
 57     print("turn = ", turn, ", w = ", w)
 58     return (w[:-1],w[-1])
 59 
 60 def test(dim, kind):                                                
 61     allX, allY = createData(dim, kind)            
 62     trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio))    # 分离训练集 
 63     para = logisticRegression(trainX, trainY)    
 64 
 65     myResult = [ int(judge(i, para) > 0.5) for i in testX ]               
 66     errorRatio  = np.sum((np.array(myResult) != testY).astype(int)) / (dataSize * (1 - trainRatio))
 67     print("dim = %d, errorRatio = %f"%(dim, round(errorRatio,4)))
 68     if dim >= 4:                                                                        # 4维以上不画图，只输出测试错误率
 69         return
 70 
 71     errorP = []                                                                         # 画图部分，测试数据集分为错误类，1 类和 0 类
 72     class1 = []
 73     class0 = []
 74     for i in range(int(dataSize * (1-trainRatio))):
 75         if int(myResult[i] > 0.5) * int(testY[i]) < 0:
 76             errorP.append(testX[i])
 77         elif myResult[i] > 0.5:
 78             class1.append(testX[i])
 79         else:
 80             class0.append(testX[i])
 81     errorP = np.array(errorP)
 82     class1 = np.array(class1)
 83     class0 = np.array(class0)
 84 
 85     fig = plt.figure(figsize=(10, 8))                
 86     
 87     if dim == 1:
 88         plt.xlim(0.0,1.0)
 89         plt.ylim(-0.25,1.25)
 90         plt.plot([0.5, 0.5], [-0.5, 1.25], color = colors[0],label = "realBoundary")                
 91         plt.plot([0, 1], [ function(i, para) for i in [0,1] ],color = colors[4], label = "myF")
 92         plt.scatter(class1, np.ones(len(class1)),color = colors[1], s = 2,label = "class1Data")                
 93         plt.scatter(class0, np.zeros(len(class0)),color = colors[2], s = 2,label = "class0Data")                
 94         if len(errorP) != 0:
 95             plt.scatter(errorP[:,0], errorP[:,1],color = colors[3], s = 16,label = "errorData")        
 96         plt.text(0.22, 1.12, "realBoundary: 2x = 1
myF(x) = " + str(round(para[0][0],2)) + " x + " + str(round(para[1],2)) + "
 errorRatio = " + str(round(errorRatio,4)),
 97             size=15, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
 98         R = [Rectangle((0,0),0,0, color = colors[k]) for k in range(5)]
 99         plt.legend(R, ["realBoundary", "class1Data", "class0Data", "errorData", "myF"], loc=[0.81, 0.2], ncol=1, numpoints=1, framealpha = 1)        
100     
101     if dim == 2:        
102         plt.xlim(0.0,1.0)
103         plt.ylim(0.0,1.0)
104         plt.plot([0,1], [0.25,0.75], color = colors[0],label = "realBoundary")        
105         xx = np.arange(0, 1 + 0.1, 0.1)                
106         X,Y = np.meshgrid(xx, xx)
107         contour = plt.contour(X, Y, [ [ function((X[i,j],Y[i,j]), para) for j in range(11)] for i in range(11) ])
108         plt.clabel(contour, fontsize = 10,colors='k')
109         plt.scatter(class1[:,0], class1[:,1],color = colors[1], s = 2,label = "class1Data")        
110         plt.scatter(class0[:,0], class0[:,1],color = colors[2], s = 2,label = "class0Data")        
111         if len(errorP) != 0:
112             plt.scatter(errorP[:,0], errorP[:,1],color = colors[3], s = 8,label = "errorData")        
113         plt.text(0.71, 0.92, "realBoundary: -x + 2y = 1
myF(x,y) = " + str(round(para[0][0],2)) + " x + " + str(round(para[0][1],2)) + " y + " + str(round(para[1],2)) + "
 errorRatio = " + str(round(errorRatio,4)), 
114             size = 15, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))
115         R = [Rectangle((0,0),0,0, color = colors[k]) for k in range(4)]
116         plt.legend(R, ["realBoundary", "class1Data", "class0Data", "errorData"], loc=[0.81, 0.2], ncol=1, numpoints=1, framealpha = 1)     
117 
118     if dim == 3:        
119         ax = Axes3D(fig)
120         ax.set_xlim3d(0.0, 1.0)
121         ax.set_ylim3d(0.0, 1.0)
122         ax.set_zlim3d(0.0, 1.0)
123         ax.set_xlabel('X', fontdict={'size': 15, 'color': 'k'})
124         ax.set_ylabel('Y', fontdict={'size': 15, 'color': 'k'})
125         ax.set_zlabel('W', fontdict={'size': 15, 'color': 'k'})
126         v = [(0, 0, 0.25), (0, 0.25, 0), (0.5, 1, 0), (1, 1, 0.75), (1, 0.75, 1), (0.5, 0, 1)]
127         f = [[0,1,2,3,4,5]]
128         poly3d = [[v[i] for i in j] for j in f]
129         ax.add_collection3d(Poly3DCollection(poly3d, edgecolor = 'k', facecolors = colors[0]+[trans], linewidths=1))        
130         ax.scatter(class1[:,0], class1[:,1],class1[:,2], color = colors[1], s = 2, label = "class1")                       
131         ax.scatter(class0[:,0], class0[:,1],class0[:,2], color = colors[2], s = 2, label = "class0")                       
132         if len(errorP) != 0:
133             ax.scatter(errorP[:,0], errorP[:,1],errorP[:,2], color = colors[3], s = 8, label = "errorData")                
134         ax.text3D(0.75, 0.85, 1.15, "realBoundary: -3x + 2y +2z = 1
myF(x,y,z) = " + str(round(para[0][0],2)) + " x + " + 
135             str(round(para[0][1],2)) + " y + " + str(round(para[0][2],2)) + " z + " + str(round(para[1],2)) + "
 errorRatio = " + str(round(errorRatio,4)), 
136             size = 12, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1, 0.5, 0.5), fc=(1, 1, 1)))
137         R = [Rectangle((0,0),0,0, color = colors[k]) for k in range(4)]
138         plt.legend(R, ["realBoundary", "class1Data", "class0Data", "errorData"], loc=[0.83, 0.1], ncol=1, numpoints=1, framealpha = 1)
139         
140     fig.savefig("R:\dim" + str(dim) + ".png")
141     plt.close()  
142 
143 if __name__=='__main__':        
144     test(1, 2)        
145     test(2, 2)    
146     test(3, 2)
147     test(4, 2)           
148

● 输出结果

dim = 1, kind = 2, dataSize = 10000
kind 0 -> 0.491000
kind 1 -> 0.509000
turn =  500 , w =  [ 225.51391655 -112.78995202]
dim = 1, errorRatio = 0.002400
dim = 2, kind = 2, dataSize = 10000
kind 0 -> 0.504000
kind 1 -> 0.496000
turn =  500 , w =  [-136.38367654  271.82263114  -67.80046401]
dim = 2, errorRatio = 0.002400
dim = 3, kind = 2, dataSize = 10000
kind 0 -> 0.501800
kind 1 -> 0.498200
turn =  76 , w =  [-269.48732449  178.37270183  180.50103751  -45.74231996]
dim = 3, errorRatio = 0.004700
dim = 4, kind = 2, dataSize = 10000
kind 0 -> 0.503100
kind 1 -> 0.496900
turn =  74 , w =  [-335.00400878  133.17805105  132.39352473  132.93672898  -31.98460298]
dim = 4, errorRatio = 0.002100

● 画图

相关阅读:
Java 中的 TreeSet
自定义全局异常处理器（Java）
GitHub Open Feign 使用总结
 搭建 Maven 仓库
 MySQL 的 SQL 笔记
 一道算法题：查找数组 arr 中第 k 小的奇数
 Java 的参数传递
 接口限流的几种算法
 Java 实现一个自定义分布式 Session
函数反汇编
原文地址：https://www.cnblogs.com/cuancuancuanhao/p/11213452.html