import numpy as np
import os


class optStruct:
    # Data structure that holds all the important values. It only contains an
    # __init__ method, which fills in the member variables.
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        # kTup is a tuple containing the kernel information
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = np.shape(dataMatIn)[0]
        self.alphas = np.mat(np.zeros((self.m, 1)))
        self.b = 0
        # The first column of eCache is a flag marking whether the cached entry
        # is valid; the second column holds the actual E value.
        self.eCache = np.mat(np.zeros((self.m, 2)))
        self.K = np.mat(np.zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)


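'''
Quick illustration (not part of the original book code, data made up): after
optStruct is built, oS.K caches the full Gram matrix so later steps can look
up K[i, j] instead of recomputing x_i * x_j.T, and eCache starts out with
every row marked invalid (flag column == 0):

    toyData = np.mat([[1.0, 2.0], [2.0, 0.5], [0.0, 1.0]])
    toyLabels = np.mat([1, -1, 1]).transpose()
    oS = optStruct(toyData, toyLabels, 0.6, 0.001, ('lin', 0))
    print(oS.K)       # 3x3 matrix, oS.K[i, j] == toyData[i] * toyData[j].T
    print(oS.eCache)  # 3x2 zero matrix: no E value cached yet
'''

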
def calcEk(oS, k):
    # Compute and return the error E for sample k
    # fXk = float(np.multiply(oS.alphas, oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b
    fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek


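'''
Written out, calcEk evaluates the current decision function on sample k and
compares it to the true label (the kernelized form of the commented-out
dot-product version above):

    f(x_k) = sum_i(alphas[i] * labelMat[i] * K[i, k]) + b
    E_k    = f(x_k) - y_k
'''

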
def selectJ(i, oS, Ei):
    # Choose the second alpha, i.e. the alpha of the inner loop. The goal is to
    # pick the second alpha so that each optimization step takes the largest
    # possible step size.
    # The error used here depends on the first alpha's error Ei and its index i.
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]  # Build a list of the indices whose cached E values are marked valid
    # np.nonzero returns the indices of the non-zero elements of an array.
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        # Choose the j with the largest step size
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
        return j, Ej


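'''
A quick hand check of the "largest step" heuristic in selectJ (numbers made
up): with Ei = 0.3 and valid cached errors Ek = [-0.5, 0.1, 0.4], the deltas
|Ei - Ek| are [0.8, 0.2, 0.1], so the index of the first of those errors would
be chosen as j.
'''

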
def updateEk(oS, k):
    # Compute the error and store it in the cache. This cached value is used
    # after an alpha has been optimized.
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]


def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or \
            ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L == H')
            return 0
        # eta = 2.0*oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print('eta >= 0')
            return 0
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)
        if abs(oS.alphas[j] - alphaJold) < 0.00001:
            print('j not moving enough')
            return 0
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
        updateEk(oS, i)
        # b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T
        #      - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
        # b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T
        #      - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - \
            oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - \
            oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2) / 2.0
        return 1
    else:
        return 0


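'''
For reference, the alpha-pair update that innerL performs, written out (this
matches the code above; it is the standard SMO step):

    eta      = 2*K[i, j] - K[i, i] - K[j, j]          # second derivative along the constraint line
    alpha_j -= y_j * (E_i - E_j) / eta                # unconstrained step for alpha_j
    alpha_j  = clip(alpha_j, L, H)                    # enforce the box constraint 0 <= alpha <= C
    alpha_i += y_i * y_j * (alpha_j_old - alpha_j)    # keep sum_i(alpha_i * y_i) unchanged
'''

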
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    # Build the oS object that holds all of the data
    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0
    while iter < maxIter and (alphaPairsChanged > 0 or entireSet):
        # Exit the loop when the iteration count exceeds the maximum, or when a
        # full pass over the set changes no alpha pair at all
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print('fullSet, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        else:
            nonBoundsIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            # matrix.A converts a matrix to an ndarray:
            # >>> type(c)
            # <class 'numpy.matrix'>
            # >>> type(c.A)
            # <class 'numpy.ndarray'>
            for i in nonBoundsIs:
                alphaPairsChanged += innerL(i, oS)
                print('non-bound, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print('iteration number: %d' % iter)
    return oS.b, oS.alphas


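'''
A minimal usage sketch for smoP (it assumes the testSet.txt file from the
"Machine Learning in Action" Ch06 data is available at the same relative path
used elsewhere in this file):

    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
    b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)   # linear kernel by default
    print(np.shape(alphas[alphas > 0]))                   # number of support vectors
'''

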
def clacWs(alphas, dataArr, classLabels):
    # Compute the weight vector w = sum_i(alphas[i] * y_i * x_i) from the solved alphas
    X = np.mat(dataArr)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(X)
    w = np.zeros((n, 1))
    for i in range(m):
        w += np.multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w


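'''
Once w and b are known (linear kernel only), a new point x can be classified
by the sign of x*w + b; a quick sketch reusing the variables from the smoP
example above:

    ws = clacWs(alphas, dataArr, labelArr)
    dataMat = np.mat(dataArr)
    print(np.sign(dataMat[0] * np.mat(ws) + b))   # predicted label for the first sample
    print(labelArr[0])                            # true label, for comparison
'''

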
def kernelTrans(X, A, kTup):
    '''

    :param X: the full data set
    :param A: one row of the data set
    :param kTup: tuple describing the kernel. The first element is a string
                 naming the kernel type; the remaining elements are optional
                 parameters the kernel may need.
    :return: an m x 1 column of kernel values K(X[j], A)
    '''
    m, n = np.shape(X)
    K = np.mat(np.zeros((m, 1)))
    if kTup[0] == 'lin':
        # Linear kernel
        K = X * A.T
    elif kTup[0] == 'rbf':
        # Radial basis function (Gaussian) kernel
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        K = np.exp(K / (-1 * kTup[1] ** 2))
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K


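'''
Small illustration of kernelTrans (toy numbers, not from the book): with the
'rbf' kernel each entry is exp(-||x_j - A||^2 / k1^2), so a row identical to A
maps to 1.0 and more distant rows decay towards 0:

    X = np.mat([[0.0, 0.0], [1.0, 1.0], [3.0, 3.0]])
    A = np.mat([[0.0, 0.0]])
    print(kernelTrans(X, A, ('rbf', 1.3)))   # first entry exactly 1.0, the rest < 1.0
    print(kernelTrans(X, A, ('lin', 0)))     # plain dot products X * A.T (all zeros here)
'''

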
def testRbf(k1=1.3):
    '''

    :param k1: user-defined parameter of the Gaussian radial basis function
    :return:
    '''
    # Load the data set
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt")
    # Run the Platt SMO algorithm with an rbf kernel
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    # Make matrix copies of the data
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    # Find the non-zero alphas
    svInd = np.nonzero(alphas.A > 0)[0]
    # Keep only the support vectors
    sVs = dataMat[svInd]
    # ... and their class labels
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        # Classify using the kernel
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))  # transform this sample against the support vectors
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            # np.sign() returns the sign of a number: 1 if positive, -1 if negative, 0 if zero
            errorCount += 1
    print('the training error rate is: %f' % (float(errorCount) / m))
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF2.txt")
    errorCount = 0
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount) / m))


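'''
Note on the prediction step used in testRbf (and testDigits below): only the
support vectors contribute to the decision function, so each sample x is
kernel-transformed against the support vectors alone and then

    f(x) = sum_sv(alpha_sv * y_sv * K(x_sv, x)) + b,

which is exactly kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b in the
code above; the predicted class is np.sign(f(x)).
'''

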
'''
***************************************************************************
'''
# Handwritten digit recognition
def img2vector(filename):
    returnVect = np.zeros((1, 1024))  # create a 1 x 1024 array of zeros
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])  # flatten the 32x32 image into a 1x1024 row vector
    return returnVect


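'''
The digit files (from the book's Ch02 data) are 32 lines of 32 '0'/'1'
characters, and img2vector simply concatenates the rows. A hypothetical sketch,
assuming such a file exists at the path shown:

    vec = img2vector('../machinelearninginaction/Ch02/digits/trainingDigits/9_45.txt')
    print(np.shape(vec))   # (1, 1024)
    print(vec[0, :32])     # first image row as 0/1 values
'''

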
def loadImages(dirName):
    hwLabels = []
    trainingFileList = os.listdir(dirName)  # list of all file names in the directory
    m = len(trainingFileList)  # number of files
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # Binary classification: only recognize the digit 9
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels


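'''
Label convention in loadImages, sketched (file names are "<digit>_<index>.txt",
e.g. "9_45.txt"): the digit 9 is mapped to class -1 and every other digit to
class +1, so testDigits below trains a "9 vs. not-9" binary SVM.

    for name in ['9_45.txt', '3_107.txt']:
        digit = int(name.split('.')[0].split('_')[0])
        print(name, -1 if digit == 9 else 1)   # -> 9_45.txt -1, 3_107.txt 1
'''

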
def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is %f' % (float(errorCount) / m))
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/testDigits')
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is %f' % (float(errorCount) / m))


'''
***************************************************************************
'''


def loadDataSet(filename):
    # Open the file and parse it line by line to get the class label and the data matrix
    dataMat = []
    labelMat = []
    fr = open(filename)
    for line in fr.readlines():
        lineArr = line.strip().split()  # split on any whitespace (the book's testSet files are tab-separated)
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


def selectJrand(i, m):
    '''

    :param i: index of the first alpha
    :param m: total number of alphas
    :return: a random index j != i
    '''
    j = i
    while j == i:
        j = int(np.random.uniform(0, m))  # keep drawing until j differs from i
    return j


def clipAlpha(aj, H, L):
    # Clip alpha values that are greater than H or less than L
    if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj


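'''
Tiny check of clipAlpha (made-up numbers): clipAlpha(1.7, 0.6, 0.0) returns
0.6 and clipAlpha(-0.2, 0.6, 0.0) returns 0.0, i.e. the new alpha_j is kept
inside the box [L, H] required by the constraint 0 <= alpha <= C.
'''

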
'''
Pseudocode for the simplified SMO routine below:
    Create an alpha vector and initialize it to all zeros
    While the number of iterations is less than the maximum (outer loop):
        For every data vector in the data set (inner loop):
            If the data vector can be optimized:
                Randomly select another data vector
                Optimize the two vectors together
                If neither vector can be optimized, exit the inner loop
        If no vector was optimized, increment the iteration count and continue with the next pass
'''


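'''
A minimal usage sketch for smoSimple (same data file and parameters as the
commented-out example in the __main__ block at the bottom of this file):

    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
    b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
    print(b)
    print(alphas[alphas > 0])   # the non-zero alphas mark the support vectors
'''

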
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    '''

    :param dataMatIn: data set
    :param classLabels: class labels
    :param C: the constant C
    :param toler: tolerance
    :param maxIter: maximum number of iterations
    :return:
    '''
    dataMatrix = np.mat(dataMatIn)  # build the data matrix
    labelMat = np.mat(classLabels).transpose()  # transpose the labels into a column vector
    b = 0
    m, n = np.shape(dataMatrix)  # number of rows and columns
    alphas = np.mat(np.zeros((m, 1)))
    iter = 0

    while iter < maxIter:
        alphaPairsChanged = 0  # records whether any alpha was optimized in this pass
        for i in range(m):
            # fXi is the predicted class of sample i
            fXi = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[i, :].T)) + b
            # np.multiply is element-wise multiplication
            Ei = fXi - float(labelMat[i])  # error between the prediction and the true label
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # Both positive and negative margins are tested; the alpha value is also
                # checked so that it is not already pinned at 0 or C
                j = selectJrand(i, m)  # randomly select the second alpha, alpha[j]
                fXj = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])  # error of the second alpha
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                # The bounds above keep alpha[j] between 0 and C
                if L == H:
                    print("L == H")
                    continue  # skip to the next iteration of the for loop
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - dataMatrix[i, :] * dataMatrix[i, :].T - \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                # eta is the optimal amount by which to change alpha[j]
                if eta >= 0:
                    print('eta >= 0')
                    continue
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # the new, clipped alpha[j]
                if abs(alphas[j] - alphaJold) < 0.00001:
                    # If alpha[j] barely changed, skip this pair
                    print("j not moving enough")
                    continue
                # Update alpha[i] by the same amount as alpha[j], but in the opposite direction
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # Set the constant term b for the two updated alphas
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j, :] * dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                # Reaching the end of the for-loop body without hitting a continue means a
                # pair of alphas was successfully changed, so increment alphaPairsChanged
                alphaPairsChanged += 1
                print('iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
        # Check whether any alpha was updated; if so, reset iter to 0 and keep going.
        # The loop only terminates after maxIter consecutive passes over the whole data
        # set with no alpha change at all.
        if alphaPairsChanged == 0:
            iter += 1
        else:
            iter = 0
        print('iteration number: %d' % iter)
    return b, alphas


if __name__ == "__main__":
    # testRbf()
    # testDigits(('lin', 0))
    testDigits(('rbf', 10))

'''
# Using Numpy mat in Python: https://www.cnblogs.com/lfri/p/10561914.html
dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
# print(dataArr)
print(labelArr)
# b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
# print(b)
# print(alphas[alphas>0])  # array filtering; only works on numpy types
print(np.shape(alphas[alphas > 0]))  # number of support vectors
for i in range(100):
    # print the support vectors
    if alphas[i] > 0.0:
        print(dataArr[i], labelArr[i])
ws = clacWs(alphas, dataArr, labelArr)
print(ws)
dataMat = np.mat(dataArr)
errorCount = 0
for k in range(np.shape(dataArr)[0]):
    res = dataMat[k] * np.mat(ws) + b
    if res < 0:
        res = -1
    else:
        res = 1
    if res != labelArr[k]:
        errorCount += 1
means = errorCount / (np.shape(dataArr)[0])
print('error rate = %.5f' % means)
'''