• [Machine Learning in Action] Chapter 6 -- Support Vector Machines


import numpy as np
import os


class optStruct:
    # Data structure that holds all the important values; it contains only an __init__ method,
    # which fills in the member variables
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        # kTup is a tuple that describes the kernel function
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = np.shape(dataMatIn)[0]
        self.alphas = np.mat(np.zeros((self.m, 1)))
        self.b = 0
        # The first column of eCache is a flag marking whether the cached value is valid;
        # the second column holds the actual E value
        self.eCache = np.mat(np.zeros((self.m, 2)))
        self.K = np.mat(np.zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)


def calcEk(oS, k):
    # Compute the error E for the k-th alpha and return it
    # fXk = float(np.multiply(oS.alphas, oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b
    fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek


def selectJ(i, oS, Ei):
    # Choose the second alpha, i.e. the inner-loop alpha. The goal is to pick the second alpha
    # that gives the maximum step size on each optimization pass.
    # The choice depends on the error Ei of the first alpha and on its index i.
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]  # build a list of the indices whose cached E values are valid
    # np.nonzero is the numpy function that returns the positions (array indices) of the non-zero elements of an array.
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                # choose the j with the largest step size
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
    return j, Ej


def updateEk(oS, k):
    # Compute the error and store it in the cache; this is used after an alpha has been optimized
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]


def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        # Compute L and H, which clamp alpha[j] between 0 and C
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L == H')
            return 0
        # eta = 2.0*oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        eta = 2.0*oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print('eta >= 0')
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)
        if abs(oS.alphas[j] - alphaJold) < 0.00001:
            print('j not moving enough')
            return 0
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS, i)
        # b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - \
        #      oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
        # b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - \
        #      oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - \
            oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - \
            oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0


def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    # Instantiate the object oS, which holds all of the data
    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0
    while iter < maxIter and (alphaPairsChanged > 0 or entireSet):
        # Exit the loop once the iteration count exceeds the specified maximum, or once a full
        # pass over the whole set makes no change to any alpha pair
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print('fullSet, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        else:
            nonBoundsIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            # matrix.A converts a matrix into an ndarray:
            # >>> type(c)
            # <class 'numpy.matrix'>
            # >>> type(c.A)
            # <class 'numpy.ndarray'>
            for i in nonBoundsIs:
                alphaPairsChanged += innerL(i, oS)
                print('non-bound, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print('iteration number: %d' % iter)
    return oS.b, oS.alphas


def clacWs(alphas, dataArr, classLabels):
    # Compute the weight vector w = sum_i alpha_i * y_i * x_i from the trained alphas
    X = np.mat(dataArr)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(X)
    w = np.zeros((n, 1))
    for i in range(m):
        w += np.multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w


def kernelTrans(X, A, kTup):
    '''
    :param X: the full data matrix
    :param A: one row of the data matrix
    :param kTup: a tuple describing the kernel. Its first element is a string naming the kernel
                 type; the remaining elements are optional parameters the kernel may need
    :return: the kernel-transformed column vector K
    '''
    m, n = np.shape(X)
    K = np.mat(np.zeros((m, 1)))
    if kTup[0] == 'lin':
        # linear kernel
        K = X * A.T
    elif kTup[0] == 'rbf':
        # radial basis function (RBF) kernel
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        K = np.exp(K / (-1*kTup[1]**2))
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K
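
# Note on the RBF branch above: for each training row X[j] the loop computes the squared
# Euclidean distance ||X[j] - A||^2, so the returned column vector is
#
#     K[j] = exp(-||X[j] - A||^2 / sigma^2),  with sigma = kTup[1].
#
# This follows the book's convention of dividing by sigma^2; the more common Gaussian kernel
# divides by 2*sigma^2 instead.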


def testRbf(k1=1.3):
    '''
    :param k1: a user-defined parameter of the Gaussian radial basis function (the kernel width)
    :return:
    '''
    # Load the data set
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt")
    # Run the Platt SMO algorithm with an RBF kernel
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    # Build matrix copies of the data
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    # Find the non-zero alphas
    svInd = np.nonzero(alphas.A > 0)[0]
    # Keep only the support vectors
    sVs = dataMat[svInd]
    # and their class labels
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        # Classify with the kernel
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))  # kernel-transform the current example
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            # np.sign() returns the sign of a number: 1 if it is greater than 0, -1 if less than 0, and 0 if equal to 0
            errorCount += 1
    print('the training error rate is: %f' % (float(errorCount)/m))
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF2.txt")
    errorCount = 0
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount)/m))


'''
***************************************************************************
'''
# Handwritten digit recognition
def img2vector(filename):
    returnVect = np.zeros((1, 1024))  # create a 1 x 1024 array of zeros
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32*i+j] = int(lineStr[j])  # flatten the 32x32 image into a 1x1024 row vector
    return returnVect


def loadImages(dirName):
    hwLabels = []
    trainingFileList = os.listdir(dirName)  # list of all file names in the directory
    m = len(trainingFileList)  # total number of files
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # Binary classification: only recognize the digit 9
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels


def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is %f' % (float(errorCount)/m))
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/testDigits')
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    errorCount = 0  # reset the error counter before evaluating on the test set
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is %f' % (float(errorCount)/m))


'''
***************************************************************************
'''


def loadDataSet(filename):
    # Open the file and parse it line by line to get each line's class label and the full data matrix
    dataMat = []
    labelMat = []
    fr = open(filename)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


def selectJrand(i, m):
    '''
    :param i: index of the first alpha
    :param m: total number of alphas
    :return: a random index j not equal to i
    '''
    j = i
    while j == i:
        j = int(np.random.uniform(0, m))  # randomly pick a j that is not equal to i
    return j


def clipAlpha(aj, H, L):
    # Clip alpha values that are greater than H or smaller than L
    if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj


'''
Rough pseudocode for the SMO function:
    Create an alpha vector and initialize it to all zeros
    While the number of iterations is less than the maximum number of iterations (outer loop):
        For every data vector in the data set (inner loop):
            If the data vector can be optimized:
                Randomly select another data vector
                Optimize the two vectors together
                If neither vector can be optimized, exit the inner loop
        If no vector was optimized, increment the iteration count and continue with the next pass
'''
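
# For reference, the alpha-pair update implemented by smoSimple below (and by innerL above) for a
# chosen pair (i, j), with labels y in {-1, +1}, penalty C and kernel values K(.,.):
#
#   E_k = f(x_k) - y_k,  where f(x) = sum_k alpha_k * y_k * K(x_k, x) + b
#   eta = 2*K(i,j) - K(i,i) - K(j,j)
#   alpha_j_new = alpha_j_old - y_j*(E_i - E_j)/eta, then clipped to [L, H], where
#       if y_i != y_j:  L = max(0, alpha_j - alpha_i),      H = min(C, C + alpha_j - alpha_i)
#       else:           L = max(0, alpha_j + alpha_i - C),  H = min(C, alpha_j + alpha_i)
#   alpha_i_new = alpha_i_old + y_i*y_j*(alpha_j_old - alpha_j_new)
#   b1 = b - E_i - y_i*(alpha_i_new - alpha_i_old)*K(i,i) - y_j*(alpha_j_new - alpha_j_old)*K(i,j)
#   b2 = b - E_j - y_i*(alpha_i_new - alpha_i_old)*K(i,j) - y_j*(alpha_j_new - alpha_j_old)*K(j,j)
#   b is set to b1 if 0 < alpha_i_new < C, to b2 if 0 < alpha_j_new < C, and to (b1 + b2)/2 otherwise.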


def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    '''
    :param dataMatIn: data set
    :param classLabels: class labels
    :param C: the constant C
    :param toler: the tolerance
    :param maxIter: the maximum number of passes
    :return: b and the alphas
    '''
    dataMatrix = np.mat(dataMatIn)  # build the data matrix
    labelMat = np.mat(classLabels).transpose()  # transpose the labels into a column vector
    b = 0
    m, n = np.shape(dataMatrix)  # number of rows and columns
    alphas = np.mat(np.zeros((m, 1)))
    iter = 0

    while iter < maxIter:
        alphaPairsChanged = 0  # records whether any alpha pair was optimized during this pass
        for i in range(m):
            # fXi is the predicted class of the i-th example
            fXi = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i, :].T)) + b
            # np.multiply is element-wise multiplication
            Ei = fXi - float(labelMat[i])  # compare the prediction with the true label to get the error Ei
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # Both positive and negative margin violations are tested; the alpha is also checked
                # so that it is not already clipped at 0 or C
                j = selectJrand(i, m)  # randomly select the second alpha, alpha[j]
                fXj = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])  # error of the second alpha
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                # L and H clamp alpha[j] between 0 and C
                if L == H:
                    print("L == H")
                    continue  # end this iteration and go straight to the next iteration of the for loop
                eta = 2.0 * dataMatrix[i, :]*dataMatrix[j, :].T - dataMatrix[i, :]*dataMatrix[i, :].T - \
                      dataMatrix[j, :]*dataMatrix[j, :].T
                # eta is the optimal amount to change alpha[j]
                if eta >= 0:
                    print("eta >= 0")
                    continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # compute the new alpha[j]
                if abs(alphas[j] - alphaJold) < 0.00001:
                    # If alpha[j] has changed only slightly, skip the rest of this for-loop iteration
                    print("j not moving enough")
                    continue
                # Update alpha[i] by the same amount as alpha[j], but in the opposite direction
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # Set the constant term b for the two updated alphas
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :]*dataMatrix[i, :].T - \
                     labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[i, :]*dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :]*dataMatrix[j, :].T - \
                     labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[j, :]*dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                # If execution reaches the last line of the for-loop body without hitting a continue,
                # a pair of alphas has been changed successfully, so increment alphaPairsChanged
                alphaPairsChanged += 1
                print('iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
        # Check whether any alpha was updated; if so, set iter back to 0 and keep going.
        # The program stops and exits the while loop only after it has passed over the whole data
        # set maxIter times in a row without any alpha being modified.
        if alphaPairsChanged == 0:
            iter += 1
        else:
            iter = 0
        print('iteration number: %d' % iter)
    return b, alphas


if __name__ == "__main__":
    # testRbf()
    # testDigits(('lin', 0))
    testDigits(('rbf', 10))

    '''
    # How to use numpy mat in Python: https://www.cnblogs.com/lfri/p/10561914.html
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
    # print(dataArr)
    print(labelArr)
    # b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
    b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
    # print(b)
    # print(alphas[alphas>0])  # array filtering; it only works on numpy types
    print(np.shape(alphas[alphas > 0]))  # number of support vectors
    for i in range(100):
        # print the support vectors
        if alphas[i] > 0.0:
            print(dataArr[i], labelArr[i])
    ws = clacWs(alphas, dataArr, labelArr)
    print(ws)
    dataMat = np.mat(dataArr)
    errorCount = 0
    for k in range(np.shape(dataArr)[0]):
        res = dataMat[k] * np.mat(ws) + b
        if res < 0:
            res = -1
        else:
            res = 1
        if res != labelArr[k]:
            errorCount += 1
    means = errorCount / (np.shape(dataArr)[0])
    print('error rate= %.5f' % means)
    '''
there are 328 Support Vectors
the training error rate is 0.000000
the test error rate is 0.006342
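
The output above comes from the testDigits(('rbf', 10)) call in the __main__ block. As a minimal
usage sketch (assuming the listing above is saved as svmMLiA.py and the book's data files sit in
the paths used above -- both are assumptions, since the post does not name the file), one could
sweep the RBF width to see how the number of support vectors and the error rates change:

import svmMLiA

for sigma in (0.1, 1.3, 10):
    print('--- RBF width sigma = %s ---' % sigma)
    svmMLiA.testRbf(k1=sigma)           # 2-D toy data set (testSetRBF.txt / testSetRBF2.txt)
    svmMLiA.testDigits(('rbf', sigma))  # handwritten-digit task (digit 9 vs. the rest)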
• Original post: https://www.cnblogs.com/DJames23/p/13304388.html