• 机器学习:支持向量机识别手写英文字母 SMO算法实现二元分类器


    本文只构建一个能够识别一种英文符号的SVM,在此选择了C字符。

    一、SVM构建

    import numpy as np
    
    class SMO:
        '''Binary SVM classifier trained with Platt's SMO algorithm.

        Parameters
        ----------
        C : float
            Soft-margin penalty; every multiplier satisfies 0 <= alpha <= C.
        tol : float
            Tolerance used both for KKT-violation checks and as the minimum
            relative alpha step that counts as progress.
        kernel : str
            'rbf' selects the Gaussian kernel; anything else the linear one.
        gamma : float or None
            Bandwidth of the Gaussian kernel (unused for the linear kernel).
        '''

        def __init__(self, C, tol, kernel='rbf', gamma=None):
            # Penalty coefficient.
            self.C = C
            # Threshold on the alpha step size during optimization.
            self.tol = tol

            # Kernel function.
            if kernel == 'rbf':
                self.K = self._gaussian_kernel
                self.gamma = gamma
            else:
                self.K = self._linear_kernel

        def _gaussian_kernel(self, U, v):
            '''Gaussian (RBF) kernel exp(-gamma * ||u - v||^2).

            U may be a single vector or a matrix whose rows are vectors;
            in the latter case a vector of kernel values is returned.
            '''
            if U.ndim == 1:
                p = np.dot(U - v, U - v)
            else:
                p = np.sum((U - v) * (U - v), axis=1)
            return np.exp(-self.gamma * p)

        def _linear_kernel(self, U, v):
            '''Linear kernel: plain dot product.'''
            return np.dot(U, v)

        def _g(self, x):
            '''Decision function g(x) = sum_k y_k * alpha_k * K(x_k, x) + b.'''
            alpha, b, X, y, E = self.args

            # Only support vectors (alpha > 0) contribute to the sum.
            idx = np.nonzero(alpha > 0)[0]
            if idx.size > 0:
                return np.sum(y[idx] * alpha[idx] * self.K(X[idx], x)) + b[0]
            return b[0]

        def _optimize_alpha_i_j(self, i, j):
            '''Jointly optimize the pair (alpha_i, alpha_j).

            Returns 1 if a sufficiently large step was taken, 0 otherwise.
            '''
            alpha, b, X, y, E = self.args
            C, tol, K = self.C, self.tol, self.K

            # A joint step needs two distinct multipliers.
            if i == j:
                return 0

            # Bounds [L, H] for alpha_j implied by the equality constraint.
            if y[i] != y[j]:
                L = max(0, alpha[j] - alpha[i])
                H = min(C, C + alpha[j] - alpha[i])
            else:
                L = max(0, alpha[j] + alpha[i] - C)
                H = min(C, alpha[j] + alpha[i])

            # L == H: the feasible segment degenerates to a single point.
            if L == H:
                return 0

            # eta: second derivative of the objective along the constraint line.
            eta = K(X[i], X[i]) + K(X[j], X[j]) - 2 * K(X[i], X[j])
            if eta <= 0:
                return 0

            # Non-bound alphas use the E cache; bound ones recompute E.
            if 0 < alpha[i] < C:
                E_i = E[i]
            else:
                E_i = self._g(X[i]) - y[i]

            if 0 < alpha[j] < C:
                E_j = E[j]
            else:
                E_j = self._g(X[j]) - y[j]

            # Unconstrained optimum of alpha_j along the constraint line.
            alpha_j_new = alpha[j] + y[j] * (E_i - E_j) / eta

            # Clip alpha_j_new into [L, H].
            if alpha_j_new > H:
                alpha_j_new = H
            elif alpha_j_new < L:
                alpha_j_new = L
            alpha_j_new = np.round(alpha_j_new, 7)

            # Reject steps too small to count as progress.
            if np.abs(alpha_j_new - alpha[j]) < tol * (alpha_j_new + alpha[j] + tol):
                return 0

            # alpha_i moves to keep sum(y * alpha) constant.
            alpha_i_new = alpha[i] + y[i] * y[j] * (alpha[j] - alpha_j_new)
            alpha_i_new = np.round(alpha_i_new, 7)

            # Threshold candidates.  BUG FIX: the original split these
            # expressions across lines without continuation, so b1/b2 were
            # just b[0] - E and the kernel terms were dead statements.
            b1 = (b[0] - E_i
                  - y[i] * (alpha_i_new - alpha[i]) * K(X[i], X[i])
                  - y[j] * (alpha_j_new - alpha[j]) * K(X[i], X[j]))

            b2 = (b[0] - E_j
                  - y[i] * (alpha_i_new - alpha[i]) * K(X[i], X[j])
                  - y[j] * (alpha_j_new - alpha[j]) * K(X[j], X[j]))

            if 0 < alpha_i_new < C:
                b_new = b1
            elif 0 < alpha_j_new < C:
                b_new = b2
            else:
                b_new = (b1 + b2) / 2

            # Update the E cache.
            # After the step, a non-bound alpha has error 0 by construction.
            E[i] = E[j] = 0
            # Refresh E[k] for every other non-bound alpha.
            mask = (alpha != 0) & (alpha != C)
            mask[i] = mask[j] = False
            non_bound_idx = np.nonzero(mask)[0]
            for k in non_bound_idx:
                # BUG FIX: parenthesized; the trailing term used to be dead code.
                E[k] += (b_new - b[0]
                         + y[i] * K(X[i], X[k]) * (alpha_i_new - alpha[i])
                         + y[j] * K(X[j], X[k]) * (alpha_j_new - alpha[j]))

            # Commit alpha_i, alpha_j.
            alpha[i] = alpha_i_new
            alpha[j] = alpha_j_new

            # Commit b.
            b[0] = b_new

            return 1

        def _optimize_alpha_i(self, i):
            '''Optimize alpha_i, choosing the partner alpha_j internally.'''
            alpha, b, X, y, E = self.args

            # Non-bound alphas use the E cache; bound ones recompute E.
            if 0 < alpha[i] < self.C:
                E_i = E[i]
            else:
                E_i = self._g(X[i]) - y[i]

            # alpha_i is only optimized when it violates the KKT conditions.
            # BUG FIX: the original condition was split across lines with no
            # continuation, which is a SyntaxError.
            if ((E_i * y[i] < -self.tol and alpha[i] < self.C) or
                    (E_i * y[i] > self.tol and alpha[i] > 0)):
                # Choose alpha_j by decreasing priority.

                # Split indices into non-bound and bound alphas.
                mask = (alpha != 0) & (alpha != self.C)
                non_bound_idx = np.nonzero(mask)[0]
                bound_idx = np.nonzero(~mask)[0]

                # Priority 1:
                # with more than one non-bound alpha, pick the j that
                # maximizes |E_i - E_j|.
                if len(non_bound_idx) > 1:
                    # BUG FIX: use E_i (valid even when alpha_i is bound) and
                    # map argmin/argmax back to an index into the full array;
                    # the original passed a position within non_bound_idx
                    # straight to the pair optimizer.
                    if E_i > 0:
                        j = non_bound_idx[np.argmin(E[non_bound_idx])]
                    else:
                        j = non_bound_idx[np.argmax(E[non_bound_idx])]

                    if self._optimize_alpha_i_j(i, j):
                        return 1

                # Priority 2:
                # iterate the non-bound alphas in random order.
                np.random.shuffle(non_bound_idx)
                for j in non_bound_idx:
                    if self._optimize_alpha_i_j(i, j):
                        return 1

                # Priority 3:
                # iterate the bound alphas in random order.
                np.random.shuffle(bound_idx)
                for j in bound_idx:
                    if self._optimize_alpha_i_j(i, j):
                        return 1

            return 0

        def train(self, X_train, y_train):
            '''Train on X_train (m samples x n features), y_train in {+1, -1}.'''
            m, _ = X_train.shape

            # Initialize the multiplier vector alpha and the threshold b.
            alpha = np.zeros(m)
            b = np.zeros(1)

            # E cache: prediction error per training sample.
            E = np.zeros(m)

            # Collect the shared state once so every helper can unpack it.
            self.args = [alpha, b, X_train, y_train, E]

            n_changed = 0
            examine_all = True
            while n_changed > 0 or examine_all:
                n_changed = 0

                # Sweep over candidate alpha_i.
                for i in range(m):
                    if examine_all or 0 < alpha[i] < self.C:
                        n_changed += self._optimize_alpha_i(i)

                print('n_changed: %s' % n_changed)
                print('sv num: %s' % np.count_nonzero((alpha > 0) & (alpha < self.C)))

                # After a full sweep, switch to non-bound sweeps; after a
                # non-bound sweep with no progress, do one more full sweep.
                examine_all = (not examine_all) and (n_changed == 0)

            # Persist the trained model parameters:
            idx = np.nonzero(alpha > 0)[0]
            # 1. non-zero multipliers,
            self.sv_alpha = alpha[idx]
            # 2. the support vectors,
            self.sv_X = X_train[idx]
            self.sv_y = y_train[idx]
            # 3. the threshold b.
            self.sv_b = b[0]

        def _predict_one(self, x):
            '''Decision value for a single input sample.'''
            k = self.K(self.sv_X, x)
            return np.sum(self.sv_y * self.sv_alpha * k) + self.sv_b

        def predict(self, X_test):
            '''Predict a label in {+1., -1.} for each row of X_test.'''
            y_pred = np.apply_along_axis(self._predict_one, 1, X_test)
            return np.squeeze(np.where(y_pred > 0, 1., -1.))

    二、数据集加载与变换

    http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/
    import numpy as np

    # Columns 1..16 hold the 16 numeric features of each letter sample.
    X = np.genfromtxt('F:/python_test/data/letter-recognition.data', delimiter=',', usecols=range(1, 17))
    print(X)
    print(X.shape)

    # Column 0 is the letter label.  BUG FIX: the np.str alias was deprecated
    # in NumPy 1.20 and removed in 1.24 — use the builtin str instead.
    y = np.genfromtxt('F:/python_test/data/letter-recognition.data', delimiter=',', usecols=0, dtype=str)
    print(y)
    # One-vs-rest labels: +1 for the letter 'C', -1 for every other letter.
    y = np.where(y == 'C', 1, -1)
    print(y)

     三、训练模型

    #选择惩罚系数,步长阈值,核函数,高斯核函数的gamma(绝对带宽大小)
    clf=SMO(C=1,tol=0.01,kernel='rbf',gamma=0.01)
    from sklearn.model_selection import train_test_split
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
    clf.train(X_train,y_train)
    from sklearn.metrics import accuracy_score
    y_pred=clf.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    print(accuracy)

    虽然准确度达到了0.99,但要明确数据集中的C大约只占5%,即使把全部样本都预测成-1也会有约95%的准确度,所以还应观察一下C分类的混淆矩阵

    可以发现没有将一个不是C的字符认为是C,将37个C认为是其他字符。总体准确度还需要调参来提高

    from sklearn.metrics import confusion_matrix

    # Rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    混淆矩阵

    四、调节超参数的组合来提高SVM分类的质量

    acc_list = []
    p_list = []

    # Coarse hyper-parameter grid for the penalty C and the RBF gamma.
    c_list = [0.1, 1, 5, 10, 15, 20]
    gamma_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    for C in c_list:
        for gamma in gamma_list:
            clf = SMO(C=C, tol=0.01, kernel='rbf', gamma=gamma)

            # Train, predict and score this (C, gamma) combination.
            clf.train(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Record the accuracy together with its parameters.
            acc_list.append(accuracy)
            p_list.append((C, gamma))

    # Report the best-scoring combination.  BUG FIX: p_list is a list, so it
    # must be indexed with [] — p_list(idx) raised TypeError.
    idx = np.argmax(acc_list)
    print(p_list[idx])

    最后可以自行查看在哪种超参数组合下分类结果最好

    # Retrain with the hyper-parameters picked from the grid search.
    clf = SMO(C=5, tol=0.01, kernel='rbf', gamma=0.05)
    clf.train(X_train, y_train)

    # Evaluate once more on the held-out split.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
  • 相关阅读:
    python2代码改成python3踩过的坑
    Mac下为什么有的文件名后带一个* 星号?
    Mac 的 Vim 中 delete 键失效的原因和解决方案(转)
    使用pandas处理大型CSV文件(转)
    Java基础——02
    javaee相关基础
    Cookie&Session笔记
    EL&JSTL笔记------jsp
    JavaWeb基础
    Java基础——01
  • 原文地址:https://www.cnblogs.com/randy-lo/p/13214434.html
Copyright © 2020-2023  润新知