• SVM--交叉验证


    # -*- coding: utf-8 -*-
    """SVM.ipynb

    Automatically generated by Colaboratory.

    Original file is located at
    https://colab.research.google.com/drive/1a993aXFZd3z39U7eqp6J0Ndvuhrzu1-q
    """

    import numpy as np
    import time
    from scipy.stats import sem
    from sklearn.model_selection import train_test_split,cross_val_score
    from sklearn.model_selection import KFold,GridSearchCV
    from sklearn.svm import SVC
    from sklearn.metrics import classification_report #对分类的结果进行综合性的报告
    from sklearn.metrics import confusion_matrix
    from sklearn.datasets import fetch_olivetti_faces #用sklearn自带的数据集 400张人脸

    import matplotlib

    matplotlib.use('TkAgg')

    import matplotlib.pyplot as plt

    # Load the Olivetti faces dataset and print a quick overview of it.
    faces = fetch_olivetti_faces()
    print(faces.DESCR)
    print(faces.keys())
    # Shapes of the image stack, the feature matrix and the label vector.
    for arr in (faces.images, faces.data, faces.target):
        print(arr.shape)
    # Value range and average of the feature matrix.
    print("max", faces.data.max())
    print(faces.data.min())
    print(faces.data.mean())

    """执行过程:SVC --> BaseSVC --> BaseLibSVM --> init初始化参数
    SVC:多分类器

    • C: C-SVC的惩罚参数C,默认值是1.0
    • kernel :核函数,默认是rbf,可为如下:
      – 'linear',线性:u'v
      – 'poly',多项式:(gamma*u'v + coef0)^degree
      – 'rbf',RBF函数:exp(-gamma*|u-v|^2)
      – 'sigmoid':tanh(gamma*u'v + coef0)
    • degree :多项式poly函数的维度,默认是3,其他核函数会被忽略
    • gamma : 'rbf'、'poly' 和 'sigmoid' 的核函数参数。默认是 'auto',此时取值为 1/n_features
    • coef0 :核函数的常数项,对于‘poly’和 ‘sigmoid’有用
    • max_iter :最大迭代次数,-1为无限制
    • tol :停止训练的误差值大小,默认为1e-3
    • decision_function_shape :’ovo‘, ‘ovr’ or None, default=‘ovr’(one vs rest)
    • random_state :随机数种子,数据洗牌时的种子值,int值
      主要调节的参数有:C、kernel、degree、gamma、coef0。
      """

    # 交叉验证

    def evaluate_cross_validation(clf, X, y, K):
        """Run K-fold cross-validation and print per-fold and mean scores.

        Args:
            clf: Estimator handed to ``cross_val_score`` (the callers in this
                script pass an ``SVC``).
            X: Samples to cross-validate on.
            y: Labels for ``X``.
            K: Number of folds.

        Returns:
            The array of per-fold scores, so callers can reuse the numbers
            instead of re-reading the printed output.
        """
        # Create a k-fold cross-validation iterator; shuffling with a fixed
        # seed keeps the folds reproducible between runs.
        cv = KFold(K, shuffle=True, random_state=0)
        # No ``scoring`` argument is given, so the estimator's own score
        # method (accuracy) is used; ``clf`` is the SVC built by the caller.
        scores = cross_val_score(clf, X, y, cv=cv)
        print(scores)
        print("Mean score: {0:.3f} (+/-{1:.3f})".format(
            np.mean(scores), sem(scores)))
        return scores

    """KFold:sklearn/model_selection/_split.py
    KFold(n_split, shuffle, random_state)

    • n_split:要划分的折数
    • shuffle: 每次划分前,是否对数据进行shuffle洗牌打乱
    • random_state:数据打乱的随机数种子

    KFold过程

    • 1、将数据集平均分割成K等份
    • 2、使用1份数据作为测试数据,其余K-1份作为训练数据
    • 3、计算测试准确率
    • 4、使用不同的测试集,重复上面步骤

    scores:sklearn/model_selection/_validation.py
    作用:验证某个模型在某个训练集上的稳定性,输出k个预测精度。

    输入:clf:分类器;X:数据;y:标签;cv:交叉验证,可为k或KFold
    输出:k个划分的预测结果(分类准确率)

    执行过程:调用文件内的cross_validate函数,启用多个job线程并行,每个线程处理cv分割出的一份训练数据和验证数据。

    线程调用文件内_fit_and_score函数,_fit_and_score使用分类器的estimator.fit(X_train, y_train)进行训练,使用文件内_score函数计算预测结果。

    """

    # 不用交叉验证

    def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
        """Fit ``clf`` on the training split and print its evaluation.

        Prints the accuracy on the training and testing sets, followed by the
        classification report and the confusion matrix for the test
        predictions. ``clf`` is fitted in place; nothing is returned.

        Args:
            clf: Estimator exposing ``fit``, ``score`` and ``predict``.
            X_train: Training samples.
            X_test: Held-out samples.
            y_train: Labels for ``X_train``.
            y_test: Labels for ``X_test``.
        """
        clf.fit(X_train, y_train)

        print("Accuracy on training set:")
        print(clf.score(X_train, y_train))
        print("Accuracy on testing set:")
        print(clf.score(X_test, y_test))

        y_pred = clf.predict(X_test)

        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    # Mount Google Drive only when running inside Colab. The google.colab
    # package does not exist elsewhere, and an unconditional import would
    # abort the whole script with ImportError.
    # NOTE(review): nothing in the visible script reads from /content/drive;
    # confirm whether the mount is still needed at all.
    try:
        from google.colab import drive
    except ImportError:
        drive = None
    if drive is not None:
        drive.mount('/content/drive')

    """作用:根据真实值和预测值计算分类精度的综合报告

    输入:y_true:1 维数组,真实数据的分类标签
    y_pred:1 维数组,模型预测的分类标签

    输出:每个分类标签的精确度,召回率和 F1-score。

    精确度:precision,正确预测为正的,占全部预测为正的比例,TP / (TP+FP)
    

    召回率:recall,正确预测为正的,占全部实际为正的比例,TP / (TP+FN)

    F1-score:精确率和召回率的调和平均数,2 * p*r / (p+r)
    

    """

    # 戴眼镜的人的标注

    # Index ranges, as (first, last) pairs, of the images that show a person
    # wearing glasses. Both ends of each pair belong to the range.
    glasses = [
        (10, 19), (30, 32), (37, 38), (50, 59),
        (63, 64), (69, 69), (120, 121), (124, 129),
        (130, 139), (160, 161), (164, 169), (180, 182),
        (185, 185), (189, 189), (190, 192), (194, 194),
        (196, 199), (260, 269), (270, 279), (300, 309),
        (330, 339), (358, 359), (360, 369),
    ]

    # 戴眼镜的标为1,不戴眼镜的标为0

    def create_target(num_sample, segments):
        """Build a 0/1 label vector from inclusive index ranges.

        Args:
            num_sample: Length of the label vector to create.
            segments: Iterable of ``(start, end)`` index pairs; every index
                from ``start`` through ``end`` (inclusive) is labelled 1.

        Returns:
            A float NumPy array of length ``num_sample`` holding 1 inside the
            given ranges and 0 everywhere else.
        """
        y = np.zeros(num_sample)
        for start, end in segments:
            # ``end`` is inclusive, hence the +1 on the slice bound.
            y[start:end + 1] = 1
        return y

    # One glasses / no-glasses label for every image in the dataset.
    num_samples = len(faces.target)
    target_glasses = create_target(num_samples, glasses)

    # Experiment 1: split with train_test_split (sklearn.model_selection) and
    # train a linear SVC on the original labels, faces.target.
    svc_1 = SVC(kernel='linear')
    print(svc_1)

    # Hold out 25% of the samples for testing; the seed is fixed so the split
    # is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        faces.data, faces.target, test_size=0.25, random_state=0
    )
    evaluate_cross_validation(svc_1, X_train, y_train, 5)
    train_and_evaluate(svc_1, X_train, X_test, y_train, y_test)

    # Experiment 2: same train_test_split (sklearn.model_selection) setup, but
    # trained on the derived binary labels target_glasses (classes 0 and 1)
    # instead of the original faces.target.
    svc_2 = SVC(kernel='linear')

    X_train, X_test, y_train, y_test = train_test_split(
        faces.data, target_glasses, test_size=0.25, random_state=0
    )
    evaluate_cross_validation(svc_2, X_train, y_train, 5)
    train_and_evaluate(svc_2, X_train, X_test, y_train, y_test)

    # 3

    def print_faces(images, target, top_n):
        """Draw the first ``top_n`` images in a grid, annotated with labels.

        Each image is placed in the next cell of a 20x20 subplot grid and
        annotated with ``target[i]`` and with its index ``i``. The figure is
        only created here; it is neither shown nor returned.

        Args:
            images: Indexable collection of 2-D images accepted by ``imshow``.
            target: Indexable collection of labels, parallel to ``images``.
            top_n: Number of leading images to draw.
        """
        # Set up the figure size in inches.
        fig = plt.figure(figsize=(12, 12))
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1,
                            hspace=0.05, wspace=0.05)
        for i in range(top_n):
            # Images are laid out in a 20x20 matrix of axes without ticks.
            p = fig.add_subplot(20, 20, i + 1, xticks=[], yticks=[])
            p.imshow(images[i], cmap=plt.cm.bone)
            # Label the image with its target value (the predicted label
            # when the caller passes predictions) and with its index.
            p.text(0, 14, str(target[i]))
            p.text(0, 60, str(i))

    # Experiment 3: hold out images 30-39 as the test set and train on all the
    # remaining images, using the glasses labels.
    X_test = faces.data[30:40]
    y_test = target_glasses[30:40]
    print(y_test.shape[0])
    # ``select`` marks the training rows with 1 and the held-out rows with 0.
    select = np.ones(target_glasses.shape[0])
    select[30:40] = 0
    X_train = faces.data[select == 1]
    y_train = target_glasses[select == 1]
    print(y_train.shape[0])

    svc_3 = SVC(kernel='linear')
    train_and_evaluate(svc_3, X_train, X_test, y_train, y_test)
    y_pred = svc_3.predict(X_test)
    # Each feature row is reshaped back into a 64x64 image for display.
    eval_faces = [np.reshape(a, (64, 64)) for a in X_test]
    print_faces(eval_faces, y_pred, 10)
    # The script selects the TkAgg backend, so the figure only appears once
    # the GUI event loop is started explicitly.
    plt.show()

    参考:http://www.scikitlearn.com.cn/0.21.3/30/

  • 相关阅读:
    写给理工科人看的乐理(一)声学基础
    魔方最少记忆还原法
    甲乙两人互猜数字(鬼谷子问题)的逻辑推理与算法建模
    模板元编程实现素数判定
    UVa OJ 194
    UVa OJ 175
    UVa OJ 197
    UVa OJ 180
    UVa OJ 140
    判断input或者div.span等标签是否存在
  • 原文地址:https://www.cnblogs.com/Towerb/p/14019346.html
Copyright © 2020-2023  润新知