• k-近邻算法python代码实现(非常全)


    1、k近邻算法是学习机器学习算法最为经典和简单的算法,它是机器学习算法入门最好的算法之一,可以非常好并且快速地理解机器学习的算法的框架与应用。它是一种经典简单的分类算法,当然也可以用来解决回归问题。
    2、kNN机器学习算法具有以下的特点:
    (1)思想极度简单
    (2)应用的数学知识非常少
    (3)解决相关问题的效果非常好
    (4)可以解释机器学习算法使用过程中的很多细节问题
    (5)更加完整地刻画机器学习应用的流程
    3、KNN算法pyhton代码实现如下:

    (1)解决分类问题的代码如下:
    # 1-1 Verify kNN by hand on a small, hand-made two-class data set.
    from collections import Counter
    from math import sqrt

    import numpy as np
    import matplotlib.pyplot as plt  # plotting / data visualisation

    raw_data_X = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]]
    raw_data_Y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    print(raw_data_X)
    print(raw_data_Y)

    # Pre-processing: turn the raw lists into arrays; these are the training set.
    x_train = np.array(raw_data_X)
    y_train = np.array(raw_data_Y)
    print(x_train)
    print(y_train)

    # Scatter plot of both classes. Column 0 goes on the horizontal axis and
    # column 1 on the vertical axis for BOTH classes, so that they share the
    # same coordinate system as the query point plotted below.
    plt.figure(1)
    plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color="g")
    plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color="r")

    # New sample whose class we want to predict; drawn in blue on the same plot.
    x = np.array([8.093607318, 3.365731514])
    plt.scatter(x[0], x[1], color="b")

    # --- kNN by hand ---
    # Euclidean distance from the query point to every training sample.
    # The loop variable must not be called x_train, otherwise the training
    # matrix would be replaced by its last row.
    distance = [sqrt(np.sum((sample - x) ** 2)) for sample in x_train]
    print(distance)

    d1 = np.argsort(distance)  # indices that sort the distances ascending
    print(d1)

    k = 6
    n_k = [y_train[idx] for idx in d1[:k]]  # labels of the k nearest samples
    print(n_k)

    # Counter counts how often each label occurs; most_common(1) returns a list
    # holding one (label, count) tuple for the most frequent label, and [0][0]
    # extracts that label.
    c = Counter(n_k).most_common(1)[0][0]
    y_predict = c
    print(y_predict)

    plt.show()  # display the scatter plot
    # Same classification task, this time with scikit-learn's KNN classifier.
    from sklearn import metrics  # evaluation helpers
    from sklearn.neighbors import KNeighborsClassifier

    KNN_classifier = KNeighborsClassifier(n_neighbors=6)

    raw_data_X = [
        [3.393533211, 2.331273381],
        [3.110073483, 1.781539638],
        [1.343808831, 3.368360954],
        [3.582294042, 4.679179110],
        [2.280362439, 2.866990263],
        [7.423436942, 4.696522875],
        [5.745051997, 3.533989803],
        [9.172168622, 2.511101045],
        [7.792783481, 3.424088941],
        [7.939820817, 0.791637231],
    ]
    raw_data_Y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    print(raw_data_X)
    print(raw_data_Y)

    x_train = np.array(raw_data_X)
    y_train = np.array(raw_data_Y)
    print(x_train)
    print(y_train)

    # Train on the ten labelled points.
    KNN_classifier.fit(x_train, y_train)

    # predict() expects a 2-D array (one row per sample), so the single query
    # point defined earlier in the script is reshaped into a one-row matrix.
    print(x)
    x = np.reshape(x, (1, -1))
    print(KNN_classifier.predict(x))

    # A second, slightly perturbed data set used as test data.
    test_data1 = [
        [3.93533211, 2.33127381],
        [3.10073483, 1.78159638],
        [1.34808831, 3.36830954],
        [3.58294042, 4.67919110],
        [2.28032439, 2.86690263],
        [7.42343942, 4.69652875],
        [5.74505997, 3.53399803],
        [9.17216622, 2.51101045],
        [7.79278481, 3.42488941],
        [7.93982087, 0.79637231],
    ]
    test_data = np.array(test_data1)
    test_target = [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
    y_pred = KNN_classifier.predict(test_data)

    # Overall accuracy (pass normalize=False as a third argument to get the
    # number of correct predictions instead of the fraction).
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    # Confusion matrix: a purely diagonal matrix means every sample was
    # classified correctly.
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


    # 1-2 Train and evaluate KNN on scikit-learn's bundled iris data set.
    import numpy as np
    import pandas as pd
    from sklearn import metrics  # evaluation helpers
    from sklearn.datasets import load_iris  # the iris data set
    from sklearn.model_selection import train_test_split  # train/test splitting
    from sklearn.neighbors import KNeighborsClassifier

    # Load the raw data.
    iris = load_iris()
    print(iris)
    print(len(iris.data))

    # Hold out 10% of the samples for testing (fixed seed for repeatability).
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data,
        iris.target,
        test_size=0.1,
        random_state=1,
    )

    # Build the KNN model, fit it on the training part and predict the rest.
    KNN_classifier = KNeighborsClassifier(n_neighbors=6)
    KNN_classifier.fit(train_data, train_target)
    y_pred = KNN_classifier.predict(test_data)

    # Validation: overall accuracy (normalize=False as a third argument would
    # return the number of correct predictions) and the confusion matrix
    # (a diagonal matrix means all predictions are correct).
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


    # 1-3 Train and evaluate KNN on scikit-learn's hand-written digits data set.
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    from sklearn import datasets
    from sklearn import metrics  # evaluation helpers
    from sklearn.model_selection import train_test_split  # train/test splitting
    from sklearn.neighbors import KNeighborsClassifier

    digits = datasets.load_digits()  # load the digits data set
    print(digits.keys())
    x = digits.data
    print(x.shape)
    y = digits.target
    print(y.shape)
    print(y[:100])
    print(x[:10])

    # Show one sample as an 8x8 image together with its label.
    x1 = x[666].reshape(8, 8)
    print(x1)
    plt.imshow(x1, cmap=matplotlib.cm.binary)
    plt.show()
    print(y[666])

    x_train, x_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.1, random_state=0
    )

    # Build the KNN model for the digits data. The model created here is the
    # one that must be fitted and evaluated; the classifier left over from the
    # previous section was configured with a different n_neighbors.
    KNN = KNeighborsClassifier(n_neighbors=3)
    KNN.fit(x_train, y_train)  # train on the training split
    y_pred = KNN.predict(x_test)  # predict the held-out split
    print(y_pred)
    print(KNN.score(x_test, y_test))  # accuracy straight from the estimator

    # 1-5 Validation: overall accuracy (normalize=False as a third argument
    # would return the number of correct predictions) and the confusion matrix
    # (a diagonal matrix means all predictions are correct).
    print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
    from sklearn.model_selection import train_test_split  # train/test splitting
    from sklearn.neighbors import KNeighborsClassifier

    # Re-split the digits data with a 20% test share for the parameter search.
    x_train, x_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=0
    )

    # 1-6 Manual search for the best KNN hyper-parameters: the number of
    # neighbours k and the weighting scheme ("uniform" or "distance").
    best_method = ""
    best_score = 0.0
    best_k = 0
    s = []  # every score, in evaluation order: all k for "uniform", then "distance"
    for method in ["uniform", "distance"]:
        for k in range(1, 11):
            KNN = KNeighborsClassifier(n_neighbors=k, weights=method)
            KNN.fit(x_train, y_train)  # train on the training split
            score = KNN.score(x_test, y_test)  # accuracy on the test split
            s.append(score)
            # Strict '>' keeps the first combination that reaches the top score.
            if score > best_score:
                best_score = score
                best_k = k
                best_method = method

    # Report the winning combination.
    print("best_method=", best_method)
    print("best_k=", best_k)
    print("best_score=", best_score)

    # Plot all scores in evaluation order (one x position per evaluated model).
    plt.figure(2)
    x = list(range(1, len(s) + 1))
    plt.plot(x, s, "r")
    plt.show()

    # Manual search over k and the Minkowski exponent p, with distance weighting.
    from sklearn.neighbors import KNeighborsClassifier

    best_p = 0
    best_score = 0.0
    best_k = 0
    s = []  # scores in evaluation order: five consecutive entries (p=1..5) per k
    for k in range(1, 11):
        for p in range(1, 6):
            KNN = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
            KNN.fit(x_train, y_train)  # train on the training split
            score = KNN.score(x_test, y_test)  # accuracy on the test split
            s.append(score)
            # Strict '>' keeps the first combination that reaches the top score.
            if score > best_score:
                best_score = score
                best_k = k
                best_p = p

    # Report the winning combination.
    print("best_p=", best_p)
    print("best_k=", best_k)
    print("best_score=", best_score)

    # One curve per k: accuracy as a function of p.
    plt.figure(2)
    x = [i for i in range(1, 6)]
    for i in range(1, 11):
        s1 = s[5 * (i - 1):5 * i]  # the five scores recorded for k == i
        plt.plot(x, s1, label=i)
    plt.legend(loc=2)
    plt.show()

    # 1-7 Let scikit-learn's GridSearchCV search the hyper-parameter grid.
    from sklearn.model_selection import GridSearchCV

    # Each dict is one sub-grid; every hyper-parameter maps to the list of
    # candidate values to try. p is only searched together with distance
    # weighting.
    neighbor_choices = list(range(1, 11))
    param_grid = [
        {
            "weights": ["uniform"],
            "n_neighbors": list(neighbor_choices),
        },
        {
            "weights": ["distance"],
            "n_neighbors": list(neighbor_choices),
            "p": list(range(1, 6)),
        },
    ]

    knn_clf = KNeighborsClassifier()
    grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
    grid_search.fit(x_train, y_train)

    # Best estimator, its parameters and its score as reported by the search.
    print(grid_search.best_estimator_)
    print(grid_search.best_params_)
    print(grid_search.best_score_)


    # 1-8 Feature scaling before KNN.
    import numpy as np
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import StandardScaler

    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    print(x[:10])
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=666
    )

    # 1-8-1 Standardise with mean and standard deviation. The scaler is fitted
    # on the training split only and then applied to both splits.
    standardscaler = StandardScaler()
    standardscaler.fit(x_train)
    print(standardscaler.mean_)   # per-feature mean
    print(standardscaler.scale_)  # per-feature standard deviation
    print(standardscaler.transform(x_train))
    x_train = standardscaler.transform(x_train)
    print(x_train)
    x_test_standard = standardscaler.transform(x_test)

    # Fit KNN on the scaled training data and score it on the scaled test data.
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    print(knn.score(x_test_standard, y_test))

    # 1-8-2 Scale the features with MinMaxScaler instead.
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MinMaxScaler

    # Start again from the unscaled data with the same split as before.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=666
    )

    # Fit the scaler on the training split only, then apply it to both splits.
    standardscaler1 = MinMaxScaler()
    standardscaler1.fit(x_train)
    x_train = standardscaler1.transform(x_train)
    print(x_train)
    x_test_standard1 = standardscaler1.transform(x_test)

    # Fit KNN on the scaled training data and score it on the scaled test data.
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    print(x_test_standard1)
    print(knn.score(x_test_standard1, y_test))
    (2)解决回归问题的代码如下:
    # 1-1 Train a KNN regressor on the Boston housing data and predict with it.
    import numpy as np
    import matplotlib.pyplot as plt  # plotting / data visualisation
    from sklearn import datasets
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.metrics import r2_score  # library function for R^2
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor

    # NOTE(review): load_boston is not available in recent scikit-learn
    # releases - confirm the installed version still provides it.
    d = datasets.load_boston()
    print(d.data)
    print(d.DESCR)
    print(d.feature_names)
    print(d.data[:, 5])

    # Keep only the samples whose target is below 50.
    below_cap = d.target < 50
    x = d.data[below_cap]
    y = d.target[below_cap]

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)

    # Default KNN regressor: fit, predict, then report the estimator's own
    # score followed by the mean squared and mean absolute errors.
    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)
    y_pre = knn.predict(x_test)
    print(knn.score(x_test, y_test))
    print(mean_squared_error(y_test, y_pre))
    print(mean_absolute_error(y_test, y_pre))
    # 1-2 Use a grid search to find the best hyper-parameter combination for
    # the KNN regressor.
    from sklearn.model_selection import GridSearchCV

    # Two sub-grids: uniform weighting over k, and distance weighting over
    # k together with the Minkowski exponent p.
    param = [
        {
            "n_neighbors": list(range(1, 11)),
            "weights": ["uniform"],
        },
        {
            "weights": ["distance"],
            "n_neighbors": list(range(1, 11)),
            "p": list(range(1, 6)),
        },
    ]

    knn1 = KNeighborsRegressor()
    grid1 = GridSearchCV(knn1, param, n_jobs=-1, verbose=2)
    grid1.fit(x_train, y_train)

    print(grid1.best_params_)
    print(grid1.best_estimator_)
    print(grid1.best_estimator_.score(x_test, y_test))

    # Use the best estimator directly for prediction and scoring.
    k = grid1.best_estimator_
    print(k.predict(x_test))
    print(k.score(x_test, y_test))
    最终实现效果如下所示;

    
    
     









  • 相关阅读:
    存储过程的优缺点?
    springboot注解
    springboot处理事务
    索引面试问题
    剑指Offer编程题(Java实现)——替换空格
    剑指Offer编程题(Java实现)——二维数组中的查找
    Java集合概括总结及集合选用
    Java中获取大小:length、length()、size()
    解决MarkDown打开出现:awesomium web-brower framework This view has crashed
    Java集合:Collection、List、Set、Map、泛型
  • 原文地址:https://www.cnblogs.com/Yanjy-OnlyOne/p/11302685.html
Copyright © 2020-2023  润新知