• KNN算法的实现


    KNN算法是机器学习经典十大算法之一,简单易懂。这里给出KNN的实现,有两个版本:

    1.机器学习实战上作者的实现版本,我自己又敲了一遍感觉还是蛮有收获的;

    2.用自己的理解写的一个实现,主要的区别就是效率没有第一个高,因为第一个大量使用矩阵向量的运算,速度比较快,还有就是作者的代码比较简洁好看。自己的代码就是比较好懂。

    1.《机器学习实战》的代码

      1 # -*- coding: utf-8 -*-
      2 
      3 '''
      4  function: 根据《实战》实现KNN算法,用于约会网站进行匹配
      5  date: 2017.8.5
      6 '''
      7 
      8 from numpy import *
      9 import matplotlib.pyplot as plt
     10 import operator
     11 
     12 #产生数据集和标签
     13 def createDataSet():
     14     group = array([[1.0,1.1], [1.0,1.0], [0,0], [0,0.1]])
     15     labels = ['A', 'A', 'B', 'B']
     16     return group, labels
     17 
     18 #knn分类算法
     19 def classify0(inX, dataSet, labels, k):
     20     dataSetSize = dataSet.shape[0] #返回m,n,shape[0] == m 行数,代表由几个训练数据
     21 
     22     #计算距离;将测试向量 inX 在咧方向上重复一次,行方向重复m次,与对应的训练数据相减
     23     diffMat = tile(inX, (dataSetSize, 1)) - dataSet 
     24 
     25     sqDiffMat = diffMat**2 #平方的意思,不能用^
     26     sqDistances = sqDiffMat.sum(axis=1) #axis=1表示按行相加,axis=0表示案列相加
     27     distances = sqDistances**0.5 #测试点距离所有训练点的距离向量集合
     28 
     29     sortedDistIndices = distances.argsort() #距离从小到大排序,返回的是排序后的下表组成的数组;
     30     classCount = {}
     31 
     32     for i in range(k):
     33         voteIlabel = labels[sortedDistIndices[i]]
     34         classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 #get函数返回value,若不存在则返回0
     35     
     36     #items()返回key,value,itemgetter(1),按照第二个元素进行排序,从大到小
     37     sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1),reverse=True)
     38 
     39     return sortedClassCount[0][0]
     40 
     41 #将文本记录转换为nmpy的解析程序
     42 def file2matrix(filename):
     43     fr = open(filename)
     44     arrayOLines = fr.readlines()
     45     numberOfLines = len(arrayOLines)
     46     returnMat = zeros((numberOfLines,3)) #返回的矩阵 是: m*3,3个特征值
     47     classLabelVector = []
     48     index = 0
     49     for line in arrayOLines:
     50         line = line.strip() #去掉换行符
     51         listFromLine = line.split('	') #将每一行用tab分割
     52         returnMat[index,:] = listFromLine[0:3] #每行前三个是特征
     53         classLabelVector.append(int(listFromLine[-1])) #最后一列是标签,并且告诉list是int类型
     54         index += 1
     55     return returnMat, classLabelVector
     56 
     57 #draw the point
     58 def drawPoint(datingDatamat, datingLabels):
     59     fig = plt.figure()
     60     ax = fig.add_subplot(111)
     61 
     62     #总共三个特征,可以选择其中任意两个来画图,看哪个区分效果好
     63     ax.scatter(datingDatamat[:,0],datingDatamat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
     64     #ax.scatter(datingDatamat[:,1],datingDatamat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))
     65     plt.show()
     66 
     67 
     68 #对数据集归一化处理 autoNormanization
     69 def autoNorm(dataSet):
     70     minVals = dataSet.min(0) #get the minimal row
     71     maxVals = dataSet.max(0) #get the maximal row
     72     ranges = maxVals - minVals + ones_like(maxVals) #个人感觉这里可能由除〇的风险,加一效果可能好一点
     73     normDataSet = zeros(shape(dataSet)) #the same size of original dataset: m*n
     74     m = dataSet.shape[0]
     75     normDataSet = dataSet - tile(minVals, (m,1)) #repeat the minimal row m times on the row
     76     normDataSet = dataSet / tile(ranges, (m,1))
     77     return normDataSet, ranges, minVals
     78 
     79 #分类器针对约会网站的测试代码
     80 def datingClassTest():
     81     haRatio = 0.10
     82     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
     83     normMat, ranges, minVals = autoNorm(datingDataMat)
     84     m = normMat.shape[0] #获得矩阵的行数,数据集的个数
     85     numTestVecs = int(m*haRatio) #用十分之一的数据来做测试,剩下的作为训练数据
     86     errorCount = 0.0
     87     for i in range(numTestVecs): #对每个测试用例开始计算
     88         #0~numtestvecs都是用来测试的,每次测试一行,从numtestvecs~m都是训练数据
     89         classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3)
     90         print('the classifier came back with : %d, the real answer is : %d'
     91          % (classifierResult,datingLabels[i]))
     92         if(classifierResult != datingLabels[i]):
     93             errorCount += 1
     94     print('the total error rate is: %f' % (errorCount / float(numTestVecs)))
     95 
     96 
     97 #让用户输入自己的一些信息,用输入的信息进行分类
     98 def classifyPerson():
     99     resultList = ['not at all', 'in small doses', 'in large doses']
    100     percentTats = float(input('persentage of time spent playing video games?'))
    101     ffMiles = float(input('frequent fliers miles each year?'))
    102     iceCream = float(input('liters of ice cream consumed per year?'))
    103     datingDatamat, datingLabels = file2matrix('datingTestSet2.txt')
    104     normMat, ranges, minVals = autoNorm(datingDatamat)
    105     inArr = array([percentTats, ffMiles, iceCream])
    106     #对输入的一个测试数据进行归一化然后分类
    107     classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    108     print('you will probably like this person: ', resultList[classifierResult - 1])
    109     print(classifierResult)
    110     
    111 # #测试分类效果和加载数据的程序
    112 # group, labels = createDataSet()
    113 # result = classify0([0,0], group, labels, 3)
    114 # print(result)
    115 
    116 # #get numpy metrix from file
    117 # returnMat, classLabelVector = file2matrix('datingTestSet2.txt')
    118 # print(returnMat)
    119 # print(classLabelVector)
    120 
    121 # #auto normanization
    122 # normDataSet, ranges, minVals = autoNorm(returnMat)
    123 
    124 # #测试我们的分类器
    125 # datingClassTest()
    126 
    127 #请求用户输入几个特征,然后进行分类,判断喜爱程度
    128 classifyPerson()
    129 
    130 #draw
    131 #drawPoint(normDataSet, classLabelVector)

    2.自己的实现

      1 # -*- coding: utf-8 -*-
      2 
      3 '''
      4 function: 使用二维数据点集(x,y)来实现knn算法核心功能
      5 note: 计算欧式距离还可以用 numpy.linalg.norm(array1 - array2) 必须是array
      6 date: 2017.07.23
      7 '''
      8 from numpy import *
      9 from random import *
     10 import matplotlib.pyplot as plt
     11 
     12 #创建用来训练的数据集和用来测试的数据集,及标注信息,先用100个点训练,20个点测试
     13 def createData():
     14     trainingList0 = [[(1.0,1.0) for j in range(1)] for i in range(50)]
     15     trainingList1 = [[(1.0,1.0) for j in range(1)] for i in range(50)]
     16     trainingLabel,trainingLabe0 = [], []
     17     for i in range(50):
     18         x = randint(1,10)   #产生1,10之间的随机浮点数,类别0
     19         y = randint(1,10)
     20         trainingList0[i] = [(x,y)]
     21         trainingLabel.append(0) #属于类别0
     22     for j in range(50):
     23         x = randint(10,20) #产生十位数,10-20,类别1
     24         y = randint(10,20)
     25         trainingList1[j] = [(x,y)]
     26         trainingLabel.append(1)   #属于类别1
     27     trainingList = trainingList0 + trainingList1
     28 
     29     #产生测试的数据集
     30     testList0 = [[(1.0,1.0) for j in range(1)] for i in range(10)]
     31     testList1 = [[(1.0,1.0) for j in range(1)] for i in range(10)]
     32     testLabel = []
     33     for i in range(10):
     34         x = randint(4,9)
     35         y = randint(2,10)
     36         testList0[i] = [(x,y)]
     37         testLabel.append(0)
     38     for j in range(10):
     39         x = randint(11,19)
     40         y = randint(10,18)
     41         testList1[j] = [(x,y)]
     42         testLabel.append(1)
     43     testList = testList0 + testList1
     44     print(trainingList)
     45     return trainingList, trainingLabel, testList, testLabel
     46 
     47 #对测试数据集和训练数据集之间的距离进行计算,采用欧式距离 d = ((x1-x2)^2 + (y1-y2)^2)^0.5
     48 def calculateKNN(trainingList, trainingLabel, testList, testLabel, k):
     49     #20行,100列距离,每行代表一个测试点距离100个训练点的距离,初始化为0
     50     d = [[0 for j in range(100)] for i in range(20)] 
     51     #kPointDisList = [1000] * k #初始化k个最近距离为1000
     52     for i in range(20):
     53         for j in range(100):
     54             d[i][j] = getDistance(testList[i], trainingList[j]) #计算距离
     55             #这里判断求得的距离是否是最小的K个之一,若是则记录下j,更新此测试点的标签类别
     56             #kPointDisList = updateKPoint(d[i][j], kPointDisList)
     57     testLable = getKMin(d,k,trainingLabel)
     58     print(testLabel)
     59     return testLabel
     60 
     61 #计算列表里面的K个最小值,找出对应的标签类别号
     62 def getKMin(dList,k,trainingLabel):
     63     testLabel = []
     64     sortedIndexList = argsort(dList)  #返回dLlist从小到大排列的索引的数组
     65     print(sortedIndexList) 
     66     print(len(sortedIndexList))
     67     for x in range(20):
     68         type1 = 0 
     69         type0 = 0
     70         for i in range(k):  #计算最近的K个点,按照类别的多少进行投票表决
     71             if trainingLabel[sortedIndexList[x][i]] == 1:
     72                 type1 += 1
     73             else:
     74                 type0 += 1
     75         if type1 > type0:
     76             testLabel.append(1)
     77         else:
     78             testLabel.append(0)
     79     return testLabel
     80 
     81 #跟新最近的K个点坐标,标签
     82 def updateKPoint(d, kPointDisList):
     83     if d < max(kPointDisList):
     84         kPointDisList[kPointDisList.index(max(kPointDisList))] = d
     85     else:
     86         pass
     87     return kPointDisList
     88 
     89 #计算两个点的欧式距离,因为有float所以只能用**
     90 def getDistance(a, b):
     91     return ((a[0][0] - b[0][0])**2 + (a[0][1] - b[0][1])**2)**0.5
     92 
     93 #计算KNN分类结果的正确率
     94 def getCorrectRate(testLabel, resultLabel):
     95     correctNum = 0
     96     for i in range(len(testLabel)):
     97         if testLabel[i] == resultLabel[i]:
     98             correctNum += 1
     99     return correctNum / float(len(testLabel))
    100 
    101 #把分类结果用图形画出来
    102 def drawKNN(trainingList, testList):
    103     #产生画图数据
    104     x1,x2,y1,y2 = [],[],[],[]
    105     for i in range(len(trainingList)):
    106         x1.append(trainingList[i][0][0])
    107         y1.append(trainingList[i][0][1])
    108     for i in range(len(testList)):
    109         x2.append(testList[i][0][0])
    110         y2.append(testList[i][0][1])
    111     #创建一个绘图对象
    112     fig = plt.figure()
    113     ax1 = fig.add_subplot(111)
    114     #添加坐标图修饰 
    115     plt.xlabel('times(m)')
    116     plt.ylabel('money(y)')
    117     plt.title('first pic')
    118     #画散点图,maker代表散点图形状,c代表颜色
    119     pl1 = ax1.scatter(x1,y1,c = 'r',marker = '.')
    120     pl2 = ax1.scatter(x2,y2,c = 'b',marker = '+')
    121     #设置图标
    122     plt.legend([pl1,pl2],('train','test'))
    123     #显示图形
    124     plt.show()
    125     #保存图形到本地
    126     plt.savefig('test.png')
    127 #系统的总体控制逻辑
    128 def testKNN():
    129     #设定系统的K值
    130     k = 3
    131     trainingList, trainingLabel, testList, testLabel = createData()
    132     resultLabel = calculateKNN(trainingList, trainingLabel, testList, testLabel, k)
    133     correctRate = getCorrectRate(testLabel, resultLabel)
    134     print('correctRate is  =  ' + str(correctRate))
    135     drawKNN(trainingList, testList)
    136 
    137 testKNN()
    138 #drawKNN()
  • 相关阅读:
    P6007 [USACO20JAN]Springboards G
    CF1000F One Occurrence
    P6100 [USACO19FEB]Painting the Barn G
    P5838 [USACO19DEC]Milk Visits G
    P4085 [USACO17DEC]Haybale Feast
    P4267 [USACO18FEB]Taming the Herd
    P1712 [NOI2016]区间
    CF786B Legacy
    P5665 划分
    NOI2020网上同步赛 游记
  • 原文地址:https://www.cnblogs.com/robin2ML/p/7297474.html
Copyright © 2020-2023  润新知