# A Python implementation of the KNN algorithm
'''
KNN in brief: given a sample X, KNN classification finds the K nearest
neighbors of X in the data set and assigns X the class label that occurs
most often among those K neighbors.
'''
import numpy as np


# Build a tiny sample data set
def create_sample():
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    label = ['A', 'A', 'B', 'B']
    return group, label


def classify(x, label, group, K):
    '''
    Determine the class of sample x.
    x     -- the sample to classify
    label -- class labels of the data set
    group -- the data set (one row per sample)
    K     -- the K in KNN
    '''
    groupSize = group.shape[0]
    # Tile x so it has as many rows as the data set, then subtract
    temp = np.tile(x, (groupSize, 1)) - group
    # Euclidean distance: ((x0-x1)^2 + (y0-y1)^2)^(1/2)
    sqDistances = (temp ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # Sort ascending and return the indices
    sortedDistIndicies = distances.argsort()
    cu = {}
    # Count the labels of the K points closest to x
    for i in range(K):
        cur = label[sortedDistIndicies[i]]   # label of the i-th nearest neighbor
        cu[cur] = cu.get(cur, 0) + 1
    # Sort the dict by value, descending (returns a list of tuples)
    sortedcu = sorted(cu.items(), key=lambda c: c[1], reverse=True)
    return sortedcu[0][0]

# p, q = create_sample()
# print(classify([0, 0], q, p, 3))


def file2Matrix(filename):
    '''
    Return the attribute matrix returnMat and the class labels
    classLabelVector read from a file.
    '''
    with open(filename) as fr:
        lines = fr.readlines()
    numberOfLines = len(lines)
    returnMat = np.zeros((numberOfLines, 3))   # attribute matrix
    classLabelVector = []                      # class labels
    index = 0
    for line in lines:
        line = line.strip()
        listFromLine = line.split(' ')   # fields on each line are separated by a space
        returnMat[index, :] = [float(x) for x in listFromLine[0:3]]
        if listFromLine[-1] == 'largeDoses' or listFromLine[-1] == '3':
            classLabelVector.append(3)
        elif listFromLine[-1] == 'smallDoses' or listFromLine[-1] == '2':
            classLabelVector.append(2)
        elif listFromLine[-1] == 'didntLike' or listFromLine[-1] == '1':
            classLabelVector.append(1)
        index += 1
    return returnMat, classLabelVector


def autoNorm(dataSet):
    '''
    The attributes have very different ranges, so normalize before classifying
    to keep large-valued attributes from drowning out small-valued ones.
    Normalization: (data - min_val) / (max_val - min_val)
    '''
    min_val = dataSet.min(0)   # min along axis 0, i.e. per column
    max_val = dataSet.max(0)
    ranges = max_val - min_val
    normDataSet = dataSet - np.tile(min_val, (dataSet.shape[0], 1))
    return normDataSet / ranges, ranges

# returnMat, classLabelVector = file2Matrix("tt.txt")
# print(returnMat.shape)
# normMat, ranges = autoNorm(returnMat)
# print(normMat)


def datingClassTest(K=4):
    '''
    Classify every input vector in the file: half of the rows serve as the
    training set and the other half as the test set.
    '''
    ratio = 0.5
    returnMat, classLabelVector = file2Matrix("tt.txt")
    autoMat, ranges = autoNorm(returnMat)
    index = int(autoMat.shape[0] * ratio)   # number of test vectors
    errorCount = 0
    for i in range(index):
        result = classify(autoMat[i, :],
                          classLabelVector[index:autoMat.shape[0]],
                          autoMat[index:autoMat.shape[0], :],
                          K)
        print(result, classLabelVector[i])
        if result != classLabelVector[i]:
            errorCount += 1
    print("the total error rate is: %f" % (errorCount / float(index)))
    print(errorCount)


datingClassTest()   # run the test
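As a quick sanity check, the hand-rolled classify can be exercised on the toy data from create_sample() and compared with scikit-learn's KNeighborsClassifier. This is a minimal sketch, not part of the original listing; it assumes the snippet is appended after the definitions above and that scikit-learn is installed.

# Optional sanity check (append after the definitions above).
# Assumes scikit-learn is installed; it is not required by the listing itself.
from sklearn.neighbors import KNeighborsClassifier

group, label = create_sample()

# Hand-rolled KNN: majority vote among the 3 nearest neighbors of [0, 0]
print(classify([0, 0], label, group, 3))        # -> 'B'

# The same query answered by scikit-learn for comparison
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(group, label)
print(clf.predict([[0, 0]])[0])                 # -> 'B'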
Running the program produces the following result (the per-sample predictions printed inside the loop are omitted here):
the total error rate is: 0.400000
2
Since tt.txt has 10 rows and ratio = 0.5, the first 5 rows are used as test vectors and the last 5 as the training set; 2 of the 5 test vectors are misclassified, giving an error rate of 2/5 = 0.4 (the trailing 2 is errorCount).
The tt.txt file used here is:
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
40910 8.326376 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441271 0.805154 didntLike
41920 8.326976 0.653952 largeDoses
14488 7.11469 1.673904 smallDoses
25052 1.441771 0.805124 didntLike
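To reproduce the run, the ten rows above can be written to tt.txt and the pipeline driven by hand. A minimal sketch, assuming file2Matrix, autoNorm and datingClassTest from the listing are already defined in the session (for example by running the listing with the final datingClassTest() call commented out until tt.txt exists):

# Reproduction sketch: write the sample data, normalize it, run the test.
rows = """40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
40910 8.326376 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441271 0.805154 didntLike
41920 8.326976 0.653952 largeDoses
14488 7.11469 1.673904 smallDoses
25052 1.441771 0.805124 didntLike"""

with open("tt.txt", "w") as f:
    f.write(rows + "\n")

returnMat, labels = file2Matrix("tt.txt")        # 10 x 3 attribute matrix plus labels
normMat, ranges = autoNorm(returnMat)            # each column rescaled to [0, 1]
print(normMat.min(axis=0), normMat.max(axis=0))  # per-column min 0, max 1
datingClassTest(K=4)                             # reproduces the error rate shown above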