1、kNN 算法
算法说明:
set<X1,X2……Xn> 为已知类别数据集,预测 点Xt 的类别:
(1)计算 set 中每一个点与 Xt 的距离
(2)按距离增序排列
(3)选择距离最小的前k个点
(4)确定前k个点所在的类别的出现频率
(5)返回频率最高的类别作为测试的结果
# NOTE: the wildcard import is kept on purpose -- the other functions in this
# file use bare numpy names (array, zeros, tile, shape, ...).
from numpy import *
import operator


def createDataSet():
    """Return a tiny labelled sample set for experimenting with kNN.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 numpy array of points
        and ``labels`` is the list of class names, one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: The point to classify (one value per feature).
        dataSet: 2-D array-like of known points, one row per sample.
        labels: Sequence of class labels, ``labels[i]`` belongs to row ``i``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label with the most votes among the nearest neighbours. When
        several labels tie, the one that comes first in the vote dictionary's
        iteration order wins.

    Raises:
        IndexError: If no neighbour can vote (``k < 1`` or empty ``dataSet``).
    """
    dataSet = asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]  # number of rows (known samples)

    # Broadcasting subtracts inX from every row; no explicit tile() is needed.
    diffMat = dataSet - asarray(inX, dtype=float)
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1 sums across each row
    distances = sqDistances ** 0.5

    # argsort returns the row indices ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()

    # Never index past the available samples when k is too large.
    voters = k if k < dataSetSize else dataSetSize

    classCount = {}
    for i in range(voters):
        voteLabel = labels[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sort the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
2、加载数据
下载地址:http://pan.baidu.com/s/1c0NeKCg
数据格式:[fre flier miles earned per year]' '[per of time spent playing video games]' '[liters of ice cream consumed per year]' '[1,means do not at all/2,means small do/3,means large do]
# Load the data set
def file2matrix(filename):
    """Parse a whitespace-separated data file into features and labels.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
        ``(n, 3)`` float array with the first three columns of each line and
        ``classLabelVector`` is the list of ``n`` integer labels.

    Raises:
        IndexError: If a line has fewer than three feature columns.
        ValueError: If a field cannot be converted to a number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # split() without arguments accepts spaces, tabs and repeated
        # separators; blank lines are skipped instead of crashing the parser.
        rows = [line.split() for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        for x in range(3):
            returnMat[index, x] = float(fields[x])
        classLabelVector.append(int(fields[-1]))  # last column is the label
    return returnMat, classLabelVector
3、散点图
import matplotlib
import matplotlib.pyplot as plt

# Load the feature matrix and the class labels of the dating samples.
datingDataMat, datingLabels = kNN.file2matrix('datingTestSet.txt')

# figure() creates a new drawing object.
fig = plt.figure()
# 111 means a 1x1 grid with the plot in cell 1. As an example, 349 would
# split the canvas into 3 rows and 4 columns and draw in the 9th cell,
# counting left to right, top to bottom.
ax = fig.add_subplot(111)

# Author's notes on matplotlib.pyplot.scatter (signature as quoted in the
# original post; it may differ in newer matplotlib releases):
#   scatter(x, y, s=20, c='b', marker='o', cmap=None, norm=None, vmin=None,
#           vmax=None, alpha=None, linewidths=None, verts=None, hold=None,
#           **kwargs)
#   - x, y are the point coordinates; s is the point size.
#   - marker sets the shape, e.g. marker=(5, 1): 5 is the number of sides and
#     1 selects a star (0 = polygon, 2 = radial lines, 3 = circle).
#   - alpha is the transparency; facecolor='none' leaves markers unfilled.

# Plot column 1 against column 2; the marker size scales with the class label.
ax.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 2],
    15.0 * array(datingLabels),
    marker=(5, 1),
    alpha=0.5
)
plt.show()
4、归一化特征值
由于特征值的大小不同,所以就会对结果的影响程度不同。这就需要我们归一化特征值,把每个特征值的大小固定在[0,1]:
range = MaxVal - MinVal
normVal = (rawVal - MinVal) / (MaxVal - MinVal)
# Normalize the feature values
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the interval [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column is
    mapped to 0 instead of producing NaN/inf from a division by zero.

    Args:
        dataSet: 2-D array-like, one row per sample and one column per feature.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalized float array,
        the per-column ``max - min`` values (zeros are left as computed) and
        the per-column minimums.
    """
    # Work in floating point so integer input is not floor-divided.
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0: column-wise minimum
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 so they normalize to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
5.分类器测试
用10%的数据作为输入来测试,另外90%作为已知集合
def datingClassTest(hoRatio=0.10, filename='datingTestSet.txt', k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``hoRatio`` fraction of the rows is used as test input and the
    remaining rows serve as the known (training) set. Every prediction and the
    final error rate are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing (default 10%).
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of held-out rows that were misclassified.

    Raises:
        ZeroDivisionError: If the hold-out slice is empty, i.e.
            ``int(rows * hoRatio)`` is 0.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # Rows [numTestVecs, m) are the known samples for every test vector.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        # Parenthesized single-argument print works on Python 2 and 3 alike.
        print("back %d ,real %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    errorRate = errorCount / float(numTestVecs)
    print("range is %f" % errorRate)
    return errorRate
6、约会网站测试
# Dating-site prediction function
def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    Reads three numbers from standard input, normalizes them with the minimums
    and ranges of the data in 'datingTestSet.txt', classifies the result with
    the 3 nearest neighbours and prints the matching description.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large dose']

    # raw_input exists only on Python 2; input is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine("per of time spent playing video games?"))
    ffMiles = float(readLine("fre flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Feature order must match the file's columns: miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])

    # A feature that is constant in the data has range 0; divide it by 1 so the
    # query does not turn into inf/NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges,
                                 normMat, datingLabels, 3)

    # Labels are 1-based while resultList is 0-based. One print call yields the
    # same single output line on Python 2 and Python 3.
    print("You will probably like this person : %s"
          % resultList[classifierResult - 1])