注释写得很清楚了，借此熟悉了一下 Python 的一些基本语法和 NumPy 中的一些操作。
# NOTE: the star import is retained only so that every NumPy name this module
# has always exposed stays available to existing users; the code below goes
# through the explicit ``np`` alias instead.
from numpy import *
import operator

import numpy as np


def createDataSet():
    """Build the tiny demo training set used by the script below.

    The sample matrix is printed as a side effect, exactly as before.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of sample
        points and ``labels`` is a list holding one class label per row.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    print(group)
    return group, labels


def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: The point to classify; one coordinate per column of ``dataSet``.
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.
        When several labels share the top count, the winner is whichever of
        them the vote dictionary yields first.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples (previously this surfaced as an unexplained IndexError).
    """
    samples = np.asarray(dataSet, dtype=float)
    dataSetSize = samples.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Broadcasting subtracts inX from every row; no need to tile it first.
    diffMat = samples - np.asarray(inX, dtype=float)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))

    # A stable sort keeps equally distant samples in their original row
    # order, so the chosen neighbours are reproducible.
    nearest = distances.argsort(kind='mergesort')[:k]

    classCount = {}
    for index in nearest:
        voteLabel = labels[index]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Only the single most frequent label is needed, so max() replaces a
    # full sort of the vote counts.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


dataSet, labels = createDataSet()
print(classify([-100.0, -100.1], dataSet, labels, 1))