"""k-nearest-neighbour classification of 32x32 text-encoded digit images."""
import operator
# listdir() lists the file names in a given directory
from os import listdir

import numpy as np


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean.

    Args:
        inX: Input vector to classify; must broadcast against a row of
            ``dataSet``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that vote.  A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label that received the most votes.  On a tie, the tied label
        that was seen first while walking the neighbours from nearest to
        farthest wins.

    Raises:
        IndexError: If nobody votes (``k < 1`` or ``dataSet`` has no rows).
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)

    # Euclidean distance from inX to every training sample.  Broadcasting
    # subtracts inX from each row, so no explicit np.tile copy is needed.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Indices of the training samples ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Never ask for more neighbours than there are samples; indexing past
    # the end of sortedDistIndicies would raise IndexError.
    k = min(k, dataSetSize)

    # Tally the labels of the k nearest samples.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Stable sort by vote count, largest first; ties keep insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]


def img2vector(filename, size=32):
    """Read a ``size`` x ``size`` grid of digit characters into a flat vector.

    The first ``size`` lines of the file are read, and the first ``size``
    characters of each line are converted with ``int`` and stored row by row.

    Args:
        filename: Path of the text file to read.
        size: Number of rows and of columns to read (default 32).

    Returns:
        A 1-D float array of length ``size * size``.

    Raises:
        IndexError: If the file has fewer than ``size`` lines or a line has
            fewer than ``size`` characters.
        ValueError: If a character read is not a decimal digit.
    """
    returnVect = np.zeros(size * size)
    # The context manager guarantees the file is closed, even on error.
    with open(filename) as fr:
        for i in range(size):
            lineStr = fr.readline()
            for j in range(size):
                # Row i, column j lands at flat position size * i + j.
                returnVect[size * i + j] = int(lineStr[j])
    return returnVect


def _labelFromFileName(fileNameStr):
    """Return the integer before the first '_' of a name like '3_12.txt'."""
    fileStr = fileNameStr.split('.')[0]  # drop the extension
    return int(fileStr.split('_')[0])


def handwriteClassTest(trainingDir="trainingDigits", testDir="testDigits",
                       k=3):
    """Run the kNN classifier over a directory of test digits and report.

    Every file in ``trainingDir`` becomes one training sample whose label is
    parsed from its file name.  Every file in ``testDir`` is then classified
    with ``classify0``; each prediction, the total number of errors and the
    error rate are printed.

    Args:
        trainingDir: Directory holding the training files.
        testDir: Directory holding the test files.
        k: Number of neighbours passed to ``classify0``.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)  # number of training files
    trainingMat = np.zeros((m, 1024))  # m samples, 1024 values per sample
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_labelFromFileName(fileNameStr))
        # Replace the all-zero row with the sample read from the file.
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat,
                                     hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print(" the total number of errors is: %d" % errorCount)
    # Avoid ZeroDivisionError when the test directory is empty.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print(" the total error rate is: %f" % errorRate)


if __name__ == "__main__":
    handwriteClassTest()