• 关于KNN的python3实现


      关于KNN,有幸看到这篇文章,写得很好,这里就不再赘述。直接贴上代码了,有小的改动。(原文是python2版本的,这里改为python3版本,主要改动就是print)

      环境:win7 32bit + spyder + anaconda3.5

      一、初阶

    # -*- coding: utf-8 -*-
    """
    Created on Sun Nov  6 16:09:00 2016
    
    @author: Administrator
    """
    
    #Input:
    #	newInput:待测的数据点(1xM)
    #	dataSet:已知的数据(NxM)
    #	labels:已知数据的标签(1xN)
    #	k:选取的最邻近数据点的个数
    #
    #Output:
    #	待测数据点的分类标签
    #	
    
    from numpy import *
    
    # create a dataset which contains 4 samples in 2 classes
    def createDataSet():
        """Build the toy training set used by the demo.

        Returns:
            A ``(group, labels)`` pair. ``group`` is a 4x2 array holding one
            sample per row; ``labels`` is a list with the class name of the
            corresponding row.
        """
        # Rows 0-1 carry label 'A', rows 2-3 carry label 'B'.
        labels = ['A', 'A', 'B', 'B']
        group = array([
            [1.0, 0.9],
            [1.0, 1.0],
            [0.1, 0.2],
            [0.0, 0.1],
        ])
        return group, labels
    	
    
    #classify using KNN
    def KNNClassify(newInput, dataSet, labels, k):
        """Classify one point by majority vote among its k nearest neighbours.

        Args:
            newInput: the point to classify; it is broadcast against the rows
                of dataSet, so it must have one value per feature column.
            dataSet: array of known samples, one sample per row.
            labels: sequence of labels; labels[i] belongs to row i of dataSet.
            k: number of nearest neighbours that vote. A value larger than
                the number of samples is clamped to the number of samples.

        Returns:
            The label with the most votes. On a tie, the tied label that was
            seen first while walking the neighbours from nearest to farthest.

        Raises:
            ValueError: if k is smaller than 1 or dataSet has no rows.
        """
        numSamples = dataSet.shape[0]  # number of rows
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        if numSamples == 0:
            raise ValueError("dataSet must contain at least one sample")
        # Written as a conditional on purpose: `from numpy import *` may
        # shadow the builtin min().
        numNeighbours = k if k < numSamples else numSamples

        # step 1: Euclidean distance from newInput to every row of dataSet
        diff = newInput - dataSet
        distance = (diff ** 2).sum(axis=1) ** 0.5

        # step 2: indices of the samples ordered from nearest to farthest
        sortedDistIndices = distance.argsort()

        # step 3: count how often each label occurs among the nearest samples
        classCount = {}
        for sampleIndex in sortedDistIndices[:numNeighbours]:
            voteLabel = labels[sampleIndex]
            classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

        # step 4: pick the label with the highest count; the strict ">" keeps
        # the first label seen when counts are equal
        bestLabel = None
        bestCount = 0
        for label, count in classCount.items():
            if count > bestCount:
                bestCount = count
                bestLabel = label

        return bestLabel
    
    		
    # test
    
    # Demo: classify two query points against the toy data set and print
    # the predicted class of each.
    dataSet, labels = createDataSet()
    k = 3  # number of nearest neighbours that vote

    for testX in (array([1.2, 1.0]), array([0.1, 0.3])):
        outputLabel = KNNClassify(testX, dataSet, labels, k)
        print("Your input is:", testX, "and classified to class: ", outputLabel)
    

      运行结果:

      二、进阶

      用到的手写识别数据库资料在这里下载。关于资料的介绍在上面的博文也已经介绍的很清楚了。

    # -*- coding: utf-8 -*-
    """
    Created on Sun Nov  6 16:09:00 2016
    
    @author: Administrator
    """
    
    #Input:
    #	newInput:待测的数据点(1xM)
    #	dataSet:已知的数据(NxM)
    #	labels:已知数据的标签(1xN)
    #	k:选取的最邻近数据点的个数
    #
    #Output:
    #	待测数据点的分类标签
    #	
    
    from numpy import *
    
    
    
    #classify using KNN
    def KNNClassify(newInput, dataSet, labels, k):
        """Classify one point by majority vote among its k nearest neighbours.

        Args:
            newInput: the point to classify; it is broadcast against the rows
                of dataSet, so it must have one value per feature column.
            dataSet: array of known samples, one sample per row.
            labels: sequence of labels; labels[i] belongs to row i of dataSet.
            k: number of nearest neighbours that vote. A value larger than
                the number of samples is clamped to the number of samples.

        Returns:
            The label with the most votes. On a tie, the tied label that was
            seen first while walking the neighbours from nearest to farthest.

        Raises:
            ValueError: if k is smaller than 1 or dataSet has no rows.
        """
        numSamples = dataSet.shape[0]  # number of rows
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        if numSamples == 0:
            raise ValueError("dataSet must contain at least one sample")
        # Written as a conditional on purpose: `from numpy import *` may
        # shadow the builtin min().
        numNeighbours = k if k < numSamples else numSamples

        # step 1: Euclidean distance from newInput to every row of dataSet
        diff = newInput - dataSet
        distance = (diff ** 2).sum(axis=1) ** 0.5

        # step 2: indices of the samples ordered from nearest to farthest
        sortedDistIndices = distance.argsort()

        # step 3: count how often each label occurs among the nearest samples
        classCount = {}
        for sampleIndex in sortedDistIndices[:numNeighbours]:
            voteLabel = labels[sampleIndex]
            classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

        # step 4: pick the label with the highest count; the strict ">" keeps
        # the first label seen when counts are equal
        bestLabel = None
        bestCount = 0
        for label, count in classCount.items():
            if count > bestCount:
                bestCount = count
                bestLabel = label

        return bestLabel
    
    		
    
    # convert image to vector  
    def img2vector(filename):
        """Read a 32x32 text image of digits into a 1x1024 row vector.

        The file is read line by line; the first 32 characters of each of the
        first 32 lines are converted with int() and stored row after row.

        Args:
            filename: path of the text file to read.

        Returns:
            An array of shape (1, 1024) holding the parsed values.

        Raises:
            IndexError: if one of the first 32 lines has fewer than 32 characters.
            ValueError: if a character cannot be converted with int().
        """
        rows = 32
        cols = 32
        imgVector = zeros((1, rows * cols))
        # "with" closes the file even when parsing fails part-way through.
        with open(filename) as fileIn:
            for row in range(rows):
                lineStr = fileIn.readline()
                rowStart = row * cols
                imgVector[0, rowStart:rowStart + cols] = [
                    int(lineStr[col]) for col in range(cols)
                ]

        return imgVector
    
    
    # load dataSet  
    def _loadDigitDir(dirPath):
        """Read every digit file in dirPath into a sample matrix and a label list."""
        # Local import: os is not imported explicitly anywhere in this script.
        import os

        fileList = os.listdir(dirPath)
        samples = zeros((len(fileList), 1024))
        digitLabels = []
        for i, filename in enumerate(fileList):
            samples[i, :] = img2vector(os.path.join(dirPath, filename))
            # The label is the part of the file name before '_',
            # e.g. "1_18.txt" -> 1.
            digitLabels.append(int(filename.split('_')[0]))
        return samples, digitLabels


    def loadDataSet(dataSetDir='F:\\Techonolgoy\\算法学习\\KNN\\进阶\\'):
        """Load the training and testing digit sets from dataSetDir.

        The directory is expected to contain two sub-directories,
        'trainingDigits' and 'testDigits', whose file names start with the
        digit label followed by '_' (for example "1_18.txt").

        Args:
            dataSetDir: directory holding the two sub-directories. The
                default is the location hard-coded in the original script.

        Returns:
            A tuple (train_x, train_y, test_x, test_y). The *_x values are
            arrays with one 1024-value sample per row; the *_y values are
            lists of integer labels in the same order.
        """
        # Local import: os is not imported explicitly anywhere in this script.
        import os

        ## step 1: Getting training set
        print("---Getting training set...")
        train_x, train_y = _loadDigitDir(os.path.join(dataSetDir, 'trainingDigits'))

        ## step 2: Getting testing set
        print("---Getting testing set...")
        test_x, test_y = _loadDigitDir(os.path.join(dataSetDir, 'testDigits'))

        return train_x, train_y, test_x, test_y
      
    # test hand writing class  
    def testHandWritingClass(k=3):
        """Classify every handwritten-digit test sample and print the accuracy.

        Args:
            k: number of nearest neighbours that vote for each test sample.

        Raises:
            ValueError: if the test set is empty, since accuracy is undefined.
        """
        ## step 1: load data
        print("step 1: load data...")
        train_x, train_y, test_x, test_y = loadDataSet()

        ## step 2: training...
        # Nothing is computed in this step; the message is kept so the
        # printed log stays the same.
        print("step 2: training...")

        ## step 3: testing
        print("step 3: testing...")
        numTestSamples = test_x.shape[0]
        if numTestSamples == 0:
            raise ValueError("test set is empty; accuracy is undefined")
        matchCount = 0
        # An explicit loop is used instead of sum(): `from numpy import *`
        # replaces the builtin sum().
        for sample, expected in zip(test_x, test_y):
            if KNNClassify(sample, train_x, train_y, k) == expected:
                matchCount += 1
        accuracy = matchCount / numTestSamples

        ## step 4: show the result
        print("step 4: show the result...")
        print('The classify accuracy is: %.2f%%' % (accuracy * 100))
    
    
    
    # Entry point: run the handwritten-digit evaluation.
    testHandWritingClass()
    

      运行结果:

  • 相关阅读:
    Python学习第151天(Django之多对多)
    Python学习第150天(目前正在做的内容介绍)
    挑战日语学习100天:Day11
    挑战日语学习100天:Day10
    hdu3853 LOOPS 期望dp
    最长公共子串
    基于后缀数组的字符串匹配
    高度数组模板
    Jenkins持续集成自动化测试
    自动化上传文件
  • 原文地址:https://www.cnblogs.com/buzhizhitong/p/6036417.html
Copyright © 2020-2023  润新知