• 机器学习(K邻近算法)


    算法描述

    K邻近算法采用测量不同特征值之间的距离方法进行分类

    工作原理

    存在一个样本数据集合,也称作训练样本集,并且样本集中每个数据都存在标签,即我们知道样本集中每一数据与所属分类的对应关系。输入没有标签的新数据后,将新数据的每个特征与样本集中数据对应的特征进行比较,然后算法提取样本集中最相似的数据(最邻近)的分类标签。一般来说,我们只选择样本数据集中前K个最相似的数据,这就是K-邻近算法中K的出处,通常K是不大于20的整数。最后,选择K个最相似的数据中出现次数最多的分类,作为新数据的分类。

    算法的类别

    该算法属于监督学习,用于分类,因而其目标变量是离散的

    优点

    对异常数据值不敏感,精度高,无数据输入假定

    缺点

    计算复杂度高,空间复杂度高

    算法的一般流程

    收集数据

    准备数据

    分析数据

    训练算法

    测试算法

    使用算法

    KNN算法实现代码

    from numpy import *
    import operator
    from os import listdir
    import matplotlib
    import matplotlib.pyplot as plt
    
    def classify0(inX, dataSet, labels, k):
        """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

        The distance between ``inX`` and every row of ``dataSet`` is the
        Euclidean distance.

        Args:
            inX: Feature vector to classify; it is tiled to one copy per row
                of ``dataSet`` before the subtraction.
            dataSet: 2-D numpy array holding one training sample per row.
            labels: Sequence where ``labels[i]`` is the class of ``dataSet[i]``.
            k: Number of nearest neighbours that take part in the vote.

        Returns:
            The label that occurs most often among the ``k`` nearest samples.

        Raises:
            IndexError: If ``k`` exceeds the number of samples, or if ``k`` is
                0 (no votes are collected).
        """
        dataSetSize = dataSet.shape[0]
        diffMat = tile(inX, (dataSetSize, 1)) - dataSet
        distances = (diffMat ** 2).sum(axis=1) ** 0.5
        # Indices of the training samples ordered from nearest to farthest.
        sortedDistIndicies = distances.argsort()
        classCount = {}
        for i in range(k):
            voteIlabel = labels[sortedDistIndicies[i]]
            # Tally one vote for this neighbour's class.
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        # items() exists on both Python 2 and Python 3, whereas iteritems()
        # is Python 2 only and raises AttributeError on Python 3.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        # The first entry is the class with the highest vote count.
        return sortedClassCount[0][0]
    
    
    def createDataSet():
        """Build the tiny four-point demo data set.

        Returns:
            A ``(group, labels)`` tuple: ``group`` is a 4x2 numpy array of
            sample points and ``labels`` is the list of class names, one per
            row of ``group``.
        """
        # Each entry pairs a sample point with its class name.
        samples = [
            ([1.0, 1.1], 'A'),
            ([1.0, 1.0], 'A'),
            ([0, 0], 'B'),
            ([0, 0.1], 'B'),
        ]
        points = [point for point, _ in samples]
        names = [name for _, name in samples]
        return array(points), names
    
    
    
    
    def file2matrix(filename):
        """Load a tab-separated data file into a feature matrix and label list.

        Each non-blank line supplies the first three tab-separated fields as
        numeric features and the last field as an integer class label.

        Args:
            filename: Path of the text file to read.

        Returns:
            A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
            ``(n, 3)`` numpy array of features and ``classLabelVector`` is a
            list of ``n`` integer labels, where ``n`` is the number of
            non-blank lines.

        Raises:
            ValueError: If a feature field is not numeric or the last field is
                not an integer.
        """
        # Read the file once, inside a context manager, so the handle is
        # always closed (the original opened it twice and never closed it).
        with open(filename) as fr:
            stripped = [line.strip() for line in fr]
        # Blank lines (e.g. a trailing empty line) carry no sample; skip them.
        records = [line for line in stripped if line]

        returnMat = zeros((len(records), 3))
        classLabelVector = []
        for index, line in enumerate(records):
            listFromLine = line.split('\t')
            returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
            classLabelVector.append(int(listFromLine[-1]))
        return returnMat, classLabelVector
    
       
    
    def autoNorm(dataSet):
        """Min-max normalise every column of ``dataSet``.

        Each value becomes ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal has a zero range; it is mapped to
        0 instead of producing NaN from a 0/0 division.

        Args:
            dataSet: 2-D numpy array with one sample per row.

        Returns:
            A ``(normDataSet, ranges, minVals)`` tuple: the normalised array,
            the per-column ``max - min`` (unmodified, so it may contain zeros)
            and the per-column minimum.
        """
        minVals = dataSet.min(0)
        maxVals = dataSet.max(0)
        ranges = maxVals - minVals
        # Divide constant columns by 1 rather than 0; their numerator is
        # already 0, so they normalise to 0.
        divisors = where(ranges == 0, 1, ranges)
        m = dataSet.shape[0]
        normDataSet = dataSet - tile(minVals, (m, 1))
        normDataSet = normDataSet / tile(divisors, (m, 1))   # element-wise divide
        return normDataSet, ranges, minVals
    
    def datingClassTest(hoRatio=0.50):
        """Measure the classifier's error rate on 'datingTestSet2.txt'.

        The file is loaded with ``file2matrix`` and normalised with
        ``autoNorm``. The first ``int(m * hoRatio)`` rows are used as test
        samples and the remaining rows as training data for ``classify0``
        with ``k = 3``. Each prediction, the overall error rate and the raw
        error count are printed.

        Args:
            hoRatio: Fraction of the rows held out for testing. Defaults to
                0.50 (the original comment said 10%, but the value was 0.50).

        Raises:
            ZeroDivisionError: If the hold-out set is empty.
        """
        datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')   # load data set from file
        normMat, ranges, minVals = autoNorm(datingDataMat)
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            # Rows [numTestVecs, m) are the training set; row i is the test sample.
            classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m], 3)
            # print(...) with one pre-formatted string behaves the same on
            # Python 2 and Python 3.
            print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
            if classifierResult != datingLabels[i]:
                errorCount += 1.0
        print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
        print(errorCount)
    
    
     
    
    def img2vector(filename):
        """Flatten a 32x32 text image of digits into a 1x1024 row vector.

        The first 32 characters of each of the first 32 lines are converted
        with ``int`` and stored row by row.

        Args:
            filename: Path of the text image file.

        Returns:
            A numpy array of shape ``(1, 1024)``.

        Raises:
            IndexError: If a line has fewer than 32 characters or the file
                has fewer than 32 lines.
            ValueError: If a character is not a digit.
        """
        returnVect = zeros((1, 1024))
        # The context manager closes the file even if parsing fails (the
        # original never closed the handle).
        with open(filename) as fr:
            for i in range(32):
                lineStr = fr.readline()
                for j in range(32):
                    returnVect[0, 32 * i + j] = int(lineStr[j])
        return returnVect
    
    
    def handwritingClassTest():
        """Evaluate the k-NN classifier on the handwritten-digit text images.

        Every file in the 'trainingDigits' directory is loaded with
        ``img2vector`` as a training sample. Every file in 'testDigits' is
        then classified with ``classify0`` (``k = 3``). The class of a file
        is the integer before the first underscore in its name, e.g.
        '0_1.txt' is class 0. Each prediction, the number of errors and the
        error rate are printed.

        Raises:
            ZeroDivisionError: If 'testDigits' contains no files.
        """
        hwLabels = []
        trainingFileList = listdir('trainingDigits')           # load the training set
        m = len(trainingFileList)
        trainingMat = zeros((m, 1024))
        for i in range(m):
            # File names look like '0_1.txt'.
            fileNameStr = trainingFileList[i]
            fileStr = fileNameStr.split('.')[0]     # take off .txt
            classNumStr = int(fileStr.split('_')[0])
            # Record the label for this training row.
            hwLabels.append(classNumStr)
            trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
        # Evaluate the model on the test set.
        testFileList = listdir('testDigits')        # iterate through the test set
        errorCount = 0.0
        mTest = len(testFileList)
        for i in range(mTest):
            fileNameStr = testFileList[i]
            fileStr = fileNameStr.split('.')[0]     # take off .txt
            classNumStr = int(fileStr.split('_')[0])
            vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
            classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
            # print(...) with one pre-formatted string behaves the same on
            # Python 2 and Python 3.
            print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
            if classifierResult != classNumStr:
                errorCount += 1.0
        # The leading "\n" escape had been turned into a literal line break,
        # splitting the string literal across two lines (a syntax error).
        print("\nthe total number of errors is: %d" % errorCount)
        print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
    
      
    if __name__ == '__main__':
        # Script entry point: run the handwritten-digit evaluation.
        #
        # The commented-out lines below are earlier experiments (scatter
        # plots of the dating data set) kept for reference; they are not
        # executed.
        #
        #    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
        #    ranges= autoNorm(datingDataMat)
        #    fig = plt.figure()
        #    ax = fig.add_subplot(111)
        #    ax.scatter(datingDataMat[:,1],datingDataMat[:,2],s = 15.0*array(datingLabels),c = array(datingLabels))
        #    ax.scatter(datingDataMat[:,1],datingDataMat[:,2],c = 'r')
        #    dataMatX = array([[1,2,3],[4,5,6],[7,8,9]])
        #    dataMatY = array([[2,4,6],[8,10,12],[14,16,18]])
        #    big = array([11,21,31])
        #    ax.scatter(dataMatX[:,1],dataMatY[:,1],s = 15.0*big,c = big)
        #    plt.show()
        #    print array(datingLabels)
        #    print datingDataMat
        handwritingClassTest()
  • 相关阅读:
    Hadoop eclipse插件使用过程中出现的问题
    hadoop eclipse插件下载 1.1.2版本
    Hbase 行键设计
    Js权限判断处理
    ASP.NET MVC中给所有的cshtml页面引用命名空间
    jquery实现的网页选项卡(拾忆)
    angularjs 请求后端接口请求了两次
    Entity Framework Code First关系映射约定
    Angularjs中link函数参数含义小节
    浅谈AngularJS中的$parse和$eval
  • 原文地址:https://www.cnblogs.com/xzm123/p/8980604.html
Copyright © 2020-2023  润新知