• k-Nearest Neighbors 实战2 使用kNN算法改进约会网站的配对结果


    本文《machine learning in action》学习笔记
    数据源码可以在这里获取 :https://www.manning.com/books/machine-learning-in-action


    这里Python 3+的code

    from numpy import *
    import matplotlib.pyplot as plt
    import operator
    
    def kNNClassify(inX, dataSet, labels, k):
        '''put the kNN classification algorithm into action'''
    
        dataSetSize = dataSet.shape[0]
        diffMax = tile(inX,(dataSetSize,1)) - dataSet
        sqDiffMax = diffMax ** 2
        sqDistances = sqDiffMax.sum(axis=1)
        distances = sqDistances**0.5
        # argsort 返回由大到小的索引值
        sortedDistIndicies = distances.argsort()
        classCount= {}
    
        for i in range(k):
            # 找到最大索引值对应数据的label
            voteIlabel = labels[sortedDistIndicies[i]]
             # returns a value for the given key
            classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
        # 按照键值的大小排列
        sortedClassCount = sorted(classCount.items(), key = operator.itemgetter(1), reverse = True)
        return sortedClassCount[0][0]
    
    
    
    def file2matrix(filename):
        """process the text information"""
        fr = open(filename)
        arrayofLines = fr.readlines()
        fr.close()
        numberofLines = len(arrayofLines)
        returnMat = zeros((numberofLines, 3))
        classLabelVector = []
    
        index = 0
        for line in arrayofLines:
            # Python strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符)
            line = line.strip()
            # split()通过指定分隔符对字符串进行切片,这里使用tab
            listFromLine = line.split('	')
            returnMat[index,:] = listFromLine[0:3]
            classLabelVector.append(int(listFromLine[-1]))
            index += 1
        return returnMat, classLabelVector
    
    
    def autonorm(dataset):
        """归一化"""
        # 求每一列最小值和最大值
        minvalue = dataset.min(0)
        maxvalue = dataset.max(0)
        ranges = maxvalue - minvalue
        # 使用shape获取dataset的shape
        normdataset = zeros(shape(dataset))
        # shape[0] 获取第一行元素个数
        m = dataset.shape[0]
        # 使用tile函数将变量内容复制成输入矩阵同样大小的矩阵做矩阵的减法
        normdataset = dataset - tile(minvalue, (m,1))
        # 做矩阵的除法
        normdataset = normdataset / tile(ranges, (m,1))
        return normdataset, ranges, minvalue
    
    def datingclasstest():
        horatio = 0.1
        datingdatamat, datinglabel = file2matrix("datingTestset2.txt")
        normat, ranges, minvalues = autonorm(datingdatamat)
        m = normat.shape[0]
        numtestvec = int(m*horatio)
        errorcount = 0.0
        for i in range(numtestvec):
            #classifierresult = classify0(normat[i,:], normat[numtestvec:m,:], datinglabel[numtestvec:m,:], 3)
            classifierresult = kNNClassify(normat[i, :], normat[numtestvec:m], datinglabel[numtestvec:m], 3)
            print("No.%d test data, the classifier came back with : %d, the real answeris: %d" %(i, classifierresult, datinglabel[i]))
            if (classifierresult != datinglabel[i]):
                errorcount += 1.0
        print ("the total error rate is: %f" % (errorcount/float(numtestvec)))
    
    
    
    if __name__ == '__main__':
        datingdatamat, datinglabel = file2matrix('datingTestSet2.txt')
        normdataset, ranges, minvalue = autonorm(datingdatamat)
        print(normdataset)
        print("ranges = ", ranges)
        print("minvalue = ", minvalue)
    
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # 画散点图 scatter
        ax.scatter(datingdatamat[:,1], datingdatamat[:,2], 15.0*array(datinglabel), 15.0*array(datinglabel))
        plt.show()
    
        datingclasstest()
    
    
    
    
    
    
    
    
    
    

    读取TXT数据

    txt

    def file2matrix(filename):
        """process the text information"""
        fr = open(filename)
        arrayofLines = fr.readlines()
        numberofLines = len(arrayofLines)
        returnMat = zeros((numberofLines, 3))
        classLabelVector = []
    
        index = 0
        for line in arrayofLines:
            # Python strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符)
            line = line.strip()
            # split()通过指定分隔符对字符串进行切片,这里使用tab
            listFromLine = line.split('	')
            returnMat[index:1] = listFromLine[0:3]
            classLabelVector.append(int(listFromLine[-1]))
            index += 1
        return returnMat, classLabelVector

    txt
    normalization result:
    norm
    plot:
    plot

    test result:
    result
    结果与书中的结果并不一致,kNN这么简单的算法,其结果应该一致才对。为什么?

  • 相关阅读:
    jQuery 复选框全选反选
    JeeSite是基于多个优秀的开源项目,高度整合封装而成的高效,高性能,强安全性的 开源 Java EE快速开发平台
    SpringMVC+MyBatis(最新)
    基于Maven构建整合SpringMVC+Mybtis+Druid
    alibaba的FastJson(高性能JSON开发包)
    JAVA中使用JSON进行数据传递
    java 发送http json请求
    JDK中的URLConnection参数详解
    java调用Http请求 -HttpURLConnection学习
    Jquery调用webService的四种方法
  • 原文地址:https://www.cnblogs.com/siucaan/p/9623177.html
Copyright © 2020-2023  润新知