• 机器学习实战第二章----KNN


    1. tile的使用方法
      tile(A,n)的功能是把A数组重复n次(可以在列方向,也可以在行方向)
    2. argsort()函数
      argsort()函数返回的是数组中值从小到大排序后的索引值
    3. dict.get()函数
      语法:dict.get(key, default=None)
      key----字典中要查找的键
      default----如果指定的键不存在,则返回该默认值
    4. add_subplot()基础用法
    # Minimal add_subplot() example: draw one curve in a single cell of a grid of axes.
    import matplotlib.pyplot as plt
    from numpy import *
    fig = plt.figure()
    # 349 = a grid of 3 rows by 4 columns, drawing into cell number 9.
    ax = fig.add_subplot(349)
    # NOTE(review): x and y are not defined in this snippet; supply the data to plot before running it.
    ax.plot(x,y)
    

    将画布分成三行四列,在第九个分区画图

    KNN实例代码

    from numpy import *
    from os import listdir
    import operator
    
    
    def createDataSet():
        """Build the four-point toy data set used to try out the classifier.

        Returns:
            A tuple ``(group, labels)``: a 4x2 array of sample coordinates and
            the list of class labels ('A' or 'B') in matching row order.
        """
        samples = array([
            [1.0, 1.1],
            [1.0, 1.0],
            [0, 0],
            [0, 0.1],
        ])
        # One label per row of `samples`, in the same order.
        classes = list('AABB')
        return samples, classes
    
    
    # 对数据进行分类
    def classify0(inX, dataSet, labels, k):
        """Classify one sample by majority vote among its k nearest neighbours.

        Args:
            inX: The sample to classify; anything that broadcasts against a row
                of ``dataSet`` (a 1-D sequence or a 1xN array).
            dataSet: 2-D array of training samples, one sample per row.
            labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
                of ``dataSet``.
            k: Number of nearest neighbours that vote. Values larger than the
                number of training samples are capped at that number.

        Returns:
            The label with the most votes. When several labels tie, the one
            whose first vote came from the closest neighbour wins.
        """
        dataSetSize = dataSet.shape[0]  # number of training samples (rows)
        # Broadcasting subtracts every training row from inX in one step.
        diffMat = asarray(inX) - dataSet
        # Euclidean distance: square root of the per-row sum of squared differences.
        distances = (diffMat ** 2).sum(axis=1) ** 0.5
        # argsort() yields row indices ordered from the smallest distance upwards.
        sortedDistIndices = distances.argsort()
        # Cap k so a small training set cannot cause an out-of-range index.
        # A conditional is used instead of min(), which `from numpy import *`
        # can shadow.
        neighbours = k if k < dataSetSize else dataSetSize
        classCount = {}
        for i in range(neighbours):
            voteLabel = labels[sortedDistIndices[i]]
            # dict.get() supplies 0 the first time a label is seen.
            classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
        # Stable sort by vote count, highest first.
        sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]
    
    
    # 读取文本文件数据
    def file2matrix(filename):
        """Load a tab-separated data file into a feature matrix and a label list.

        Each non-blank line must hold at least three numeric feature columns;
        the last column is read as an integer class label. Blank lines are
        skipped.

        Args:
            filename: Path of the text file to read.

        Returns:
            A tuple ``(train_matrix, label_vector)``: an Nx3 float array holding
            the first three columns of every record, and a list of N integer
            labels in the same order.
        """
        # The context manager closes the file even if parsing fails.
        with open(filename) as fr:
            records = [line.strip().split('\t') for line in fr if line.strip()]
        train_matrix = zeros((len(records), 3))
        label_vector = []
        for index, fields in enumerate(records):
            # Columns 0, 1 and 2 are the features.
            train_matrix[index, :] = [float(value) for value in fields[0:3]]
            # The last column is the class label.
            label_vector.append(int(fields[-1]))
        return train_matrix, label_vector
    
    
    #归一化函数
    def autoNorm(dataSet):
        """Rescale every feature (column) of dataSet linearly into [0, 1].

        Each value becomes ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal has a range of 0; it is mapped to
        0 instead of producing NaN through a division by zero.

        Args:
            dataSet: 2-D array with one sample per row and one feature per column.

        Returns:
            A tuple ``(normDataSet, ranges, minVals)``: the normalised data, the
            per-column range (max - min, left at 0 for constant columns) and the
            per-column minimum.
        """
        minVals = dataSet.min(0)  # column-wise minimum
        maxVals = dataSet.max(0)  # column-wise maximum
        ranges = maxVals - minVals
        # Divide constant columns by 1 so they come out as 0 rather than NaN.
        safeRanges = where(ranges == 0, 1, ranges)
        # Broadcasting applies the per-column vectors to every row.
        normDataSet = (dataSet - minVals) / safeRanges
        return normDataSet, ranges, minVals
    
    
    # 分类器测试函数
    def datingClassTest():
        """Measure the classifier's error rate on the dating data set.

        Reads 'datingTestSet2.txt', normalises the features, holds out the
        first 10% of the rows as test samples and classifies each of them
        against the remaining rows. Prints every prediction next to the true
        label, then prints the overall error rate once at the end.
        """
        hoRatio = 0.10  # fraction of the rows held out for testing
        datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
        normMat = autoNorm(datingDataMat)[0]
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errcount = 0.0
        for i in range(numTestVecs):
            # Rows [0, numTestVecs) are test samples; the rest is training data.
            classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 2)
            print("the classifier came back with :%d ,the real answer is :%d" % (classifierResult, datingLabels[i]))
            if classifierResult != datingLabels[i]:
                errcount += 1.0
        # Report once, after all test samples; skip when there were none so the
        # division cannot fail.
        if numTestVecs:
            print("the total error rate is: %f" % (errcount / float(numTestVecs)))
    
    
    # 手写字符文件转换成向量
    def img2vector(filename):
        """Read a 32x32 text image of digit characters into a 1x1024 row vector.

        The first 32 characters of each of the first 32 lines are converted to
        integers and stored row after row.

        Args:
            filename: Path of the text image file.

        Returns:
            A 1x1024 float array; element ``32 * row + col`` holds the value of
            the character at (row, col).
        """
        returnVect = zeros((1, 1024))
        # The context manager closes the file once the 32 lines are read.
        with open(filename) as fr:
            for row in range(32):
                lineStr = fr.readline()
                for col in range(32):
                    returnVect[0, 32 * row + col] = int(lineStr[col])
        return returnVect
    
    
    # 手写字符识别测试
    def handwritingClassTest():
        """Evaluate the k-NN classifier on the handwritten digit text images.

        Loads every file in 'digits/trainingDigits' as a training vector, then
        classifies every file in 'digits/testDigits' with k = 3. The class of a
        file is the part of its name before the first '_'. Prints each
        prediction next to the true digit, followed by the number of errors and
        the error rate.
        """
        hwlabels = []  # class label of every training file, in row order
        trainingFileList = listdir('digits/trainingDigits')
        m = len(trainingFileList)
        trainingMat = zeros((m, 1024))
        for i, fileNameStr in enumerate(trainingFileList):
            fileStr = fileNameStr.split('.')[0]   # drop the extension
            classNumStr = fileStr.split('_')[0]   # text before the first '_'
            hwlabels.append(classNumStr)
            # Store the image as row i of the training matrix.
            trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)
        testFileList = listdir('digits/testDigits')
        errcount = 0.0
        mTest = len(testFileList)
        for fileNameStr in testFileList:
            fileStr = fileNameStr.split('.')[0]
            classNumStr = int(fileStr.split('_')[0])
            vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
            classifierResult = classify0(vectorUnderTest, trainingMat, hwlabels, 3)
            print('the classifier came back with : %d, the real answer is %d' % (int(classifierResult), classNumStr))
            # Training labels are strings, so compare as integers.
            if int(classifierResult) != classNumStr:
                errcount += 1
        print('\nthe total number of errors is %d' % errcount)
        # Report 0 when the test directory is empty, to avoid dividing by zero.
        errorRate = errcount / mTest if mTest else 0.0
        print('\nthe total error rate is: %f' % errorRate)
    
  • 相关阅读:
    火爆全网的合成大西瓜小游戏魔改版大全
    [Qt]cmake下Qt隐藏console的窗口
    c# WebBrowser控制台输出执行js后的网页内容
    好的编程习惯是减少bug最有效的方法
    创建线程 出现SIGSEGV crash
    linux下进程创建/僵尸进程/孤儿进程
    C++实现不可被继承的类
    程序并发概述
    C++ vector实现原理
    C++深拷贝和浅拷贝
  • 原文地址:https://www.cnblogs.com/myblog1993/p/8886459.html
Copyright © 2020-2023  润新知