• 【风马一族_Python】 实施kNN算法


    一、在PyCharm 5.0.4(编写python程序的IDE) 编写kNN.py文件的代码

    --------------------------

    1、 kNN.py  运算符模块

    --------------------------

    1 from numpy import *
    2 import operator
    3 
    4 #运算符模块   创建数据集和标签
    5 def createDataSet():
    6     group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    7     labels = ['A', 'A', 'B', 'B']
    8     return group, labels

    1)、打开命令行

      先进入kNN.py的所在文件夹,在对kNN.py进行程序处理,效果如下图所示

    --------------------------

    2、 kNN.py  k-近邻算法

    --------------------------

     1 from numpy import *
     2 import operator
     3 
     4 #运算符模块
     5 def createDataSet():
     6     group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
     7     labels = ['A', 'A', 'B', 'B']
     8     return group, labels
     9 
    10 #k-近邻算法  此模块需要使用运算符模块的group/labels
    11 def classify0(inX, dataSet, labels, k):
    12     dataSetSize = dataSet.shape[0]
    13     diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    14     sqDiffMat = diffMat ** 2
    15     sqDistances = sqDiffMat.sum(axis=1)
    16     distances = sqDistances ** 0.5
    17     sortedDistIndicies = distances.argsort()
    18     classCount = {}
    19     for i in range(k):
    20         voteIlabel = labels[sortedDistIndicies[i]]
    21         classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    22 
    23     sortedClassCount = sorted(classCount.iteritems(),
    24                               key=operator.itemgetter(1),
    25                               reverse=True)
    26     return sortedClassCount[0][0]

    2) 在1)的基础上,进行2)的内容,可以修改参数
      检测k-近邻算法的效果

      修改参数

    --------------------------

    3、 kNN.py  准备数据:从文本文件中解析数据

    --------------------------

     1 # 将文本记录转换为NumPy的解析程序
     2 def file2matrix(filename):
     3     fr = open(filename,'r')
     4     numberOfLines = len(fr.readlines())  # get the number of lines in the file
     5     returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
     6     classLabelVector = []  # prepare labels return
     7     fr = open(filename)
     8     index = 0
     9     for line in fr.readlines():
    10         line = line.strip()
    11         listFromLine = line.split('	')
    12         returnMat[index, :] = listFromLine[0:3]
    13         classLabelVector.append(int(listFromLine[-1]))
    14         index += 1
    15     return returnMat, classLabelVector

     datingTestSet2.txt文件可以从(博客园)文件下载

     

    --------------------------

    4、 kNN.py  使用Matplotlib创建散点图

    --------------------------

    datingTestSet2.txt 文件的数据通过matplotlib,图形化的表现出来

    --------------------------

    5、 kNN.py  使用Matplotlib创建散点图  表示不同属性的点,使用不同颜色进行表示

        警告:import os

           from numpy import *

           这两句必须加上,否则会报如下,错误提示信息:

            Traceback (most recent call last):
                File "<stdin>", line 1, in <module>
                NameError: name 'array' is not defined

    --------------------------

    让点出现颜色划分的关键代码是:

    ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))

    --------------------------

    6、 kNN.py  归一化特征值

    --------------------------

     1 # 归一化特征值
     2 def autoNorm(dataSet):
     3     minVals = dataSet.min(0)
     4     maxVals = dataSet.max(0)
     5     ranges = maxVals - minVals
     6     normDataSet = zeros(shape(dataSet))
     7     m = dataSet.shape[0]
     8     normDataSet = dataSet - tile(minVals, (m, 1))
     9     normDataSet = normDataSet / tile(ranges, (m, 1))
    10     return normDataSet, ranges, minVals

    --------------------------

    7、 kNN.py   分类器针对约会网站的测试代码

    --------------------------

     1 # 分类器针对约会网站的测试代码
     2 def datingClassTest():
     3     hoRatio = 0.10
     4     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
     5     normMat, ranges, minVals = autoNorm(datingDataMat)
     6     m = normMat.shape[0]
     7     numTestVecs = int(m * hoRatio)
     8     errorCount = 0.0
     9     for i in range(numTestVecs):
    10         classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
    11         print "the classifier came back with: %d, the real answer is : %d" % (classifierResult, datingLabels[i])
    12 
    13         if (classifierResult != datingLabels[i]):
    14             errorCount += 1.0
    15 
    16     print  "the total error rate is : %f " % (errorCount / float(numTestVecs))

    --------------------------

    8、 kNN.py   使用算法:构建完整可用系统

    --------------------------

     1 #! /usr/bin/env python
     2 # -*- coding: gbk -*-        用来解决中文乱码的注解
     3 
     4 from numpy import *
     5 import operator
     6 
     7 # 运算符模块
     8 def createDataSet():
     9     group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    10     labels = ['A', 'A', 'B', 'B']
    11     return group, labels
    12 
    13 
    14 # k-近邻算法
    15 def classify0(inX, dataSet, labels, k):
    16     dataSetSize = dataSet.shape[0]
    17     diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    18     sqDiffMat = diffMat ** 2
    19     sqDistances = sqDiffMat.sum(axis=1)
    20     distances = sqDistances ** 0.5
    21     sortedDistIndicies = distances.argsort()
    22     classCount = {}
    23     for i in range(k):
    24         voteIlabel = labels[sortedDistIndicies[i]]
    25         classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    26 
    27     sortedClassCount = sorted(classCount.iteritems(),
    28                               key=operator.itemgetter(1),
    29                               reverse=True)
    30     return sortedClassCount[0][0]
    31 
    32 
    33 # 将文本记录转换为NumPy的解析程序
    34 def file2matrix(filename):
    35     fr = open(filename, 'r')
    36     numberOfLines = len(fr.readlines())  # get the number of lines in the file
    37     returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    38     classLabelVector = []  # prepare labels return
    39     fr = open(filename)
    40     index = 0
    41     for line in fr.readlines():
    42         line = line.strip()
    43         listFromLine = line.split('	')
    44         returnMat[index, :] = listFromLine[0:3]
    45         classLabelVector.append(int(listFromLine[-1]))
    46         index += 1
    47     return returnMat, classLabelVector
    48 
    49 
    50 # 归一化特征值
    51 def autoNorm(dataSet):
    52     minVals = dataSet.min(0)
    53     maxVals = dataSet.max(0)
    54     ranges = maxVals - minVals
    55     normDataSet = zeros(shape(dataSet))
    56     m = dataSet.shape[0]
    57     normDataSet = dataSet - tile(minVals, (m, 1))
    58     normDataSet = normDataSet / tile(ranges, (m, 1))
    59     return normDataSet, ranges, minVals
    60 
    61 
    62 # 分类器针对约会网站的测试代码
    63 def datingClassTest():
    64     hoRatio = 0.10
    65     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    66     normMat, ranges, minVals = autoNorm(datingDataMat)
    67     m = normMat.shape[0]
    68     numTestVecs = int(m * hoRatio)
    69     errorCount = 0.0
    70     for i in range(numTestVecs):
    71         classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
    72         print "分类器的回复是:%d,真正的答案是:%d" % (classifierResult, datingLabels[i])
    73 
    74         if (classifierResult != datingLabels[i]):
    75             errorCount += 1.0
    76 
    77     print "总误差率  : %f " % (errorCount / float(numTestVecs))
    78 
    79 
    80 # 约会网预测函数
    81 def classifyPerson():
    82     resultList = ['完全没有兴趣', '有一点吧', '特别感兴趣']
    83     percentTats = float(raw_input("玩电子游戏的时间百分比?"))
    84     ffMiles = float(raw_input("每年的飞行里程数是多少?"))
    85     iceCream = float(raw_input("每年的冰淇淋消费量是多少?"))
    86     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    87     norMat, ranges, minVals = autoNorm(datingDataMat)
    88     inArr = array([ffMiles, percentTats, iceCream])
    89     classifierResult = classify0((inArr - minVals) / ranges, norMat, datingLabels, 3)
    90     print "你可能会喜欢这个人 :", resultList[classifierResult - 1]

    --------------------------

    9、 kNN.py   准备数据:将图像转换为测试向量

    --------------------------

    1 # 准备数据:将图像转换为测试向量
    2 def img2vector(filename):
    3     returnVect = zeros((1, 1024))
    4     fr = open(filename)
    5     for i in range(32):
    6         lineStr = fr.readline()
    7         for j in range(32):
    8             returnVect[0, 32 * i + j] = int(lineStr[j])
    9     return returnVect

    --------------------------

    10、 kNN.py   测试算法:使用k-近邻算法识别手写数字

          注意:本文需要使用  from os import listdir

          数据digits.zip 存放在博客园的文件夹中,或者下载《机器学习实战》的源代码,里面有

    --------------------------

     1 #! /usr/bin/env python
     2 # -*- coding: gbk -*-
     3 
     4 from numpy import *
     5 import operator
     6 from os import listdir
     7 
     8 # 测试算法:使用K-近邻算法识别手写数字
     9 def handwritingClassTest():
    10     hwLabels = []
    11     trainingFileList = listdir('trainingDigits')
    12     m = len(trainingFileList)
    13     trainingMat = zeros((m, 1024))
    14     for i in range(m):
    15         fileNameStr = trainingFileList[i]
    16         fileStr = fileNameStr.split('.')[0]
    17         classNumStr = int(fileStr.split('_')[0])
    18         hwLabels.append(classNumStr)
    19         trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    20 
    21     testFileList = listdir('testDigits')
    22     errorCount = 0.0
    23     mTest = len(testFileList)
    24     for i in range(mTest):
    25         fileNameStr = testFileList[i]
    26         fileStr = fileNameStr.split('.')[0]
    27         classNameStr = int(fileStr.split('_')[0])
    28         vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
    29         classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
    30         print "
    分类器的回复是:%d,真正的答案是:%d" % (classifierResult, classNumStr)
    31 
    32         if (classifierResult != classNumStr):
    33             errorCount += 1.0
    34 
    35         print "	 错误的总数是 : %d " % errorCount
    36         print "	 总误差率是 : %f" % (errorCount / float(mTest))

      |

      |

      图片太长,其中截断了,读者可以自行测试看看效果

      |

      |

    -------------------------------------------------------------------------------------------------

    总结:以上就是机器学习实战的第二章的代码内容,没想到,三月份开始学习的内容,等到六月份才开始能够成功实现,主要是Numpy的安装,太狗了!

      其间,学习安装Numpy与.whl类型的文件,会使用基本的matplotlib。k-近邻算法的模样还没有认清楚。接下来,进行决策树,过一段时间就可以

      认识k-近邻算法了吧

    每天完成一件事。 不管是为了什么。
  • 相关阅读:
    iOS之UITableView的上拉刷新
    iOS xml文件的解析方式 XMLDictionary,GDataXMLNode,NSXMLParser
    iOS学习基本常识
    iOS常用宏定义
    iOS查错机制
    轻量级sqlite是增删改查
    iOS开发UI篇—ios应用数据存储方式(归档) :转发
    iOS面向对象的建模:MVC(OC基础)
    iOS下bound,center和frame
    CSS----学习2
  • 原文地址:https://www.cnblogs.com/sows/p/5557009.html
Copyright © 2020-2023  润新知