一、在PyCharm 5.0.4(编写python程序的IDE) 编写kNN.py文件的代码
--------------------------
1、 kNN.py 运算符模块
--------------------------
1 from numpy import * 2 import operator 3 4 #运算符模块 创建数据集和标签 5 def createDataSet(): 6 group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]) 7 labels = ['A', 'A', 'B', 'B'] 8 return group, labels
1)、打开命令行
先进入kNN.py的所在文件夹,在对kNN.py进行程序处理,效果如下图所示
--------------------------
2、 kNN.py k-近邻算法
--------------------------
1 from numpy import * 2 import operator 3 4 #运算符模块 5 def createDataSet(): 6 group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]) 7 labels = ['A', 'A', 'B', 'B'] 8 return group, labels 9 10 #k-近邻算法 此模块需要使用运算符模块的group/labels 11 def classify0(inX, dataSet, labels, k): 12 dataSetSize = dataSet.shape[0] 13 diffMat = tile(inX, (dataSetSize, 1)) - dataSet 14 sqDiffMat = diffMat ** 2 15 sqDistances = sqDiffMat.sum(axis=1) 16 distances = sqDistances ** 0.5 17 sortedDistIndicies = distances.argsort() 18 classCount = {} 19 for i in range(k): 20 voteIlabel = labels[sortedDistIndicies[i]] 21 classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 22 23 sortedClassCount = sorted(classCount.iteritems(), 24 key=operator.itemgetter(1), 25 reverse=True) 26 return sortedClassCount[0][0]
2) 在1)的基础上,进行2)的内容,可以修改参数
--------------------------
3、 kNN.py 准备数据:从文本文件中解析数据
--------------------------
1 # 将文本记录转换为NumPy的解析程序 2 def file2matrix(filename): 3 fr = open(filename,'r') 4 numberOfLines = len(fr.readlines()) # get the number of lines in the file 5 returnMat = zeros((numberOfLines, 3)) # prepare matrix to return 6 classLabelVector = [] # prepare labels return 7 fr = open(filename) 8 index = 0 9 for line in fr.readlines(): 10 line = line.strip() 11 listFromLine = line.split(' ') 12 returnMat[index, :] = listFromLine[0:3] 13 classLabelVector.append(int(listFromLine[-1])) 14 index += 1 15 return returnMat, classLabelVector
datingTestSet2.txt文件可以从(博客园)文件下载
--------------------------
4、 kNN.py 使用Matplotlib创建散点图
--------------------------
datingTestSet2.txt 文件的数据通过matplotlib,图形化的表现出来
--------------------------
5、 kNN.py 使用Matplotlib创建散点图 表示不同属性的点,使用不同颜色进行表示
警告:import os
from numpy import *
这两句必须加上,否则会报如下,错误提示信息:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
NameError: name 'array' is not defined
--------------------------
让点出现颜色划分的关键代码是:
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))
--------------------------
6、 kNN.py 归一化特征值
--------------------------
1 # 归一化特征值
2 def autoNorm(dataSet):
3 minVals = dataSet.min(0)
4 maxVals = dataSet.max(0)
5 ranges = maxVals - minVals
6 normDataSet = zeros(shape(dataSet))
7 m = dataSet.shape[0]
8 normDataSet = dataSet - tile(minVals, (m, 1))
9 normDataSet = normDataSet / tile(ranges, (m, 1))
10 return normDataSet, ranges, minVals
--------------------------
7、 kNN.py 分类器针对约会网站的测试代码
--------------------------
1 # 分类器针对约会网站的测试代码 2 def datingClassTest(): 3 hoRatio = 0.10 4 datingDataMat, datingLabels = file2matrix('datingTestSet2.txt') 5 normMat, ranges, minVals = autoNorm(datingDataMat) 6 m = normMat.shape[0] 7 numTestVecs = int(m * hoRatio) 8 errorCount = 0.0 9 for i in range(numTestVecs): 10 classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3) 11 print "the classifier came back with: %d, the real answer is : %d" % (classifierResult, datingLabels[i]) 12 13 if (classifierResult != datingLabels[i]): 14 errorCount += 1.0 15 16 print "the total error rate is : %f " % (errorCount / float(numTestVecs))
--------------------------
8、 kNN.py 使用算法:构建完整可用系统
--------------------------
1 #! /usr/bin/env python 2 # -*- coding: gbk -*- 用来解决中文乱码的注解 3 4 from numpy import * 5 import operator 6 7 # 运算符模块 8 def createDataSet(): 9 group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]) 10 labels = ['A', 'A', 'B', 'B'] 11 return group, labels 12 13 14 # k-近邻算法 15 def classify0(inX, dataSet, labels, k): 16 dataSetSize = dataSet.shape[0] 17 diffMat = tile(inX, (dataSetSize, 1)) - dataSet 18 sqDiffMat = diffMat ** 2 19 sqDistances = sqDiffMat.sum(axis=1) 20 distances = sqDistances ** 0.5 21 sortedDistIndicies = distances.argsort() 22 classCount = {} 23 for i in range(k): 24 voteIlabel = labels[sortedDistIndicies[i]] 25 classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 26 27 sortedClassCount = sorted(classCount.iteritems(), 28 key=operator.itemgetter(1), 29 reverse=True) 30 return sortedClassCount[0][0] 31 32 33 # 将文本记录转换为NumPy的解析程序 34 def file2matrix(filename): 35 fr = open(filename, 'r') 36 numberOfLines = len(fr.readlines()) # get the number of lines in the file 37 returnMat = zeros((numberOfLines, 3)) # prepare matrix to return 38 classLabelVector = [] # prepare labels return 39 fr = open(filename) 40 index = 0 41 for line in fr.readlines(): 42 line = line.strip() 43 listFromLine = line.split(' ') 44 returnMat[index, :] = listFromLine[0:3] 45 classLabelVector.append(int(listFromLine[-1])) 46 index += 1 47 return returnMat, classLabelVector 48 49 50 # 归一化特征值 51 def autoNorm(dataSet): 52 minVals = dataSet.min(0) 53 maxVals = dataSet.max(0) 54 ranges = maxVals - minVals 55 normDataSet = zeros(shape(dataSet)) 56 m = dataSet.shape[0] 57 normDataSet = dataSet - tile(minVals, (m, 1)) 58 normDataSet = normDataSet / tile(ranges, (m, 1)) 59 return normDataSet, ranges, minVals 60 61 62 # 分类器针对约会网站的测试代码 63 def datingClassTest(): 64 hoRatio = 0.10 65 datingDataMat, datingLabels = file2matrix('datingTestSet2.txt') 66 normMat, ranges, minVals = autoNorm(datingDataMat) 67 m = normMat.shape[0] 68 numTestVecs = int(m * hoRatio) 69 errorCount = 0.0 70 for i in range(numTestVecs): 71 classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3) 72 print "分类器的回复是:%d,真正的答案是:%d" % (classifierResult, datingLabels[i]) 73 74 if (classifierResult != datingLabels[i]): 75 errorCount += 1.0 76 77 print "总误差率 : %f " % (errorCount / float(numTestVecs)) 78 79 80 # 约会网预测函数 81 def classifyPerson(): 82 resultList = ['完全没有兴趣', '有一点吧', '特别感兴趣'] 83 percentTats = float(raw_input("玩电子游戏的时间百分比?")) 84 ffMiles = float(raw_input("每年的飞行里程数是多少?")) 85 iceCream = float(raw_input("每年的冰淇淋消费量是多少?")) 86 datingDataMat, datingLabels = file2matrix('datingTestSet2.txt') 87 norMat, ranges, minVals = autoNorm(datingDataMat) 88 inArr = array([ffMiles, percentTats, iceCream]) 89 classifierResult = classify0((inArr - minVals) / ranges, norMat, datingLabels, 3) 90 print "你可能会喜欢这个人 :", resultList[classifierResult - 1]
--------------------------
9、 kNN.py 准备数据:将图像转换为测试向量
--------------------------
1 # 准备数据:将图像转换为测试向量 2 def img2vector(filename): 3 returnVect = zeros((1, 1024)) 4 fr = open(filename) 5 for i in range(32): 6 lineStr = fr.readline() 7 for j in range(32): 8 returnVect[0, 32 * i + j] = int(lineStr[j]) 9 return returnVect
--------------------------
10、 kNN.py 测试算法:使用k-近邻算法识别手写数字
注意:本文需要使用 from os import listdir
数据digits.zip 存放在博客园的文件夹中,或者下载《机器学习实战》的源代码,里面有
--------------------------
1 #! /usr/bin/env python 2 # -*- coding: gbk -*- 3 4 from numpy import * 5 import operator 6 from os import listdir 7 8 # 测试算法:使用K-近邻算法识别手写数字 9 def handwritingClassTest(): 10 hwLabels = [] 11 trainingFileList = listdir('trainingDigits') 12 m = len(trainingFileList) 13 trainingMat = zeros((m, 1024)) 14 for i in range(m): 15 fileNameStr = trainingFileList[i] 16 fileStr = fileNameStr.split('.')[0] 17 classNumStr = int(fileStr.split('_')[0]) 18 hwLabels.append(classNumStr) 19 trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr) 20 21 testFileList = listdir('testDigits') 22 errorCount = 0.0 23 mTest = len(testFileList) 24 for i in range(mTest): 25 fileNameStr = testFileList[i] 26 fileStr = fileNameStr.split('.')[0] 27 classNameStr = int(fileStr.split('_')[0]) 28 vectorUnderTest = img2vector('testDigits/%s' % fileNameStr) 29 classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 30 print " 分类器的回复是:%d,真正的答案是:%d" % (classifierResult, classNumStr) 31 32 if (classifierResult != classNumStr): 33 errorCount += 1.0 34 35 print " 错误的总数是 : %d " % errorCount 36 print " 总误差率是 : %f" % (errorCount / float(mTest))
|
|
图片太长,其中截断了,读者可以自行测试看看效果
|
|
-------------------------------------------------------------------------------------------------
总结:以上就是机器学习实战的第二章的代码内容,没想到,三月份开始学习的内容,等到六月份才开始能够成功实现,主要是Numpy的安装,太狗了!
其间,学习安装Numpy与.whl类型的文件,会使用基本的matplotlib。k-近邻算法的模样还没有认清楚。接下来,进行决策树,过一段时间就可以
认识k-近邻算法了吧