//数据挖掘&机器学习实践
1.简单的手写数字识别
原理:先用多张手写数字图片生成训练集,然后用 kNN(k 近邻)算法进行分类即可。代码如下(运行下面的代码需要先安装 PIL 和 numpy 库;PIL 的安装有点坑,官方版本貌似还有些问题)。
# knn.py
"""k-nearest-neighbour (kNN) classifier plus helpers for two demos.

The module contains:

* ``classify0`` - the kNN majority vote itself;
* ``file2matrix`` / ``autoNorm`` / ``datingClassTest`` - a hold-out test on a
  whitespace separated table of three features and a numeric label;
* ``img2vector`` / ``handwritingClassTest`` / ``judgeClass`` - digit
  recognition on 32x32 text bitmaps made of the characters 0 and 1.

The code is written so that it runs on both Python 2 and Python 3.
"""
# The star import is kept so that names this module has always re-exported
# stay available; the functions below use the explicit ``np`` alias instead.
from numpy import *  # noqa: F401,F403
import numpy as np
import operator
import os

# Default data locations.  They are raw strings on purpose: in a normal
# literal the "\t" inside "\traing", "\trainingDigits" and "\testDigits"
# is a TAB character, which silently produced a wrong path.
_DATING_FILE = r'C:\Python27\source.txt'
_DIGITS_ROOT = r'D:\PDF_BOOK\ML\machinelearninginaction\Ch02\traing'
_TRAINING_DIR = _DIGITS_ROOT + '\\trainingDigits'
_TEST_DIR = _DIGITS_ROOT + '\\testDigits'

# Side length of a digit bitmap; a bitmap is flattened to _IMG_SIDE ** 2 values.
_IMG_SIDE = 32


def createDataSet():
    """Return a tiny demo data set: a 4x2 sample array and its 4 labels."""
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: feature vector to classify (anything that broadcasts against one
            row of ``dataSet``).
        dataSet: 2-D array with one training sample per row.
        labels: sequence with the label of each row of ``dataSet``.
        k: number of neighbours that vote.  If ``k`` is larger than the
            number of samples, every sample votes.

    Returns:
        The label with the most votes.  On a tie the label met first while
        walking the neighbours from nearest to farthest wins on interpreters
        whose dicts keep insertion order.

    Raises:
        IndexError: if nobody votes (``k`` < 1 or an empty ``dataSet``).
    """
    # Broadcasting replaces the explicit tile() of the query vector.
    diffMat = np.asarray(inX) - np.asarray(dataSet)
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5  # Euclidean distance
    sortedDistIndicies = distances.argsort()
    classCount = {}
    # Slicing (instead of indexing 0..k-1) clamps k to the sample count.
    for sampleIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[sampleIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a text table of three features and one numeric label per line.

    Fields may be separated by any run of whitespace; blank lines are
    skipped.  The first three fields are the features, the last field is the
    label.

    Args:
        filename: path of the text file.

    Returns:
        ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array and a
        list of ``n`` integer labels (the label field rounded to an integer).
    """
    # Read once and close the handle (it used to be opened twice and leaked).
    with open(filename) as fr:
        rows = [line.split() for line in fr if line.strip()]
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        # np.rint is used because the star import above may shadow round().
        classLabelVector.append(int(np.rint(float(listFromLine[-1]))))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Args:
        dataSet: 2-D numeric array with one sample per row.

    Returns:
        ``(normDataSet, ranges, minVals)`` where ``ranges`` is the per-column
        ``max - min`` and ``minVals`` the per-column minimum.  A constant
        column (range 0) is mapped to 0 instead of producing NaN.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    # Divide by 1 where the range is 0 so constant columns do not yield 0/0.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


def datingClassTest(filename=_DATING_FILE, hoRatio=0.1):
    """Run a hold-out test of the classifier on a ``file2matrix`` table.

    The first ``hoRatio`` fraction of the rows is classified (k = 3) against
    the remaining rows; each prediction and the final error rate are printed.

    Args:
        filename: table to load; defaults to the original hard-coded path.
        hoRatio: fraction of rows held out for testing.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("result :%d, the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Avoid ZeroDivisionError when the table is too small to hold out a row.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is : %f" % errorRate)


def img2vector(filename):
    """Read a 32x32 text bitmap and flatten it into a ``(1, 1024)`` array.

    The file must contain at least 32 lines whose first 32 characters are
    digits; row ``i``, column ``j`` is stored at index ``32 * i + j``.

    Args:
        filename: path of the bitmap text file.

    Returns:
        A ``(1, 1024)`` float array.
    """
    returnVect = np.zeros((1, _IMG_SIDE * _IMG_SIDE))
    with open(filename, 'r') as fr:
        for i in range(_IMG_SIDE):
            lineStr = fr.readline()
            for j in range(_IMG_SIDE):
                returnVect[0, _IMG_SIDE * i + j] = int(lineStr[j])
    return returnVect


def _labelFromFileName(fileNameStr):
    """Return the digit encoded in a file name such as ``7_12.txt`` (-> 7)."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def _loadDigitDir(dirName):
    """Load every bitmap in ``dirName``; return ``(matrix, labels)``."""
    fileList = os.listdir(dirName)
    dataMat = np.zeros((len(fileList), _IMG_SIDE * _IMG_SIDE))
    labels = []
    for i, fileNameStr in enumerate(fileList):
        labels.append(_labelFromFileName(fileNameStr))
        dataMat[i, :] = img2vector(os.path.join(dirName, fileNameStr))
    return dataMat, labels


def handwritingClassTest(trainingDir=_TRAINING_DIR, testDir=_TEST_DIR):
    """Classify every bitmap in ``testDir`` against those in ``trainingDir``.

    File names must look like ``<digit>_<anything>.txt``.  Each prediction
    (k = 3), the number of errors and the error rate are printed.

    Args:
        trainingDir: directory with the training bitmaps.
        testDir: directory with the bitmaps to classify.
    """
    trainingMat, hwLabels = _loadDigitDir(trainingDir)
    testFileList = os.listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with:%d, the real answer is:%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print(" the total number of errors is %d" % errorCount)
    # Avoid ZeroDivisionError on an empty test directory.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print(" the total error rate is %f" % errorRate)


def judgeClass(filename, trainingDir=_TRAINING_DIR):
    """Return the digit recognised in the 32x32 text bitmap ``filename``.

    Args:
        filename: path of the bitmap to classify.
        trainingDir: directory with the training bitmaps, named
            ``<digit>_<anything>.txt``.

    Returns:
        The predicted digit as an int (k = 3 nearest neighbours).
    """
    trainingMat, hwLabels = _loadDigitDir(trainingDir)
    testVector = img2vector(filename)
    return classify0(testVector, trainingMat, hwLabels, 3)
# Convert a 32x32 picture of a handwritten digit into the 0/1 text bitmap
# that knn.img2vector reads, then print the digit knn.judgeClass recognises.
try:
    # Pillow exposes the imaging modules inside the PIL package.
    from PIL import Image, ImageDraw, ImageFont, ImageFilter
except ImportError:
    # The classic PIL distribution installs them as top-level modules.
    import Image, ImageDraw, ImageFont, ImageFilter
import random
import array
from numpy import *
import sys
import knn

SIDE = 32                 # the picture is read as SIDE x SIDE pixels
WHITE = (255, 255, 255)   # background colour; every other colour is ink

s = '2'

# Backslashes are doubled: the original literal ended in \' which escaped
# the closing quote and made the whole line a syntax error.
desktop = 'C:\\Users\\lg\\Desktop\\'
filename = desktop + s + '.txt'
# Convert to RGB so getpixel() always returns a 3-tuple; for RGBA or palette
# images the comparison with WHITE could otherwise never succeed.
im = Image.open(desktop + s + '.png').convert('RGB')

# data is row-major: pixel (column x, row y) is stored at SIDE * y + x.
data = zeros(SIDE * SIDE)
for x in range(SIDE):
    for y in range(SIDE):
        data[SIDE * y + x] = 0 if im.getpixel((x, y)) == WHITE else 1

with open(filename, 'w') as f:
    for row in range(SIDE):
        rowValues = data[SIDE * row:SIDE * (row + 1)]
        f.write(''.join(str(int(value)) for value in rowValues))
        # knn.img2vector reads one bitmap row per readline(), so the rows
        # have to be separated by newlines (no separator after the last row).
        if row != SIDE - 1:
            f.write('\n')

print(knn.judgeClass(filename))
后续功能和识别效果的改进以后再慢慢补充,先挖个坑,慢慢填。