• DM/ML学习实践(一)


    //数据挖掘&机器学习实践

    1.简单的手写数字识别

    原理:先用多张手写数字图片生成训练集,然后用 kNN(k 近邻)算法分类即可。代码如下(运行下面的代码需要安装 PIL 和 numpy 库;PIL 的安装有点坑,官方的安装包貌似还有点问题)。

      1 # knn.py
      2 from numpy import *
      3 import operator
      4 import os
      5 
      def createDataSet():
          """Build the four-point, two-class toy data set used to try out kNN.

          Returns:
              A ``(group, labels)`` pair. ``group`` is a 4x2 numpy array with
              one point per row; ``labels`` is a list holding the class name
              of each row, in the same order.
          """
          labelled_points = (
              ([1.0, 1.1], 'A'),
              ([1.0, 1.0], 'A'),
              ([0, 0], 'B'),
              ([0, 0.1], 'B'),
          )
          group = array([point for point, _ in labelled_points])
          labels = [name for _, name in labelled_points]
          return group, labels
     10 
     def classify0(inX, dataSet, labels, k):
         """Classify ``inX`` by majority vote among its k nearest neighbours.

         Distance is the Euclidean distance between ``inX`` and each row of
         ``dataSet``.

         Args:
             inX: The sample to classify; it must broadcast against the rows
                 of ``dataSet``.
             dataSet: 2-D numpy array with one training sample per row.
             labels: Sequence of class labels, one per row of ``dataSet``.
             k: Number of nearest neighbours that take part in the vote.

         Returns:
             The label that occurs most often among the k nearest rows.

         Raises:
             IndexError: If ``k`` is smaller than 1 or larger than the number
                 of rows in ``dataSet``.
         """
         # Broadcasting subtracts inX from every row, so no tile() copy is needed.
         diffMat = dataSet - inX
         distances = (diffMat ** 2).sum(axis=1) ** 0.5
         sortedDistIndicies = distances.argsort()
         classCount = {}
         for i in range(k):
             voteIlabel = labels[sortedDistIndicies[i]]
             classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
         # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
         sortedClassCount = sorted(classCount.items(),
                                   key=operator.itemgetter(1), reverse=True)
         return sortedClassCount[0][0]
     24 
     def file2matrix(filename):
         """Load a tab-separated data file into a feature matrix and a label list.

         Each line supplies one sample: its first three tab-separated fields
         are the features and its last field is the class label.

         Args:
             filename: Path of the text file to read.

         Returns:
             A ``(returnMat, classLabelVector)`` pair. ``returnMat`` is an
             N x 3 numpy array of floats (N = number of lines) and
             ``classLabelVector`` holds each line's last field, parsed as a
             float and rounded.
         """
         # Read the file once and close it deterministically; the line count
         # needed to size the matrix comes from the same list.
         with open(filename) as fr:
             lines = fr.readlines()
         returnMat = zeros((len(lines), 3))
         classLabelVector = []
         for index, line in enumerate(lines):
             listFromLine = line.strip().split('\t')
             # Convert explicitly instead of relying on numpy's string casting.
             returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
             classLabelVector.append(round(float(listFromLine[-1])))
         return returnMat, classLabelVector
     39 
     def autoNorm(dataSet):
         """Rescale every column of ``dataSet`` with min-max normalisation.

         Each value becomes ``(value - column_min) / (column_max - column_min)``.

         Args:
             dataSet: 2-D numpy array with one sample per row.

         Returns:
             A ``(normDataSet, ranges, minVals)`` triple: the rescaled array,
             the per-column ``max - min`` and the per-column minimum.
         """
         minVals = dataSet.min(0)
         maxVals = dataSet.max(0)
         ranges = maxVals - minVals
         # A constant column has range 0; divide it by 1 instead so it maps to
         # 0 rather than NaN. The returned ``ranges`` keeps the true value.
         safeRanges = where(ranges == 0, 1, ranges)
         # Broadcasting applies the per-column vectors to every row.
         normDataSet = (dataSet - minVals) / safeRanges
         return normDataSet, ranges, minVals
     49 
     def datingClassTest(filename=r'C:\Python27\source.txt', hoRatio=0.1):
         """Measure the kNN error rate on a hold-out slice of a data file.

         The file is loaded with ``file2matrix`` and normalised with
         ``autoNorm``. The first ``hoRatio`` fraction of the rows is used as
         test samples and each one is classified with ``classify0`` (k=3)
         against the remaining rows. Every prediction and the final error
         rate are printed.

         Args:
             filename: Path of the tab-separated data file. Defaults to the
                 path that was previously hard-coded.
             hoRatio: Fraction of the rows held out for testing.

         Raises:
             ZeroDivisionError: If the hold-out slice is empty, i.e.
                 ``int(rows * hoRatio)`` is 0.
         """
         datingDataMat, datingLabels = file2matrix(filename)
         normMat, _, _ = autoNorm(datingDataMat)
         m = normMat.shape[0]
         numTestVecs = int(m * hoRatio)
         errorCount = 0.0
         for i in range(numTestVecs):
             classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                          datingLabels[numTestVecs:m], 3)
             # Parenthesised single-argument print works on Python 2 and 3.
             print("result :%d, the real answer is:%d" % (classifierResult, datingLabels[i]))
             if classifierResult != datingLabels[i]:
                 errorCount += 1.0
         print("the total error rate is : %f" % (errorCount / float(numTestVecs)))
     63 
     def img2vector(filename):
         """Flatten a 32x32 text image of digits into a 1x1024 row vector.

         The first 32 characters of each of the first 32 lines are parsed as
         integers and stored row by row.

         Args:
             filename: Path of the text file holding the image.

         Returns:
             A numpy array of shape ``(1, 1024)``; element ``32*i + j`` is the
             digit at line ``i``, column ``j``.

         Raises:
             IndexError: If one of the first 32 lines has fewer than 32
                 characters.
             ValueError: If a character is not a decimal digit.
         """
         returnVect = zeros((1, 1024))
         # The with-block closes the file even if parsing fails part-way.
         with open(filename, 'r') as fr:
             for i in range(32):
                 lineStr = fr.readline()
                 for j in range(32):
                     returnVect[0, 32 * i + j] = int(lineStr[j])
         return returnVect
     72 
     def handwritingClassTest(
             trainingDir=r'D:\PDF_BOOK\ML\machinelearninginaction\Ch02\traing\trainingDigits',
             testDir=r'D:\PDF_BOOK\ML\machinelearninginaction\Ch02\traing\testDigits'):
         """Run the kNN digit classifier over a test directory and print the errors.

         Every file in ``trainingDir`` becomes one training row (via
         ``img2vector``); its label is the integer before the first ``_`` in
         the file name. Each file in ``testDir`` is then classified with
         ``classify0`` (k=3) and compared with the label taken from its own
         file name. Each prediction, the error count and the error rate are
         printed.

         Args:
             trainingDir: Directory holding the training images. Defaults to
                 the directory that was previously hard-coded.
             testDir: Directory holding the test images. Defaults to the
                 directory that was previously hard-coded.

         Raises:
             ZeroDivisionError: If ``testDir`` contains no files.
         """
         # The defaults are raw strings: in a normal literal the "\t" in
         # "\traing", "\trainingDigits" and "\testDigits" is a TAB character.
         hwLabels = []
         trainingFileList = os.listdir(trainingDir)
         m = len(trainingFileList)
         trainingMat = zeros((m, 1024))
         for i, fileNameStr in enumerate(trainingFileList):
             fileStr = fileNameStr.split('.')[0]
             hwLabels.append(int(fileStr.split('_')[0]))
             trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))
         testFileList = os.listdir(testDir)
         errorCount = 0.0
         mTest = len(testFileList)
         for fileNameStr in testFileList:
             fileStr = fileNameStr.split('.')[0]
             classNumStr = int(fileStr.split('_')[0])
             vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
             classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
             # Parenthesised single-argument print works on Python 2 and 3.
             print("the classifier came back with:%d, the real answer is:%d"
                   % (classifierResult, classNumStr))
             if classifierResult != classNumStr:
                 errorCount += 1.0
         print("\nthe total number of errors is %d" % errorCount)
         print("\nthe total error rate is %f" % (errorCount / float(mTest)))
     97 
     def judgeClass(filename,
                    trainingDir=r'D:\PDF_BOOK\ML\machinelearninginaction\Ch02\traing\trainingDigits'):
         """Classify one 32x32 text image against the training directory.

         Every file in ``trainingDir`` becomes one training row (via
         ``img2vector``); its label is the integer before the first ``_`` in
         the file name. The image in ``filename`` is then classified with
         ``classify0`` using k=3.

         Args:
             filename: Path of the text image to classify.
             trainingDir: Directory holding the training images. Defaults to
                 the directory that was previously hard-coded.

         Returns:
             The predicted label, an int taken from the training file names.
         """
         # The default is a raw string: in a normal literal the "\t" in
         # "\traing" and "\trainingDigits" is a TAB character.
         hwLabels = []
         trainingFileList = os.listdir(trainingDir)
         trainingMat = zeros((len(trainingFileList), 1024))
         for i, fileNameStr in enumerate(trainingFileList):
             fileStr = fileNameStr.split('.')[0]
             hwLabels.append(int(fileStr.split('_')[0]))
             trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))
         testVector = img2vector(filename)
         return classify0(testVector, trainingMat, hwLabels, 3)
    knn.py
     1 import Image, ImageDraw, ImageFont, ImageFilter
     2 import random
     3 import array
     4 from numpy import *
     5 import sys
     6 import knn
     7 
     s = '2'

     # Backslashes are doubled: a single trailing "\" would escape the closing
     # quote, and "\U" is an escape sequence on Python 3.
     desktop = 'C:\\Users\\lg\\Desktop\\'
     filename = desktop + s + '.txt'
     # Convert to RGB so getpixel() yields a 3-tuple that can be compared with
     # pure white whatever mode the PNG was saved in.
     im = Image.open(desktop + s + '.png').convert('RGB')

     # Binarise the 32x32 image: pure white pixels become 0, everything else 1.
     # data[32*y + x] holds the value of pixel (x, y).
     data = zeros(32*32)
     for x in range(32):
         for y in range(32):
             val = 1
             if im.getpixel((x, y)) == (255, 255, 255):
                 val = 0
             data[32*y+x] = val

     # Write 32 lines of 32 digits (no trailing newline), the layout that
     # knn.img2vector reads; the with-block closes the file before it is read.
     with open(filename, 'w+') as f:
         for row in range(32):
             f.write(''.join(str(int(v)) for v in data[32*row:32*row+32]))
             if row != 31:
                 f.write('\n')
     print(knn.judgeClass(filename))
    test.py

    后续的功能或者识别加强慢慢再补==,先挖坑,慢慢填。。。

  • 相关阅读:
    利用runtime特性来动态调用方法
    点击屏幕获取对应tableviewcell
    IOS7导航栏与状态栏融合适配方法之一
    推送证书生成.p12
    OpenGL基础学习杂文
    android入门1.1
    java基础
    “Oracle.DataAccess.Client.OracleConnection”的类型初始值设定项引发异常。
    ArcEngine栅格和矢量渲染(含可视化颜色带)
    【转载】C#如何操控FTP,获取FTP文件或文件夹列表,获取FTP文件大小,FTP上传,FTP删除文件,FTP新建文件夹、删除文件夹
  • 原文地址:https://www.cnblogs.com/JustForCS/p/4900593.html
Copyright © 2020-2023  润新知