• 垃圾邮件分类-贝叶斯


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from numpy import *
    
    # Build the vocabulary: a list of the unique words found across all documents
    def createVocabList(dataSet):
        """Collect every distinct token that appears in a collection of documents.

        dataSet is an iterable of token sequences. The result is a list holding
        each distinct token exactly once; its order is whatever order the
        underlying set yields, so callers must not rely on it.
        """
        unique_tokens = set()
        for tokens in dataSet:
            # Fold this document's tokens into the running union.
            unique_tokens.update(tokens)
        return list(unique_tokens)
    
    
    def bagOfWords2Vec(vocabList,inputSet):
        """Convert a tokenized document into a bag-of-words count vector.

        vocabList is the vocabulary (a list of words) and inputSet is the
        document's tokens. The returned list has one entry per vocabulary
        position, holding how many times that word occurs in inputSet. Tokens
        missing from the vocabulary are reported on stdout and otherwise ignored.
        """
        # Index the vocabulary once so each token costs an O(1) dict lookup
        # instead of two linear scans (`in` followed by list.index()).
        # setdefault keeps the FIRST position of a repeated word, which is what
        # list.index() would have returned.
        first_position = {}
        for position, term in enumerate(vocabList):
            first_position.setdefault(term, position)

        returnVec = [0] * len(vocabList)
        for word in inputSet:
            slot = first_position.get(word)
            if slot is None:
                print ("the word is not in my vocabulry")
            else:
                returnVec[slot] += 1
        return returnVec
    
    # 训练算法
    # p1Num 是计算每一类每个词包含的权重
    def train(trainMat,trainGategory):
        """Estimate naive Bayes parameters from a labelled bag-of-words matrix.

        trainMat holds one count vector per training document and
        trainGategory the matching labels: a label equal to 1 counts toward
        class 1, anything else toward class 0.

        Returns (p0Vec, p1Vec, pAbusive): the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of documents
        labelled 1. Word counts start at one and the totals at 2.0, so a word
        unseen in a class never produces log(0).
        """
        doc_total = len(trainMat)
        vocab_size = len(trainMat[0])
        prior_class1 = sum(trainGategory) / float(doc_total)

        # Slot 0 accumulates class 0, slot 1 accumulates class 1.
        word_counts = [ones(vocab_size), ones(vocab_size)]
        token_totals = [2.0, 2.0]
        for row_idx in range(doc_total):
            slot = 1 if trainGategory[row_idx] == 1 else 0
            word_counts[slot] += trainMat[row_idx]
            token_totals[slot] += sum(trainMat[row_idx])

        # Work in log space so the classifier can add scores rather than
        # multiply many small probabilities.
        log_prob_class1 = log(word_counts[1] / token_totals[1])
        log_prob_class0 = log(word_counts[0] / token_totals[0])
        return log_prob_class0, log_prob_class1, prior_class1
    # 分类函数
    def classfy(vec2classfy,p0Vec,p1Vec,pClass1):
        """Pick the more likely class for one bag-of-words vector.

        Each class score is the sum of the element-wise product of
        vec2classfy with that class's log-probability vector, plus the log of
        the class prior (pClass1 for class 1, 1 - pClass1 for class 0).
        Returns 1 only when the class-1 score is strictly greater; otherwise 0.
        """
        score_class1 = sum(vec2classfy * p1Vec) + log(pClass1)
        score_class0 = sum(vec2classfy * p0Vec) + log(1 - pClass1)
        return 1 if score_class1 > score_class0 else 0
    
    # 分开较大的字符串
    def textParse(bigString):
        """Split raw text into lower-cased word tokens longer than two characters.

        bigString is the full text of a document. The text is split on runs of
        non-word characters (anything other than letters, digits and
        underscore); tokens of length 0-2 are dropped and the rest are
        lower-cased. Returns a list of strings, empty when nothing qualifies.
        """
        import re
        # The pattern must be \W+ (non-word runs). The previous r'W*' had lost
        # its backslash: it matched the literal letter "W" zero or more times,
        # and on Python 3.7+ such empty matches split the text into single
        # characters, so the length filter below discarded every token.
        listOfTokens = re.split(r'\W+', bigString)
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    
    #spam email classfy
    def spamTest():
        """Train and evaluate the naive Bayes spam filter on a random hold-out split.

        Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
        (label 0), sets aside 10 randomly chosen documents for testing, trains
        on the remaining ones, then prints each misclassified document followed
        by the error rate on the hold-out set. Returns nothing.
        """
        docList = []
        classList = []
        # (path template, class label) pairs. Spam is read before ham for each
        # index, so documents alternate spam, ham, spam, ham, ...
        sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
        for i in range(1, 26):  # 25 spam and 25 ham messages
            for template, label in sources:
                # The context manager closes each file as soon as it is read
                # instead of leaving the handle open until garbage collection.
                with open(template % i, encoding='gbk', errors='ignore') as handle:
                    docList.append(textParse(handle.read()))
                classList.append(label)

        vocabList = createVocabList(docList)  # create vocabulary

        # Randomly move 10 document indices out of the training pool into the
        # test set; deleting each pick guarantees no document is chosen twice.
        trainSet = list(range(len(docList)))
        testSet = []
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainSet)))
            testSet.append(trainSet[randIndex])
            del(trainSet[randIndex])

        trainMat = []
        trainClass = []
        for docIndex in trainSet:
            trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
            trainClass.append(classList[docIndex])
        p0, p1, pSpam = train(array(trainMat), array(trainClass))

        errCount = 0
        for docIndex in testSet:
            wordVec = bagOfWords2Vec(vocabList, docList[docIndex])
            if classfy(array(wordVec), p0, p1, pSpam) != classList[docIndex]:
                errCount += 1
                print ("classfication error", docList[docIndex])

        print ("the error rate is ", float(errCount)/len(testSet))
    
    # Run the spam-filter experiment only when this file is executed as a
    # script, not when it is imported as a module.
    if __name__ == '__main__':
        spamTest()
             
  • 相关阅读:
    域控软件分发
    win2008 ad域控搭建
    tomcat部署web项目的三种方式
    sql server2008数据库迁移的两种方案
    WinServer2008R2远程桌面长时间保持连接
    Windows2012R2备用域控搭建
    主备 主从 主主模式
    excel中汉字转拼音
    正向代理与反向代理
    18-09-11 软件rpm yum rm卸载 和批量删除
  • 原文地址:https://www.cnblogs.com/lifengwu/p/10031012.html
Copyright © 2020-2023  润新知