#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from numpy import *


# Build the vocabulary: the list of unique words across all documents
def createVocabList(dataSet):
    vocabSet = set([])
    for docment in dataSet:
        vocabSet = vocabSet | set(docment)  # set union
    return list(vocabSet)


# Convert an input document into a bag-of-words count vector over the vocabulary
def bagOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
        else:
            print("the word %s is not in my vocabulary" % word)
    return returnVec


# Training: estimate the naive Bayes parameters
# p1Num accumulates, per class, the count of each word
def train(trainMat, trainGategory):
    numTrain = len(trainMat)
    numWords = len(trainMat[0])  # vocabulary size
    pAbusive = sum(trainGategory) / float(numTrain)  # prior P(class=1)
    p0Num = ones(numWords); p1Num = ones(numWords)   # Laplace smoothing: start counts at 1
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrain):
        if trainGategory[i] == 1:
            p1Num += trainMat[i]
            p1Denom += sum(trainMat[i])
        else:
            p0Num += trainMat[i]
            p0Denom += sum(trainMat[i])
    p1Vec = log(p1Num / p1Denom)  # log-probabilities to avoid numerical underflow
    p0Vec = log(p0Num / p0Denom)
    return p0Vec, p1Vec, pAbusive


# Classification: compare the log-posterior scores of the two classes
def classfy(vec2classfy, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2classfy * p1Vec) + log(pClass1)
    p0 = sum(vec2classfy * p0Vec) + log(1 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


# Split a long string into lowercase tokens longer than two characters
def textParse(bigString):
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Spam email classification test
def spamTest():
    fullTest = []; docList = []; classList = []
    for i in range(1, 26):  # 25 spam and 25 ham emails
        wordList = textParse(open('email/spam/%d.txt' % i, encoding='gbk', errors='ignore').read())
        docList.append(wordList)
        fullTest.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='gbk', errors='ignore').read())
        docList.append(wordList)
        fullTest.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    trainSet = list(range(50))
    testSet = []
    # Hold out 10 random samples for testing; train on the rest
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainSet)))  # index into the remaining documents (0-49)
        testSet.append(trainSet[randIndex])
        del(trainSet[randIndex])
    trainMat = []; trainClass = []
    for docIndex in trainSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0, p1, pSpam = train(array(trainMat), array(trainClass))
    errCount = 0
    for docIndex in testSet:
        wordVec = bagOfWords2Vec(vocabList, docList[docIndex])
        if classfy(array(wordVec), p0, p1, pSpam) != classList[docIndex]:
            errCount += 1
            print("classification error", docList[docIndex])
    print("the error rate is ", float(errCount) / len(testSet))


if __name__ == '__main__':
    spamTest()