# coding: utf-8
"""Naive Bayes spam classifier for a small corpus of e-mail text files.

Pipeline: tokenize each message (``testparse``), build a vocabulary
(``createlist``), turn messages into word-count vectors (``word2vec``),
estimate smoothed per-class log-likelihoods (``singprob``) and classify
held-out messages (``classifier``).  ``Test`` runs a random hold-out
evaluation end to end.
"""
import io
import os
import re

import numpy as np

# Tokens are separated by runs of non-word characters.  ``+`` (not ``*``) is
# used so the pattern can never match the empty string, which would make
# ``re.split`` cut between every character on Python 3.7+.
_TOKEN_SPLIT = re.compile(r"\W+")


def createlist(lst):
    """Return the vocabulary: every distinct word found in any document.

    Args:
        lst: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token exactly once.  The order is
        arbitrary because it comes from a set.
    """
    vocabulary = set()
    for document in lst:
        vocabulary.update(document)
    return list(vocabulary)


def word2vec(List, inputset):
    """Convert one document into a word-count vector over a vocabulary.

    Args:
        List: Vocabulary, a sequence of hashable tokens.
        inputset: Iterable of tokens making up the document.

    Returns:
        A list of ints with ``len(List)`` entries; entry ``i`` is the number
        of times ``List[i]`` occurs in ``inputset``.  Tokens that are not in
        the vocabulary are ignored.
    """
    # One pass to build a lookup table instead of a linear ``List.index``
    # search per token.  ``setdefault`` keeps the first position of a
    # duplicated vocabulary entry, which is what ``list.index`` returns.
    position_of = {}
    for position, word in enumerate(List):
        position_of.setdefault(word, position)

    vec = [0] * len(List)
    for word in inputset:
        position = position_of.get(word)
        if position is not None:
            vec[position] += 1
    return vec


def singprob(trainmatrix, label):
    """Estimate naive Bayes parameters from labelled word-count vectors.

    Computes log p(w_i | c) for both classes, plus the prior p(c = 1).
    Word counts start at 1 and class totals at 2 (additive smoothing) so
    that a word unseen in one class never yields log(0).

    Args:
        trainmatrix: Non-empty sequence of equal-length count vectors, one
            per training document.
        label: Sequence with one entry per document; entries equal to 1
            mark class 1, anything else is treated as class 0.

    Returns:
        Tuple ``(p1vect, p0vect, pb1)``: the per-word log-likelihood arrays
        for class 1 and class 0, and the fraction of documents in class 1.

    Raises:
        ValueError: If ``trainmatrix`` is empty or not two-dimensional, or
            if ``label`` does not have one entry per row.
    """
    counts = np.asarray(trainmatrix, dtype=float)
    labels = np.asarray(label)
    if counts.ndim != 2 or counts.shape[0] == 0:
        raise ValueError("trainmatrix must be a non-empty 2-D collection of count vectors")
    if labels.shape != (counts.shape[0],):
        raise ValueError("label must contain exactly one entry per row of trainmatrix")

    in_class1 = labels == 1
    class1_rows = counts[in_class1]
    class0_rows = counts[~in_class1]

    # The prior uses the same ``== 1`` test that splits the rows, so both
    # agree even when labels are not strictly 0/1.
    pb1 = in_class1.sum() / float(len(labels))
    p1vect = np.log((1.0 + class1_rows.sum(axis=0)) / (2.0 + class1_rows.sum()))
    p0vect = np.log((1.0 + class0_rows.sum(axis=0)) / (2.0 + class0_rows.sum()))
    return p1vect, p0vect, pb1


def classifier(vect, p1vect, p0vect, pb1):
    """Classify one word-count vector with the parameters from ``singprob``.

    Compares ln[p(w1|c) * p(w2|c) * ... * p(wn|c) * p(c)] for c = 1 and c = 0.

    Args:
        vect: Word-count vector of the document to classify.
        p1vect: Per-word log-likelihoods for class 1.
        p0vect: Per-word log-likelihoods for class 0.
        pb1: Prior probability of class 1.

    Returns:
        1 if class 1 has the strictly larger log-posterior, otherwise 0
        (ties go to class 0).
    """
    counts = np.asarray(vect, dtype=float)
    # A prior of exactly 0 or 1 makes one log term -inf; the comparison
    # below still gives the right answer, so only the warning is suppressed.
    with np.errstate(divide="ignore"):
        p1 = np.sum(counts * np.asarray(p1vect)) + np.log(pb1)
        p0 = np.sum(counts * np.asarray(p0vect)) + np.log(1.0 - pb1)
    return 1 if p1 > p0 else 0


def testparse(str):  # noqa: A002 - parameter name kept for existing callers
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        str: The text to tokenize.

    Returns:
        A list of lower-cased tokens, in order of appearance.
    """
    return [token.lower() for token in _TOKEN_SPLIT.split(str) if len(token) > 2]


# The name starts with "test", which matches pytest's default discovery
# prefix; mark it so it is never collected as a test case.
testparse.__test__ = False


def Test(data_dir="email", docs_per_class=25, test_size=10, seed=None):
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``<data_dir>/spam/<i>.txt`` (class 1) and ``<data_dir>/ham/<i>.txt``
    (class 0) for ``i`` in ``1..docs_per_class``, holds out ``test_size``
    randomly chosen documents, trains on the rest, and prints the fraction
    of held-out documents that were misclassified.

    Args:
        data_dir: Directory containing the ``spam`` and ``ham`` folders.
        docs_per_class: Number of numbered files to read from each folder.
        test_size: Number of documents to hold out for testing.
        seed: Optional seed for a private random generator.  When ``None``
            the global NumPy random state is used.

    Returns:
        The error rate on the held-out documents, as a float.

    Raises:
        ValueError: If ``test_size`` is not between 1 and the total number
            of documents minus one.
    """
    total_docs = 2 * docs_per_class
    if not 0 < test_size < total_docs:
        raise ValueError("test_size must be between 1 and %d" % (total_docs - 1))

    doc = []
    label = []
    for i in range(1, docs_per_class + 1):
        for folder, klass in (("spam", 1), ("ham", 0)):
            path = os.path.join(data_dir, folder, "%d.txt" % i)
            # io.open behaves the same on Python 2 and 3; undecodable bytes
            # are replaced rather than aborting the whole run.
            with io.open(path, encoding="utf-8", errors="replace") as handle:
                doc.append(testparse(handle.read()))
            label.append(klass)
    doclist = createlist(doc)

    rng = np.random if seed is None else np.random.RandomState(seed)
    trainingset = list(range(len(doc)))
    testset = []
    for _ in range(test_size):
        pick = int(rng.randint(len(trainingset)))
        # Move the document id itself (not its position in the shrinking
        # list) so the test and training sets stay disjoint.
        testset.append(trainingset.pop(pick))

    trainmat = [word2vec(doclist, doc[docindex]) for docindex in trainingset]
    classlabel = [label[docindex] for docindex in trainingset]
    p1, p0, pb = singprob(trainmat, classlabel)

    error = 0
    for testindex in testset:
        wordvect = word2vec(doclist, doc[testindex])
        if classifier(wordvect, p1, p0, pb) != label[testindex]:
            error += 1

    error_rate = error / float(len(testset))
    print(error_rate)
    return error_rate


if __name__ == "__main__":
    Test()