1. Classification based on Bayesian decision theory
2. Document classification with naive Bayes
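Both headings rest on Bayes' rule: a document with words w_1, ..., w_n is assigned to whichever class c has the larger posterior probability, and the "naive" part is the assumption that the words are conditionally independent given the class:

    P(c \mid w_1, \dots, w_n) \propto P(c) \prod_{i=1}^{n} P(w_i \mid c)

Taking logarithms turns the product into a sum, which is exactly the comparison the classifyNB function below performs.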
Text classification with Python

1. Preparing the data: building word vectors from text
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 is not
    # first return value: the tokenized documents, punctuation already stripped;
    # second return value: the class label of each document
    return postingList, classVec

def createVocabList(dataSet):
    # build a list of every unique word that appears in any document
    vocabSet = set([])                       # start from an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # takes the vocabulary vocabList and a document inputSet, and returns the
    # document vector: 1 if the vocabulary word occurs in the document, else 0
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
listOPosts, listClasses = loadDataSet()
print("A document:")
print(listOPosts[0])
myVocabList = createVocabList(listOPosts)  # build the vocabulary from the documents
print("Vocabulary:")
print(myVocabList)
print("Document vector:")
print(setOfWords2Vec(myVocabList, listOPosts[0]))
A document:
['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
Vocabulary:
['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']
Document vector:
[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]
2. Training the algorithm: computing probabilities from word vectors
from numpy import *

def trainNB0(trainMatrix, trainCategory):
    # naive Bayes training function: trainMatrix is the matrix whose rows are the word
    # vectors of the training documents, trainCategory the class label of each document
    numTrainDocs = len(trainMatrix)  # number of training documents
    numWords = len(trainMatrix[0])   # size of the vocabulary
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability of the abusive class
    p0Num = ones(numWords); p1Num = ones(numWords)  # initialize counts to 1 (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0                    # and the denominators to 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:           # document i is abusive
            p1Num += trainMatrix[i]         # add its word counts to the class-1 word counts
            p1Denom += sum(trainMatrix[i])  # and to the class-1 total word count
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # per-word counts in a class divided by that class's total count, then the log:
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)  # logs prevent underflow when many probabilities are multiplied
    return p0Vect, p1Vect, pAbusive
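The ones() and 2.0 initializations are Laplace smoothing: without them, a single vocabulary word never seen in one class would force that class's whole product to zero. Concretely, trainNB0 estimates, for each vocabulary word w_i and class c,

    P(w_i \mid c) = \frac{n_{i,c} + 1}{N_c + 2}

where n_{i,c} is how often w_i occurs in documents of class c and N_c is the total word count of class c, and it returns the logarithms of these estimates.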
from numpy import *

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)  # build the vocabulary
print("Vocabulary:")
print(myVocabList)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print(pAb)
print("Per-word probabilities for the abusive class:")
print(exp(p1V))  # trainNB0 returns log-probabilities; exponentiate them for display
Vocabulary:
['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']
0.5
Per-word probabilities for the abusive class:
[ 0.04761905  0.04761905  0.04761905  0.0952381   0.0952381   0.04761905
  0.04761905  0.04761905  0.0952381   0.0952381   0.04761905  0.04761905
  0.04761905  0.0952381   0.0952381   0.0952381   0.0952381   0.0952381
  0.04761905  0.14285714  0.04761905  0.0952381   0.0952381   0.04761905
  0.14285714  0.04761905  0.19047619  0.04761905  0.0952381   0.04761905
  0.04761905  0.04761905]
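These values can be checked by hand against the smoothed estimate above. 'stupid' occurs once in each of the three abusive posts, which together contain 19 tokens, so

    P(\text{stupid} \mid c_1) = \frac{3 + 1}{19 + 2} = \frac{4}{21} \approx 0.1905

which matches the largest entry in the printed vector; likewise 'worthless' and 'dog' each occur twice in the abusive posts, giving 3/21 ≈ 0.1429.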
3. Testing the algorithm: modifying the classifier for real-world conditions

The naive Bayes classification functions
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # vec2Classify is the word vector to classify; pClass1 is the prior P(class = 1)
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # sum of log-probabilities plus the log prior
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    # test the naive Bayes classifier
    listOPosts, listClasses = loadDataSet()    # load the training data
    myVocabList = createVocabList(listOPosts)  # build the vocabulary
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))  # append each document's word vector
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))  # trained parameters
    testEntry = ['love', 'my', 'dalmation']    # test document 1
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']          # test document 2
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
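The output below comes from running the test harness:

testingNB()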
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1
4. Preparing the data: the bag-of-words document model
def bagOfWords2VecMN(vocabList, inputSet):
    # bag-of-words version for naive Bayes: every occurrence of a word
    # increments the corresponding entry of the vector
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # counts can grow past 1
    return returnVec
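A quick contrast with the set-of-words model, using a made-up token list (note how the repeated word raises the count instead of capping at 1):

testDoc = ['dog', 'dog', 'stupid']             # hypothetical document with a repeated word
print(setOfWords2Vec(myVocabList, testDoc))    # the 'dog' entry is 1
print(bagOfWords2VecMN(myVocabList, testDoc))  # the 'dog' entry is 2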
Example: filtering spam email with naive Bayes

1. Preparing the data: tokenizing text
def textParse(bigString):
    # split a string into a list of tokens
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # keep lowercased tokens longer than two characters
mySent = 'This book is the best book on Python'
print(textParse(mySent))
['this', 'book', 'the', 'best', 'book', 'python']
Actual result of tokenizing a complete email:
import re
regEx = re.compile(r'\W+')  # token-boundary regular expression
emailText = open('email/ham/6.txt').read()
listOfTokens = regEx.split(emailText)
print(listOfTokens)
2. Testing the algorithm: cross-validation with naive Bayes
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        # parse each of the 25 spam and 25 ham text files into a token list
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)   # docList keeps every document as its own list
        fullText.extend(wordList)  # fullText is one flat list of all tokens
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)         # build the vocabulary
    trainingSet = list(range(50)); testSet = []  # trainingSet starts as the indices 0..49
    for i in range(10):                          # move 10 randomly chosen indices into the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:                 # the remaining 40 documents train the classifier
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))  # train the classifier
    errorCount = 0
    for docIndex in testSet:                     # classify the held-out documents
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))
    #return vocabList, fullText
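Because the ten test documents are drawn at random, the error rate varies from run to run (averaging several runs gives a better estimate). The output below is one sample run of:

spamTest()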
classification error ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']
classification error ['yay', 'you', 'both', 'doing', 'fine', 'working', 'mba', 'design', 'strategy', 'cca', 'top', 'art', 'school', 'new', 'program', 'focusing', 'more', 'right', 'brained', 'creative', 'and', 'strategic', 'approach', 'management', 'the', 'way', 'done', 'today']
the error rate is: 0.2
Example: using a naive Bayes classifier to uncover regional preferences from personal ads

1. Collecting the data: importing RSS feeds, plus a high-frequency-word removal function
def calcMostFreq(vocabList, fullText):
    # count how often each vocabulary word occurs in fullText
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # the 30 most frequent words

def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)            # build the vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    print("The 30 most frequent words:")
    print(top30Words)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # build the training and test sets
    trainingSet = list(range(2 * minLen)); testSet = []
    for i in range(20):  # test set of 20 randomly chosen documents
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # build the training matrix and label list
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))  # train the classifier
    errorCount = 0
    for docIndex in testSet:  # classify the remaining documents as SF or NY and tally errors
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
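A sketch of how localWords is called, following the book's craigslist example; the two RSS URLs are illustrative and these feeds may no longer serve data:

import feedparser
# illustrative feed URLs (the craigslist RSS feeds from the book may no longer be live)
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = localWords(ny, sf)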
2. Analyzing the data: displaying region-specific words
def getTopWords(ny, sf):
    # display the most characteristic words for each region
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -4.5:  # keep words whose log-probability clears the threshold
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -4.5:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
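With the same two parsed feeds (same caveat about the URLs as above), the display function is then called as:

getTopWords(ny, sf)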