"""Toy naive Bayes text classifier over a small hard-coded set of posts.

Every ``print`` call below passes a single pre-formatted string, so the
module produces the same output under Python 2 and Python 3.
"""
import numpy as np


def loadDataSet():
    """Return the sample posts and their class labels.

    Returns:
        A ``(postingList, classVec)`` tuple: ``postingList`` is a list of
        tokenised posts (lists of words) and ``classVec`` holds one 0/1
        label per post, in the same order.
    """
    # NOTE(review): "qiut" looks like a typo for "quit"; it is left as-is
    # because changing a token would change the resulting vocabulary.
    postingList = [
        ["my", "dog", "has", "flea", "problems", "help", "please"],
        ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"],
        ["my", "dalmation", "is", "so", "cute", "I", "love", "him"],
        ["stop", "posting", "stupid", "worthless", "garbage"],
        ["my", "licks", "ate", "my", "steak", "how", "to", "stop", "him"],
        ["qiut", "buying", "worthless", "dog", "food", "stupid"],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the distinct words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list with each distinct word exactly once. The order comes from
        iterating a ``set`` and is therefore arbitrary.
    """
    vocabSet = set()
    for document in dataSet:
        # Grow one set in place instead of building a new union per document.
        vocabSet.update(document)
    return list(vocabSet)


def _firstIndexByWord(vocabList):
    """Map each vocabulary word to the index of its first occurrence."""
    wordIndex = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        wordIndex.setdefault(word, position)
    return wordIndex


def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; a word's position is its slot.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere. Words missing from the vocabulary
        are reported on stdout and otherwise ignored.
    """
    # One dict lookup per word instead of an `in` scan plus an `index` scan.
    wordIndex = _firstIndexByWord(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class log word probabilities and the class-1 prior.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document (must contain at least one document).
        trainCategory: sequence of labels, one per document. A label equal
            to 1 counts towards class 1; any other value counts as class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the two vectors hold the
        natural log of each word's smoothed relative frequency within
        class 0 and class 1, and ``pAbusive`` is the sum of the labels
        divided by the number of documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 and denominators at 2 so a word never seen in a
    # class does not get probability zero (and log(0)).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, wordVector in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += wordVector
            p1Denom += np.sum(wordVector)
        else:
            p0Num += wordVector
            p0Denom += np.sum(wordVector)

    # Work in log space so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the higher log score for one word vector.

    Args:
        vec2Classify: numeric word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the sample posts and print the class of two test entries."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    for testEntry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        label = classifyNB(thisDoc, p0V, p1V, pAb)
        print("%s classified as : %s" % (testEntry, label))


def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; a word's position is its slot.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in ``inputSet``. Words missing from the
        vocabulary are silently skipped.
    """
    wordIndex = _firstIndexByWord(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec


if __name__ == "__main__":
    # testingNB() prints its own results and returns nothing, so it is
    # called directly rather than printed.
    testingNB()