• 《机器学习实战》源码:使用朴素贝叶斯进行文档分类


     1 import numpy as np
     2 
     def loadDataSet():
         """Return the built-in toy corpus and its class labels.

         Returns:
             A ``(postingList, classVec)`` tuple. ``postingList`` is a list of
             six tokenized posts (lists of strings) and ``classVec`` holds one
             0/1 label per post, in the same order. Fresh lists are built on
             every call.
         """
         postingList = [
             ["my", "dog", "has", "flea", "problems", "help", "please"],
             ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"],
             ["my", "dalmation", "is", "so", "cute", "I", "love", "him"],
             ["stop", "posting", "stupid", "worthless", "garbage"],
             # NOTE(review): the leading "my" may be a transcription slip for
             # "mr" -- confirm against the reference dataset before changing.
             ["my", "licks", "ate", "my", "steak", "how", "to", "stop", "him"],
             # "quit" was misspelled "qiut" in the training data.
             ["quit", "buying", "worthless", "dog", "food", "stupid"],
         ]
         classVec = [0, 1, 0, 1, 0, 1]
         return postingList, classVec
    16 
    def createVocabList(dataSet):
        """Build the vocabulary: every distinct token found in any document.

        Args:
            dataSet: iterable of documents, each an iterable of tokens.

        Returns:
            A list of the unique tokens. The order comes from set iteration,
            so it is arbitrary.
        """
        vocabulary = set()
        for tokens in dataSet:
            # Fold this document's tokens into the running set of unique words.
            vocabulary.update(tokens)
        return list(vocabulary)
    22 
    def setOfWords2Vec(vocabList, inputSet):
        """Encode a document as a 0/1 presence vector over the vocabulary.

        Args:
            vocabList: list of vocabulary words; position i of the result
                corresponds to ``vocabList[i]``.
            inputSet: iterable of (hashable) words making up the document.

        Returns:
            A list of ``len(vocabList)`` ints, 1 where the vocabulary word
            occurs in ``inputSet`` and 0 elsewhere. Words missing from the
            vocabulary are reported on stdout and otherwise ignored.
        """
        # Build the word -> position table once instead of scanning the list
        # twice per word. setdefault keeps the first position of a repeated
        # word, which is what list.index() would return.
        position = {}
        for idx, term in enumerate(vocabList):
            position.setdefault(term, idx)

        returnVec = [0] * len(vocabList)
        for word in inputSet:
            idx = position.get(word)
            if idx is None:
                # Single parenthesized argument: valid on Python 2 and 3.
                print("the word: %s is not in my Vocabulary!" % word)
            else:
                returnVec[idx] = 1
        return returnVec
    31 
    def trainNB0(trainMatrix, trainCategory):
        """Estimate the naive Bayes parameters from word vectors and 0/1 labels.

        Args:
            trainMatrix: sequence of equal-length numeric word vectors, one
                per training document.
            trainCategory: sequence of labels, one per row; rows labelled 1
                go to class 1 and every other row goes to class 0.

        Returns:
            ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
            probabilities for class 0 and class 1, and the fraction of
            training labels that sum to class 1.
        """
        doc_count = len(trainMatrix)
        vocab_size = len(trainMatrix[0])
        prior_class1 = sum(trainCategory) / float(doc_count)

        # Index 0 / 1 holds the accumulators for class 0 / class 1.
        # Counts start at 1 and totals at 2.0 so no word gets probability 0.
        word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
        word_totals = [2.0, 2.0]

        for row_idx, row in enumerate(trainMatrix):
            label = 1 if trainCategory[row_idx] == 1 else 0
            word_counts[label] += row
            word_totals[label] += np.sum(row)

        # Work in log space so later products of many small probabilities
        # become sums.
        log_p1 = np.log(word_counts[1] / word_totals[1])
        log_p0 = np.log(word_counts[0] / word_totals[0])
        return log_p0, log_p1, prior_class1
    48 
    def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
        """Pick the class with the larger score for one word vector.

        Each class score is the sum of the element-wise product of
        ``vec2Classify`` with that class's vector, plus the log of the class
        prior (``pClass1`` for class 1, ``1 - pClass1`` for class 0).

        Returns:
            1 when the class-1 score is strictly greater, otherwise 0 (ties
            go to class 0).
        """
        score_class1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
        score_class0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
        return 1 if score_class1 > score_class0 else 0
    56 
    def testingNB():
        """Train on the built-in corpus and print the class of two sample posts.

        Loads the toy data set, builds the vocabulary, converts each post to
        a set-of-words vector, trains with ``trainNB0`` and then classifies
        two hard-coded test entries, printing one result line per entry.
        Returns None.
        """
        listOPosts, listClasses = loadDataSet()
        myVocabList = createVocabList(listOPosts)
        trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                    for postinDoc in listOPosts]
        p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

        for testEntry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
            thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
            label = classifyNB(thisDoc, p0V, p1V, pAb)
            # One pre-formatted string prints the same text on Python 2 and 3
            # (a multi-argument print(...) would print a tuple on Python 2).
            print("%s classified as : %s" % (testEntry, label))
    70 
    def bagOfWords2VecMN(vocabList, inputSet):
        """Encode a document as a vector of word counts over the vocabulary.

        Args:
            vocabList: list of vocabulary words; position i of the result
                corresponds to ``vocabList[i]``.
            inputSet: iterable of (hashable) words making up the document;
                repeated words are counted each time they occur.

        Returns:
            A list of ``len(vocabList)`` ints holding how many times each
            vocabulary word occurs in ``inputSet``. Words missing from the
            vocabulary are silently ignored.
        """
        # Build the word -> position table once instead of scanning the list
        # twice per word. setdefault keeps the first position of a repeated
        # word, which is what list.index() would return.
        position = {}
        for idx, term in enumerate(vocabList):
            position.setdefault(term, idx)

        returnVec = [0] * len(vocabList)
        for word in inputSet:
            idx = position.get(word)
            if idx is not None:
                returnVec[idx] += 1
        return returnVec
    77 
    if __name__ == "__main__":
        # testingNB() prints its own results and returns None, so call it
        # directly; wrapping it in print would add a stray "None" line.
        testingNB()
  • 相关阅读:
    Memcached 笔记与总结(5)Memcached 的普通哈希分布和一致性哈希分布
    大数据价值挖掘:聚焦商业模式探索
    微博商业数据挖掘方法
    微博商业数据挖掘方法
    如何用大数据探索市场营销实践
    如何用大数据探索市场营销实践
    数据挖掘基本任务
    数据挖掘基本任务
    医疗大数据分析深入浅出
    医疗大数据分析深入浅出
  • 原文地址:https://www.cnblogs.com/guochangyu/p/7737652.html
Copyright © 2020-2023  润新知