• 朴素贝叶斯


    从词向量计算概率

    
    import numpy as np
    
    def loadDataSet():
        """
        实验样本
        :return: 第一个变量是进行词条切分后的文档集合,第二个变量是一个类别标签的集合
        """
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0, 1, 0, 1, 0, 1]  # 人工标记 【0,stupid愚蠢,0,stupid愚蠢worthless垃圾garbage一文不值,0,worthless垃圾stupid愚蠢】   #1 is abusive, 0 not
        return postingList, classVec
    
    def createVocabList(dataSet):
        """
        创建一个包含所以文档中出现的不重复词的列表
        :param dataSet:
        :return:
        """
        vocabSet = set([])  #create empty set
        for document in dataSet:
            vocabSet = vocabSet | set(document)  # 并集 #union of the two sets
        return list(vocabSet)
    
    def setOfWords2Vec(vocabList, inputSet):
        """
    
        :param vocabList: 词汇表
        :param inputSet: 某个文档
        :return: 词汇表长度的列表,1表示出现,0没有出现
        """
        returnVec = [0]*len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] = 1
            else: print("the word: %s is not in my Vocabulary!" % word)
        return returnVec
    
    def trainNB0(trainMatrix, trainCategory):
        """
        分类器训练
        :param trainMatrix: 文档矩阵 训练集
        :param trainCategory: 文档类别标签向量
        :return:
        """
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])
        pAbusive = sum(trainCategory)/float(numTrainDocs)
        p0Num = np.zeros(numWords); p1Num = np.zeros(numWords)      #change to np.ones()
        # p0Num = np.ones(numWords); p1Num = np.ones(numWords)      #change to np.ones()
        p0Denom = 0.0; p1Denom = 0.0                        #change to 2.0
        # p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        p1Vect = p1Num/p1Denom          #change to np.log()
        # p1Vect = np.log(p1Num/p1Denom)          #change to np.log()
        p0Vect = p0Num/p0Denom         #change to np.log()
        # p0Vect = np.log(p0Num/p0Denom)          #change to np.log()
        return p0Vect, p1Vect, pAbusive
    
    if __name__ == '__main__':
        listOPosts, listClasses = loadDataSet()
        myVocabList = createVocabList(listOPosts)
        print(myVocabList)
    
        trainMat = []
        for postinDoc in listOPosts:
            trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
        print(trainMat)
        p0v, p1v, pAb = trainNB0(trainMat, listClasses)
        print(pAb)
        print(p0v)
        print(p1v)
    
    '''
    ['love', 'take', 'cute', 'so', 'flea', 'posting', 'stop', 'help', 'mr', 'stupid', 'ate', 'garbage', 'has', 'I', 'problems', 'licks', 'worthless', 'is', 'how', 'not', 'maybe', 'dalmation', 'food', 'buying', 'please', 'him', 'park', 'quit', 'steak', 'my', 'dog', 'to']
    [[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]]
    0.5
    [0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
     0.04166667 0.04166667 0.04166667 0.         0.04166667 0.
     0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
     0.04166667 0.         0.         0.04166667 0.         0.
     0.04166667 0.08333333 0.         0.         0.04166667 0.125
     0.04166667 0.04166667]
    [0.         0.05263158 0.         0.         0.         0.05263158
     0.05263158 0.         0.         0.15789474 0.         0.05263158
     0.         0.         0.         0.         0.10526316 0.
     0.         0.05263158 0.05263158 0.         0.05263158 0.05263158
     0.         0.05263158 0.05263158 0.05263158 0.         0.
     0.10526316 0.05263158]
    
    
    cute 在类别0中出现1次,类别1中出现0次,对应的条件概率分别是0.04166667与0.
    
    p1v中最大概率是0.15789474对应stupid,这意味stupid是最能表征类别1的单词
    
    '''
    

    修改分类器

    计算多个概率的乘积以获得文档属于某个类别的概率。如果其中一个概率值为0,那么最后的乘积也为0,
    为降低这种影响,可以将所有词的出现次数初始化为1,将分母初始化为2
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    下溢出问题,由于太多很小的数相乘造成的,通过求对数可以避免下溢出或浮点数舍入导致的错误
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)

    分类函数

    def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
        """
        分类函数
        :param vec2Classify:
        :param p0Vec:
        :param p1Vec:
        :param pClass1:
        :return:
        """
        p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
        p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
        if p1 > p0:
            return 1
        else:
            return 0
    

    测试

    def testingNB():
        listOPosts, listClasses = loadDataSet()
        myVocabList = createVocabList(listOPosts)
        trainMat = []
        for postinDoc in listOPosts:
            trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
        p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
        testEntry = ['stupid', 'garbage']
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    
  • 相关阅读:
    安装DotNetCore.1.0.1-VS2015Tools.Preview2.0.2出现0x80072f8a未指定的错误
    [迷宫中的算法实践]迷宫生成算法——Prim算法
    [MVC学习笔记]7.使用极验验证来制作更高逼格的验证码
    Android UI--提高Android UI体验
    Genymotion的使用 -- A Faster Android Emulator
    【Xamarin】Visual Studio 2013 Xamarin for Android开发环境搭建与配置&Genymotion
    【ASP.NET Web API2】利用HttpClient调用Web API(TODO)
    VMware 11 安装 Mac OS X10.10
    ASP.NET MVC 缓存Outputcache (局部动态)
    【ASP.NET Web API2】初识Web API
  • 原文地址:https://www.cnblogs.com/fly-book/p/14215301.html
Copyright © 2020-2023  润新知