• 机器学习之--朴素贝叶斯应用之判断 好评 差评


    import re
    import numpy as np
    
    def file_do(filename):
        """Load a labelled review file and split it into word lists and labels.

        Every line is scanned for runs of CJK characters (U+4E00 to U+9FFF);
        anything else (ASCII letters, digits, punctuation) is discarded.  On
        each line that contains at least one such run, the first run is taken
        as the label and the remaining runs as the words of that review.

        Args:
            filename: Path of a UTF-8 encoded text file, one review per line.

        Returns:
            A ``(words, labels)`` tuple.  ``words`` is a list of word lists and
            ``labels`` the list of label strings; both are index-aligned.
        """
        with open(filename, 'rb') as f:
            data = f.read().decode('utf-8')

        # The character class needs real \u escapes: without the backslashes it
        # matches ASCII characters instead of Chinese ones.
        chinese_run = re.compile(r'[\u4e00-\u9fff]+')

        words = []
        labels = []
        for line in data.split('\n'):
            tokens = chinese_run.findall(line)
            if tokens:
                labels.append(tokens[0])
                words.append(tokens[1:])
        print('内words:', words)
        print('内labels:', labels)
        return words, labels
    # filename = 'C:/Users/cuit/Desktop/文本测试数据/training-1000.txt'
    # words,labels = file_do(filename)
    
    #创建训练样本的词汇表
    def createvacablist(words):
        """Build the vocabulary: every distinct word in the given samples.

        Args:
            words: Iterable of word lists (one list per sample).

        Returns:
            A list containing each distinct word exactly once, in the order the
            words are first encountered.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # vocabulary (and therefore every vector column) is identical from run
        # to run; a plain set would be ordered by randomised string hashes.
        return list(dict.fromkeys(word for row in words for word in row))
    # vacablist = createvacablist(words)
    # print('外vacablist:',vacablist)
    # print(vacablist.index('愤怒'))
    
    # Convert a word list into a word-count vector over the vocabulary
    def setdata(vacablist, inputdata):
        """Turn a list of words into a count vector aligned with the vocabulary.

        Args:
            vacablist: Vocabulary list; position ``k`` of the result counts
                occurrences of ``vacablist[k]``.
            inputdata: Iterable of words to count.

        Returns:
            A float NumPy array of length ``len(vacablist)``.  Words that are
            not in the vocabulary are reported on stdout and otherwise ignored.
        """
        counts = np.zeros(len(vacablist))
        for token in inputdata:
            if token not in vacablist:
                print('词汇表里没有 :', token)
                continue
            position = vacablist.index(token)
            counts[position] += 1
        return counts
    # setdata(vacablist,words[0])
    
    # Training function: compute the probability of each word under each label
    def P1(words, labels, vacablist):
        """Train the classifier: per-word scores for each of the two classes.

        A sample whose label equals '好评' is counted as class 1; every other
        label is counted as class 0.

        Args:
            words: List of word lists, one per training sample.
            labels: List of label strings, index-aligned with ``words``.
            vacablist: Vocabulary list used to vectorise each sample.

        Returns:
            A ``(data_1, data_0)`` tuple of NumPy arrays of length
            ``len(vacablist)``.  Entry ``k`` is
            ``(count of word k in the class + 1) / (words in the class + 2)``
            multiplied by that class's prior probability.

        Raises:
            ZeroDivisionError: If ``labels`` is empty.
        """
        # Prior probability of class 1 (the '好评' label).
        labels_1count = sum(1 for label in labels if label == '好评')
        plabels_1 = labels_1count / len(labels)
        print('plabels_1:', plabels_1)

        # Add-one smoothing: every word starts with a count of 1.
        # NOTE(review): the denominators start at 2 rather than at the
        # vocabulary size, and the prior is folded into every word score below;
        # both are kept as-is because classfy consumes these exact values.
        count_1 = 2
        count_0 = 2
        data_1 = np.ones(len(vacablist))
        data_0 = np.ones(len(vacablist))

        # Vectorise one sample at a time instead of keeping every dense vector
        # in memory, and let NumPy do the summation.
        for i, row in enumerate(words):
            vector = setdata(vacablist, row)
            if labels[i] == '好评':
                data_1 = data_1 + vector
                count_1 += vector.sum()
            else:
                data_0 = data_0 + vector
                count_0 += vector.sum()

        print('count_1:{},count_0:{}'.format(count_1, count_0))
        data_1 = (data_1 / count_1) * plabels_1
        data_0 = (data_0 / count_0) * (1 - plabels_1)
        print('data_1:', data_1)
        print('data_0:', data_0)
        return data_1, data_0
    
    # data_1,data_0 = P1(words,labels,vacablist)
    # print('data_1:',data_1)
    # print('data_0:',data_0)
    
    def classfy(data_1, data_0, vacablist, test_data):
        """Classify one review as '好评' or '差评'.

        Args:
            data_1: Per-word scores for class 1 ('好评'), aligned with
                ``vacablist``.
            data_0: Per-word scores for class 0 ('差评'), aligned with
                ``vacablist``.
            vacablist: Vocabulary list.
            test_data: List of words of the review to classify.

        Returns:
            '好评' if class 1 scores higher, '差评' if class 0 scores higher,
            or ``-1`` when the two scores are equal (for example when none of
            the words are in the vocabulary).
        """
        set_tdata = setdata(vacablist, test_data)
        present = set_tdata != 0

        # Compare sums of logarithms instead of raw products: multiplying many
        # small scores underflows to 0.0 for both classes on longer reviews,
        # which would turn a clear decision into a tie.  Each distinct known
        # word contributes once; a positive per-word factor common to both
        # classes cannot change the comparison, so none is applied.
        with np.errstate(divide='ignore'):  # a zero score maps to -inf
            log_p1 = np.log(np.asarray(data_1, dtype=float)[present]).sum()
            log_p0 = np.log(np.asarray(data_0, dtype=float)[present]).sum()

        if log_p1 > log_p0:
            return '好评'
        if log_p1 < log_p0:
            return '差评'
        return -1
    
    # mydata = ['这个','酒店','马马虎虎']
    # classfy(data_1,data_0,vacablist,mydata)
    
    #测试文档预测
    def test(train_file='C:/Users/cuit/Desktop/文本测试数据/training-1000.txt',
             test_file='C:/Users/cuit/Desktop/文本测试数据/test-1000.txt'):
        """Train on one labelled file, evaluate on another and print the accuracy.

        Args:
            train_file: Path of the labelled training file.
            test_file: Path of the labelled evaluation file.

        Both files are read with ``file_do``.  The number of correct
        predictions, the number of evaluation samples and the resulting
        accuracy are printed to stdout.
        """
        # Training data
        words, labels = file_do(train_file)
        print('len(words):', len(words))
        vacablist = createvacablist(words)
        data_1, data_0 = P1(words, labels, vacablist)

        # Evaluation data
        print()
        test_words, test_labels = file_do(test_file)
        yes_count = 0
        for sample, expected in zip(test_words, test_labels):
            # classfy returns '好评', '差评' or -1; -1 never equals a label.
            if classfy(data_1, data_0, vacablist, sample) == expected:
                yes_count += 1

        # An empty evaluation file is reported as accuracy 0.0 instead of
        # failing with a division by zero.
        corect = yes_count / len(test_labels) if test_labels else 0.0
        print('正确了{}个,总共有{}个,准确率为{}'.format(yes_count, len(test_labels), corect))
    
    # Run the evaluation only when executed as a script, so that importing the
    # module does not start a training run.
    if __name__ == '__main__':
        test()

  • 相关阅读:
    编译环境及编译器介绍
    linux下同步window的firefox
    DPDK pdump抓包说明
    linux TCP协议(1)---连接管理与状态机
    Linux用户态数据发送和接收
    DPDK之内存管理
    linux socket系统调用层
    linux网络栈结构
    DPDK mbuf何时释放回内存池?
    虚拟设备之linux网桥
  • 原文地址:https://www.cnblogs.com/cxhzy/p/10655498.html
Copyright © 2020-2023  润新知