• A Naive-Bayes-Based Sentiment Classification System for User Reviews


    Perform sentiment classification on users' movie reviews.

    First, take the URL of a movie on Douban as input and crawl its reviews.

    Then write the crawled reviews to Excel spreadsheets.

    Clean the data in the spreadsheets.

    Store the data in a MySQL database (a sketch of the cleaning and storage step follows this list).

    Segment the text and count word frequencies.

    Run the naive Bayes algorithm to classify the sentiment.

    Print the positive and negative reviews.

    Store the results in the database.

    Done!
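
    The cleaning and MySQL-storage step is not implemented in the listing below (create_engine is imported but never used), so here is a minimal sketch of how it could look. It assumes the two .xls files produced by the converter, the same MySQL credentials used later in the listing, and a table name ("comments") and CSV layout of my own choosing, arranged so that function3's column accesses still work:

    # A minimal sketch of the cleaning + MySQL step (table name "comments" is assumed)
    import pandas as pd
    from sqlalchemy import create_engine

    def clean_and_store():
        # Label the two spreadsheets: 1 = positive review, 0 = negative review
        good = pd.read_excel('G:/comments/好评.xls', header=None, names=['comment'])
        good['Fraction'] = 1
        bad = pd.read_excel('G:/comments/差评.xls', header=None, names=['comment'])
        bad['Fraction'] = 0
        data = pd.concat([good, bad], ignore_index=True)

        # Basic cleaning: strip whitespace, drop empty rows and duplicates
        data['comment'] = data['comment'].astype(str).str.strip()
        data = data[data['comment'] != ''].drop_duplicates()

        # Write the CSV that function3 reads (index column first, so iloc[:, 1] is the
        # comment text), then mirror the same table into MySQL via SQLAlchemy
        data.to_csv('G:/comments/comments.csv', index_label='id')
        engine = create_engine('mysql+pymysql://root:liutaiqing@localhost/testdb?charset=utf8')
        data.to_sql('comments', engine, if_exists='replace', index=False)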

    # -*-coding:utf-8-*-
    
    import urllib.request
    from bs4 import BeautifulSoup
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt  # plotting module
    import pandas as pd
    from numpy import *
    import jieba
    import xlwt
    import codecs
    from pylab import mpl
    import os
    import pymysql
    pymysql.install_as_MySQLdb()
    from sqlalchemy import create_engine
    
    def getHtml(url):
        """Fetch the page at url"""
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        req = urllib.request.Request(url,headers=headers)
        req = urllib.request.urlopen(req)
        content = req.read().decode('utf-8')
        return content
    
    def getComment(url):
        """Parse the HTML page and return the comments on one page"""
        html = getHtml(url)
        soupComment = BeautifulSoup(html, 'html.parser')
    
        comments = soupComment.findAll('span', 'short')
        onePageComments = []
        for comment in comments:
            # print(comment.getText()+'\n')
            onePageComments.append(comment.getText()+'\n')
    
        return onePageComments
    
    # Crawl Douban movie reviews (positive and negative) into two text files
    def function1():
        print("请输入要爬取评论的电影的网址(豆瓣)")
        url11 = input()
        url1 = str(url11)
        # Extract the subject id from the URL (assumes the standard
        # https://movie.douban.com/subject/<id>/ format)
        ur = url1[33:41]
        f = open('G:/好评.txt', 'w', encoding='utf-8')
        for page in range(200):  # Douban requires login/verification when crawling many pages
            # start advances by 20 per page (20 comments per page)
            url = 'https://movie.douban.com/subject/' + str(ur) + '/comments?start=' + str(
              20 * page) + '&limit=20&sort=new_score&status=P&percent_type=h'
            print('第%s页的评论:' % (page + 1))
            print(url + '\n')
            for i in getComment(url):
                f.write(i)
                print(i)
            print('\n')
        f.close()
        f2 = open('G:/差评.txt', 'w', encoding='utf-8')
        for page2 in range(200):
            url = 'https://movie.douban.com/subject/' + str(ur) + '/comments?start=' + str(
              20 * page2) + '&limit=20&sort=new_score&status=P&percent_type=l'
            print('第%s页的评论:' % (page2 + 1))
            print(url + '\n')
            for i in getComment(url):
                f2.write(i)
                print(i)
            print('\n')
        f2.close()
    
    # Convert the review text files to Excel
    def function2():
        # The text files are assumed to have been moved to G:/comments/ during cleaning
        file = open("G:/comments/好评.txt", "r", encoding="UTF-8")
        a1 = file.readlines()
        workbook1 = xlwt.Workbook(encoding="UTF-8")
        worksheet1 = workbook1.add_sheet('ltq')
        for i in range(len(a1)):
            worksheet1.write(i, 0, a1[i])
            print(a1[i])
        workbook1.save('G:/comments/好评.xls')
    
        file = open("G:/comments/差评.txt", "r", encoding="UTF-8")
        a2 = file.readlines()
        workbook2 = xlwt.Workbook(encoding="UTF-8")
        worksheet2 = workbook2.add_sheet('ltq')
        for i in range(len(a2)):
            worksheet2.write(i, 0, a2[i])
            print(a2[i])
        workbook2.save('G:/comments/差评.xls')
    
    # Classify the reviews with naive Bayes
    
    def function3():
        #         Step 1: read the data and segment the text
        #
        data = pd.read_csv("G:/comments/comments.csv")
        # print(data)
    
        # Take the column at index 1, which holds the comment text
        print("获取第一列内容")
        col = data.iloc[:, 1]
        # Take its values as an array
        arrs = col.values
    
        # Remove stop words
        #stopwords = {}.fromkeys([',', '。', '!', '这', '我', '非常'])
        stopwords = [line.strip() for line in open("G:/comments/中文停用词表.txt",encoding="utf-8").readlines()]
        #print("\n中文分词后结果:")
        corpus = []
        for a in arrs:
            seglist = jieba.cut(a, cut_all=False)  # accurate mode
            final = ''
            for seg in seglist:
                if seg not in stopwords:  # keep only tokens that are not stop words
                    final += seg
            seg_list = jieba.cut(final, cut_all=False)
            output = ' '.join(list(seg_list))  # join the tokens with spaces
            #print(output)
            corpus.append(output)
        ####################################
        #         Step 2: compute term frequencies
        #
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
    
        vectorizer = CountVectorizer()  # converts the corpus into a term-frequency matrix
        X = vectorizer.fit_transform(corpus)  # count how often each term occurs
        word = vectorizer.get_feature_names()  # all terms in the vocabulary
        for w in word:  # inspect the vocabulary
            print(w)
        print('')
        print(X.toarray())
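
        # A sketch (my addition): TfidfTransformer is imported above but never used.
        # TF-IDF weights often work better than raw counts for text classification;
        # the weighted matrix below could replace X in the training step if desired.
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(X)  # re-weight the count matrix with TF-IDF
        # print(tfidf.toarray())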
    
        ####################################
        #         Step 3: train the classifier and predict
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.metrics import precision_recall_curve
        from sklearn.metrics import classification_report
    
        # Use the first 4000 rows for training and the remaining rows for prediction
    
        print(u"\n\n数据分析:")
        X = X.toarray()
        x_train = X[:4000]
        x_test = X[4000:]
        # 1 = positive review, 0 = negative review
        # y_train = [1,1,0,0,1,0,0,1]
        y_train1 = data['Fraction'].tolist()
        y_train2 = y_train1[:4000]
        y_train = array(y_train2)
    
        print(y_train)
        # True labels of the held-out rows (assumes the Fraction column is filled for them too)
        y_test = array(y_train1[4000:])
    
        # Fit the MultinomialNB classifier and predict on the held-out rows
        clf = MultinomialNB().fit(x_train, y_train)
        pre = clf.predict(x_test)
    
        print("1表示好评,0表示差评")
        print("评论预测结果为:_____________________________________________________________________________________________________________________________________________")
        com_list1 = data['comment'].tolist()
        com_list2 = com_list1[4000:]
        j = 0
        for i in com_list2:
            print(i, "   :", pre[j])
            # print(pre[j])
            j = j + 1
    
        # Print the positive reviews:
        print("是否输出全部好评? 1:是  0:否")
        selectone=input()
        selectone=int(selectone)
        if selectone==1:
            print("查看所有好评_________________________________________________________________________________________________________________________________________________")
            j = 0
            for i in com_list2:
                pre[j] = int(pre[j])
                if pre[j] == 1:
                    print(i)
                j = j + 1
        # Print the negative reviews:
        print("是否输出所有差评? 1:是  0:否")
        selectsecond=input()
        selectsecond=int(selectsecond)
        if selectsecond==1:
            print("查看所有差评—————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————")
            j = 0
            for i in com_list2:
                pre[j] = int(pre[j])
                if pre[j] == 0:
                    print(i)
                j = j + 1
        # print(u"预测结果:",pre)
        # print(u"真实结果:",y_test)
    
        print(classification_report(y_test, pre))
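
        # A sketch (my addition): the overview also mentions storing the classified
        # results in the database; this reuses the SQLAlchemy engine support imported
        # at the top of the file (the table name "classified_comments" is my assumption).
        engine = create_engine('mysql+pymysql://root:liutaiqing@localhost/testdb?charset=utf8')
        result = pd.DataFrame({'comment': com_list2, 'Fraction': pre})
        result.to_sql('classified_comments', engine, if_exists='replace', index=False)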
    
    # Generate a histogram of the most frequent words and store it in MySQL

    def function4():
        mpl.rcParams['font.sans-serif'] = ['FangSong']  # set the default font
        mpl.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly in saved figures
        plt.rcParams['font.sans-serif'] = ['SimHei']
    
        txt = open("G:/差评.txt", encoding="utf-8").read()
        # Load the stop-word list
        stopwords = [line.strip() for line in open("G:/comments/中文停用词表.txt", encoding="utf-8").readlines()]
        words = jieba.lcut(txt)
        counts = {}
        for word in words:
            # skip stop words
            if word not in stopwords:
                # skip single-character tokens
                if len(word) == 1:
                    continue
                else:
                    counts[word] = counts.get(word, 0) + 1
        items = list(counts.items())
        items.sort(key=lambda x: x[1], reverse=True)
        for i in range(50):
            word, count = items[i]
            print("{:<10}{:>7}".format(word, count))
        label = list(map(lambda x: x[0], items[:10]))
        value = list(map(lambda y: y[1], items[:10]))
    
        plt.bar(range(len(value)), value, tick_label=label)
        plt.savefig("G:/filename.png")
        # plt.show()
    
        # Open the database connection and store the figure as a BLOB

        # Read the image file
        # fp = open("test.jpg",'rb',encoding='utf-8')
        fp = open("G:/filename.png", 'rb')
        img = fp.read()
        fp.close()
        db = pymysql.connect(host="localhost", user="root", password="liutaiqing", database="testdb", charset='utf8')
        # Get a cursor
        cursor = db.cursor()
        # Pass the image bytes as a query parameter so they are escaped as binary data
        # cursor.execute("INSERT INTO demo_pic_repo SET touxiang_data= %s" % pymysql.Binary(img))
        sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
        cursor.execute(sql, (img,))
        # Commit, otherwise the insert is not persisted
        db.commit()
        # Close the cursor
        cursor.close()
        # Close the database connection
        db.close()
    
    
    # Generate a word cloud and store it in MySQL
    def function5():
        # Word cloud of the negative reviews
        path_txt2 = 'G:/comments/差评.txt'
        f2 = open(path_txt2, 'r', encoding='UTF-8').read()
        cut_text2 = " ".join(jieba.cut(f2))
        wordcloud2 = WordCloud(
            # Set a font, otherwise Chinese characters render as boxes; this is the usual
            # Windows font path and can be replaced with any other font
            font_path="C:/Windows/Fonts/simfang.ttf",
            # Background colour, width and height
            background_color="white", width=1100, height=1000).generate(cut_text2)
        plt.imshow(wordcloud2, interpolation="bilinear")
        plt.axis("off")
        # plt.show()
        wordcloud2.to_file("G:/词云图片.jpg")
    
    
        # Open the database connection and store the word cloud image as a BLOB

        # Read the image file
        # fp = open("test.jpg",'rb',encoding='utf-8')
        fp = open("G:/词云图片.jpg", 'rb')
        img = fp.read()
        fp.close()
        db = pymysql.connect(host="localhost", user="root", password="liutaiqing", database="testdb", charset='utf8')
        # Get a cursor
        cursor = db.cursor()
        # Pass the image bytes as a query parameter so they are escaped as binary data
        # cursor.execute("INSERT INTO demo_pic_repo SET touxiang_data= %s" % pymysql.Binary(img))
        sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
        cursor.execute(sql, (img,))
        # Commit, otherwise the insert is not persisted
        db.commit()
        # Close the cursor
        cursor.close()
        # Close the database connection
        db.close()
    
    
    
    def function6():
        # Word cloud of the positive reviews
        path_txt1 = 'G:/comments/好评.txt'
        f1 = open(path_txt1, 'r', encoding='UTF-8').read()
        # Segment with jieba first; wordcloud cannot build a correct Chinese word cloud from raw text
        cut_text1 = " ".join(jieba.cut(f1))

        wordcloud1 = WordCloud(
            # Set a font, otherwise Chinese characters render as boxes; this is the usual
            # Windows font path and can be replaced with any other font
            font_path="C:/Windows/Fonts/simfang.ttf",
            # Background colour, width and height
            background_color="white", width=1100, height=1000).generate(cut_text1)
    
        plt.imshow(wordcloud1, interpolation="bilinear")
        plt.axis("off")
        # plt.show()
        wordcloud1.to_file("G:/词云图片.jpg")
        # Open the database connection and store the word cloud image as a BLOB

        # Read the image file
        # fp = open("test.jpg",'rb',encoding='utf-8')
        fp = open("G:/词云图片.jpg", 'rb')
        img = fp.read()
        fp.close()
        db = pymysql.connect(host="localhost", user="root", password="liutaiqing", database="testdb", charset='utf8')
        # Get a cursor
        cursor = db.cursor()
        # Pass the image bytes as a query parameter so they are escaped as binary data
        # cursor.execute("INSERT INTO demo_pic_repo SET touxiang_data= %s" % pymysql.Binary(img))
        sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
        cursor.execute(sql, (img,))
        # Commit, otherwise the insert is not persisted
        db.commit()
        # Close the cursor
        cursor.close()
        # Close the database connection
        db.close()
    
    
    
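    The listing defines function1 through function6 but never calls them. A minimal driver sketch, assuming the steps are meant to be run interactively in order (the menu itself is my own addition, not part of the original program), could look like this:

    # A sketch of a simple menu driver (assumed, not part of the original listing)
    if __name__ == '__main__':
        steps = {
            '1': ('爬取豆瓣评论', function1),
            '2': ('评论文本转Excel', function2),
            '3': ('贝叶斯情感分类', function3),
            '4': ('词频直方图并入库', function4),
            '5': ('差评词云并入库', function5),
            '6': ('好评词云并入库', function6),
        }
        while True:
            print('请选择功能 (1-6), 输入 0 退出:')
            choice = input().strip()
            if choice == '0':
                break
            if choice in steps:
                name, func = steps[choice]
                print(name)
                func()
            else:
                print('无效输入,请重新选择')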

    This is a course project I wrote in my second year of university; going from zero Python knowledge to a finished project took about a month. If you have questions, please leave a comment, and if you found this helpful please like, favourite and share. Thanks!
