对商品评价进行好评率分析

#本程序简单地使用分词、然后去停用词，对评价中带有正向和负向的词进行统计，以及对程度副词进行分析，得出商品的好评率
import re
import time
import random
import jieba
from collections import defaultdict
import chardet


def words():
    """Load the three lexicons used by the sentiment scorer.

    Three UTF-8 text files are read from the current working directory:

    * ``BosonNLP_sentiment_score.txt`` -- one ``word score`` pair per line,
      separated by a single space. Lines without a space are skipped.
    * ``NoList.txt`` -- one negation word per line.
    * ``DegreeList.txt`` -- degree adverbs, one per line, grouped under
      header lines. A header is any line containing a right double
      quotation mark followed later by digits; the n-th header selects the
      n-th weight of the table below for the words that follow it.

    The size of each lexicon is printed after it is loaded.

    Returns:
        tuple: ``(SentiDict, NotList2, DegreeDict)`` where ``SentiDict`` maps
        a word to its score (kept as the string read from the file),
        ``NotList2`` is the list of negation words and ``DegreeDict`` maps a
        degree adverb to its numeric weight.

    Raises:
        OSError: if one of the lexicon files cannot be opened.
        IndexError: if ``DegreeList.txt`` has more header lines than there
            are weights in the table.
    """
    # Weight for the words listed under the n-th header of DegreeList.txt.
    # Index 0 is a placeholder: words before the first header are ignored.
    degree_weights = [0, 2, 1.25, 1.2, 0.8, 0.5, 1.5]
    # Raw string so that ``\d`` reaches the regex engine as "digit".
    header_pattern = re.compile(r'”.*?(\d+)')

    SentiDict = {}
    with open('BosonNLP_sentiment_score.txt', 'r', encoding='utf-8') as f:
        for raw in f:
            fields = raw.strip('\n').split(' ')
            if len(fields) < 2:
                # Malformed line (no "word score" pair): skip it explicitly
                # instead of swallowing an IndexError.
                continue
            SentiDict[fields[0]] = fields[1]
    print(len(SentiDict))

    with open('NoList.txt', 'r', encoding='utf-8') as f:
        NotList2 = [raw.strip('\n') for raw in f]
    print(len(NotList2))

    DegreeDict = {}
    section = 0
    with open('DegreeList.txt', 'r', encoding='utf-8') as f:
        for raw in f:
            entry = raw.strip('\n')
            if header_pattern.search(entry):
                # Header line: the following words belong to the next weight.
                section += 1
                continue
            if section > 0:
                DegreeDict[entry] = degree_weights[section]
    # Original author's note: this count came out four lower than expected
    # (presumably repeated words collapsing into one key -- unverified).
    print(len(DegreeDict))
    return SentiDict, NotList2, DegreeDict


def classifywords(wordDict, SentiDict, NotList, DegreeDict):
    """Sort the words of one sentence into sentiment, negation and degree groups.

    Args:
        wordDict: mapping of word -> position in the sentence.
        SentiDict: mapping of sentiment word -> score.
        NotList: collection of negation words.
        DegreeDict: mapping of degree adverb -> weight.

    Returns:
        tuple: three mappings keyed by position --
        ``(position -> sentiment score, position -> -1, position -> weight)``.
        A word is classified at most once: degree adverbs take precedence
        over negation words, which take precedence over sentiment words.
        Words found in none of the lexicons are dropped.
    """
    sentiment_hits = defaultdict()
    negation_hits = defaultdict()
    degree_hits = defaultdict()
    for token, position in wordDict.items():
        if token in DegreeDict:
            degree_hits[position] = DegreeDict[token]
        elif token in NotList:
            negation_hits[position] = -1
        elif token in SentiDict:
            sentiment_hits[position] = SentiDict[token]
    return sentiment_hits, negation_hits, degree_hits


def scoreSent(senWord, notWord, degreeWord, segResult):
    """Compute the sentiment score of one segmented sentence.

    Args:
        senWord: mapping of position -> sentiment score (anything ``float``
            accepts).
        notWord: mapping whose keys are the positions of negation words.
        degreeWord: mapping of position -> degree weight.
        segResult: the token list of the sentence; only its length is used.

    Returns:
        The accumulated score: each sentiment word contributes its score
        multiplied by the running weight. ``0`` when the sentence has no
        sentiment word.
    """
    # Positions of the sentiment words, in the mapping's own iteration order.
    sentiment_positions = list(senWord)
    last_slot = len(sentiment_positions) - 1

    weight = 1
    total = 0
    seen = -1  # how many sentiment words have been met so far, minus one
    for idx in range(len(segResult)):
        if idx not in senWord:
            continue
        seen += 1
        total += weight * float(senWord[idx])
        if seen >= last_slot:
            continue
        # Modifiers located between this sentiment word and the next one
        # adjust the running weight; the weight is never reset afterwards.
        window = range(sentiment_positions[seen], sentiment_positions[seen + 1])
        for pos in window:
            if pos in notWord:
                weight *= -1
            elif pos in degreeWord:
                weight *= float(degreeWord[pos])
    return total



import cx_Oracle
def fetch_data(sql, params=None):
    """Run a query against the Oracle database and return all of its rows.

    A new connection is opened for every call and is always closed before
    returning, including when the query raises.

    Args:
        sql: the SQL statement to execute.
        params: optional bind values (sequence or mapping) passed to
            ``cursor.execute`` together with ``sql``. When omitted the
            statement is executed as-is.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    conn = cx_Oracle.connect('C##CHINA_GOOD/bishe@127.0.0.1:1521/ORCL')
    try:
        cursor = conn.cursor()
        try:
            if params is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, params)
            # NOTE(review): rows are materialised before the connection is
            # closed; if a column were a LOB its locator would presumably no
            # longer be readable afterwards -- confirm the column types.
            return cursor.fetchall()  # fetch every row
        finally:
            cursor.close()
    finally:
        conn.close()


# Script body: score every comment of every product and write the share of
# positive comments back to the database.

good_rates = {}  # never filled in by this script; kept so the name still exists
words_value = words()
comments_sum = 0

# Load the stop-word list once, as a set, instead of re-reading the file and
# scanning a list for every single comment. A missing file now fails here,
# at start-up, rather than being reported once per product.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {entry.rstrip('\n') for entry in f}


def _score_comment(line):
    """Return the sentiment score of one comment text."""
    newSent = [w for w in jieba.cut(line) if w not in stopwords]
    # NOTE(review): a word that occurs several times keeps only its last
    # position here, because the mapping is keyed by the word itself.
    datafen_dist = {word: x for x, word in enumerate(newSent)}
    data_1 = classifywords(datafen_dist, words_value[0], words_value[1], words_value[2])
    return scoreSent(data_1[0], data_1[1], data_1[2], newSent)


sql1 = 'select GOOD_ID from good_comment GROUP BY GOOD_ID'
result = fetch_data(sql1)
for data in result:
    sql2 = "select * from good_comment where good_id='%s'" % (data[0])
    zi_data = fetch_data(sql2)

    if not zi_data:
        # No comment rows: there is no rate to compute for this product.
        print('不存在！')
        continue

    try:
        # The third column of each row is used as the comment text.
        score_var = [_score_comment(mi_data[2]) for mi_data in zi_data]

        good = sum(1 for score in score_var if score > 0)
        bad = sum(1 for score in score_var if score < 0)
        normal = len(score_var) - good - bad
        print('good_comments:', good, 'normal_comments:', normal, 'bad_comments:', bad, 'Total_comments:',
              good + normal + bad)
        good_comments_rate = good / (good + normal + bad)

        # The second column of the last fetched row identifies the product
        # to update (presumably the same value as data[0] -- TODO confirm).
        target_id = zi_data[-1][1]
        print(target_id)
        good_rate = '%.2f' % (good_comments_rate * 100)
        print(good_rate)
        comments_sum = comments_sum + good + normal + bad

        # Bind variables instead of string interpolation, so that values
        # containing quotes cannot break or alter the statement. Both values
        # are bound as strings, matching the quoted literals used before.
        sql3 = "update goodd set my_good_rate=:good_rate where id=:good_id"
        sql3_params = {'good_rate': good_rate, 'good_id': str(target_id)}
        print(sql3, sql3_params)
        conn = cx_Oracle.connect('C##CHINA_GOOD/bishe@127.0.0.1:1521/ORCL')
        try:
            cursor = conn.cursor()
            cursor.execute(sql3, sql3_params)
            conn.commit()
        finally:
            # Always release the connection, even when the update fails.
            conn.close()
    except Exception as exc:
        # Best effort: report the failure and carry on with the next product.
        print('不存在！', exc)
代码中所用文件：百度网盘 https://pan.baidu.com/s/4lbwlAlT
相关阅读:
CodeForce VKcup A
CNN卷积神经网络
 神经网络
 我的机器学习之路
 [OPENCV] 第一个程序识别颜色
 Android 登录界面与首页的设计
 go web的基本原理
 Go语言标准库之http/template
吞吐量（TPS）、QPS、并发数、响应时间（RT）概念
 数据库恢复技术
原文地址：https://www.cnblogs.com/lovema1210/p/12510809.html