#本程序简单地使用分词、然后去停用词,对评价中带有正向和负向的词进行统计,以及对程度副词进行分析,得出商品的好评率
import random
import re
import time
from collections import defaultdict

try:
    import chardet  # imported by the original script; nothing below uses it
except ImportError:  # optional, so a missing package must not block the helpers
    chardet = None

# jieba and cx_Oracle are imported inside the functions that need them, so the
# pure scoring helpers stay importable on machines without those packages.

# NOTE(review): credentials are hard-coded in the connect string; consider
# reading them from the environment or a config file instead.
DB_CONNECT_STRING = 'C##CHINA_GOOD/bishe@127.0.0.1:1521/ORCL'

# Weight applied to degree adverbs, indexed by the 1-based number of the
# section of DegreeList.txt they appear in (index 0 is unused).
DEGREE_WEIGHTS = [0, 2, 1.25, 1.2, 0.8, 0.5, 1.5]

# A section header in DegreeList.txt: a closing curly quote followed,
# somewhere later on the line, by a number.
_DEGREE_HEADER = re.compile(r'”.*?(\d+)')


def words(senti_path='BosonNLP_sentiment_score.txt',
          not_path='NoList.txt',
          degree_path='DegreeList.txt'):
    """Load the three lexicons used for scoring.

    Args:
        senti_path: file with one ``word score`` pair per line.
        not_path: file with one negation word per line.
        degree_path: file with degree adverbs grouped under header lines
            (lines matching ``”...<digits>``); the n-th header starts the
            group weighted ``DEGREE_WEIGHTS[n]``.

    Returns:
        ``(SentiDict, NotList, DegreeDict)`` where ``SentiDict`` maps
        word -> score (kept as a string), ``NotList`` is a list of negation
        words and ``DegreeDict`` maps adverb -> numeric weight.

    Raises:
        OSError: if a lexicon file cannot be opened.
        IndexError: if the degree file has more headers than
            ``DEGREE_WEIGHTS`` has weights.

    The size of each lexicon is printed after it is loaded.
    """
    senti_dict = {}
    with open(senti_path, 'r', encoding='utf-8') as f:
        for raw in f:
            fields = raw.split()
            if len(fields) < 2:
                continue  # blank or malformed line
            try:
                float(fields[1])
            except ValueError:
                continue  # score is not numeric; it would break scoring later
            senti_dict[fields[0]] = fields[1]
    print(len(senti_dict))

    # strip() (not strip(' ')) so the trailing newline is removed; otherwise
    # no segmented token could ever equal an entry.
    with open(not_path, 'r', encoding='utf-8') as f:
        not_list = [entry for entry in (raw.strip() for raw in f) if entry]
    print(len(not_list))

    degree_dict = {}
    section = 0
    with open(degree_path, 'r', encoding='utf-8') as f:
        for raw in f:
            entry = raw.strip()
            if not entry:
                continue
            if _DEGREE_HEADER.search(entry):
                section += 1
                continue
            if section > 0:  # entries before the first header are ignored
                degree_dict[entry] = DEGREE_WEIGHTS[section]
    # NOTE: the original author observed four fewer entries than expected
    # here; presumably adverbs listed in more than one section collapse
    # into a single key -- verify against the data file.
    print(len(degree_dict))

    return senti_dict, not_list, degree_dict


def classifywords(wordDict, SentiDict, NotList, DegreeDict):
    """Split the words of one comment into sentiment/negation/degree maps.

    Args:
        wordDict: maps word -> position of that word in the comment.
        SentiDict: maps word -> sentiment score.
        NotList: iterable of negation words.
        DegreeDict: maps degree adverb -> weight.

    Returns:
        ``(SentiWords, NotWords, DegreeWords)``, each keyed by position.
        ``NotWords`` values are always ``-1``.  A word is classified as a
        degree adverb first, then as a negation, then as a sentiment word.
    """
    negations = set(NotList)  # O(1) membership instead of scanning a list
    senti_words = {}
    not_words = {}
    degree_words = {}
    for word, position in wordDict.items():
        if word in DegreeDict:
            degree_words[position] = DegreeDict[word]
        elif word in negations:
            not_words[position] = -1
        elif word in SentiDict:
            senti_words[position] = SentiDict[word]
    return senti_words, not_words, degree_words


def scoreSent(senWord, notWord, degreeWord, segResult):
    """Compute the sentiment score of one segmented comment.

    Args:
        senWord: maps position -> sentiment score (number or numeric string).
        notWord: maps position -> -1 for negation words.
        degreeWord: maps position -> weight for degree adverbs.
        segResult: the token list; only its length is used, to bound the
            positions that are considered.

    Returns:
        The weighted sum of sentiment scores (``0`` when the comment has no
        sentiment word).

    Sentiment words are visited in positional order.  Each one is added with
    the current weight; negations and degree adverbs found between it and
    the next sentiment word then update the weight used from there on.

    NOTE(review): as in the original, modifiers located before the first
    sentiment word are never applied and the weight is never reset between
    sentiment words -- confirm whether that is intended.
    """
    token_count = len(segResult)
    # Sorted explicitly: dict order is insertion order, which is not
    # positional when the same word occurs more than once in a comment.
    positions = sorted(p for p in senWord if 0 <= p < token_count)

    weight = 1
    score = 0
    for idx, pos in enumerate(positions):
        score += weight * float(senWord[pos])
        if idx + 1 < len(positions):
            for j in range(pos, positions[idx + 1]):
                if j in notWord:
                    weight *= -1
                elif j in degreeWord:
                    weight *= float(degreeWord[j])
    return score


def fetch_data(sql, params=None):
    """Run a query against the Oracle database and return all rows.

    Args:
        sql: the SELECT statement, optionally containing bind placeholders.
        params: optional bind values for the placeholders in ``sql``.

    Returns:
        The list of rows returned by ``cursor.fetchall()``.
    """
    import cx_Oracle

    conn = cx_Oracle.connect(DB_CONNECT_STRING)
    try:
        cursor = conn.cursor()
        if params is None:
            cursor.execute(sql)
        else:
            cursor.execute(sql, params)
        return cursor.fetchall()
    finally:
        conn.close()  # the original never released the connection


def _execute_update(sql, params):
    """Run one data-modifying statement with bind values and commit it."""
    import cx_Oracle

    conn = cx_Oracle.connect(DB_CONNECT_STRING)
    try:
        conn.cursor().execute(sql, params)
        conn.commit()
    finally:
        conn.close()


def _load_stopwords(path='stopwords.txt'):
    """Return the stop words in *path* (one per line) as a set."""
    with open(path, 'r', encoding='utf-8') as f:
        return {entry for entry in (raw.strip() for raw in f) if entry}


def _score_comment(text, cut, stopwords, senti_dict, not_list, degree_dict):
    """Segment *text* with *cut*, drop stop words and return its score."""
    tokens = [tok for tok in cut(text) if tok not in stopwords]
    # NOTE: a repeated word keeps only its last position, as in the original.
    positions = {tok: idx for idx, tok in enumerate(tokens)}
    senti, negs, degrees = classifywords(
        positions, senti_dict, not_list, degree_dict)
    return scoreSent(senti, negs, degrees, tokens)


def main():
    """Compute and store the positive-comment rate of every product.

    For each GOOD_ID in ``good_comment`` every comment is scored, the share
    of comments with a positive score is written to ``goodd.my_good_rate``
    and the counts are printed.

    Returns:
        The total number of comments scored.
    """
    import jieba

    senti_dict, not_list, degree_dict = words()
    stopwords = _load_stopwords()  # loaded once instead of once per comment
    comments_sum = 0

    goods = fetch_data('select GOOD_ID from good_comment GROUP BY GOOD_ID')
    for data in goods:
        rows = fetch_data(
            'select * from good_comment where good_id = :1', [data[0]])
        try:
            # assumes column 2 holds the comment text as str and column 1
            # the product id -- TODO confirm against the table definition
            scores = [
                _score_comment(row[2], jieba.cut, stopwords,
                               senti_dict, not_list, degree_dict)
                for row in rows
                if row[2] is not None
            ]
            if not scores:
                print('不存在!')
                continue

            good = sum(1 for s in scores if s > 0)
            bad = sum(1 for s in scores if s < 0)
            normal = len(scores) - good - bad
            print('good_comments:', good, 'normal_comments:', normal,
                  'bad_comments:', bad, 'Total_comments:', good + normal + bad)

            target_id = rows[-1][1]
            print(target_id)
            good_rate = '%.2f' % (good / len(scores) * 100)
            print(good_rate)
            comments_sum += len(scores)

            update_sql = 'update goodd set my_good_rate = :1 where id = :2'
            update_params = [good_rate, target_id]
            print(update_sql, update_params)
            _execute_update(update_sql, update_params)
        except Exception as exc:
            # Best effort per product, as before, but no longer silent about
            # the cause and no longer swallowing KeyboardInterrupt.
            print('不存在!', exc)

    return comments_sum


if __name__ == '__main__':
    main()
# 代码中所用文件:百度网盘 https://pan.baidu.com/s/4lbwlAlT