Sentiment classification of users' movie reviews
First, input a Douban URL and scrape the movie comments
Then save the scraped comments to an Excel spreadsheet
Clean the data in the Excel spreadsheet
Store the data in a MySQL database
Segment the text and count word frequencies
Run the naive Bayes algorithm to classify sentiment
Print the positive and negative reviews
Store the results in the database
Done!
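The "store the data in MySQL" step is not implemented in the listing that follows, although the listing's otherwise-unused `sqlalchemy` import suggests pandas + SQLAlchemy was the intended route. Below is a minimal sketch of that step, assuming a `comments` table with `comment` and `Fraction` columns (matching what the classifier reads later) and placeholder credentials:

```python
# Hypothetical sketch of the MySQL step. The table name 'comments', the
# Fraction labelling convention (1 = positive, 0 = negative) and the
# credentials are assumptions, not the original project's actual schema.
import pandas as pd
from sqlalchemy import create_engine

def save_comments_to_mysql(path, fraction):
    # one comment per line; dropping blanks and duplicates is the basic cleaning
    lines = [line.strip() for line in open(path, encoding='utf-8') if line.strip()]
    df = pd.DataFrame({'comment': lines, 'Fraction': fraction}).drop_duplicates()
    engine = create_engine('mysql+pymysql://root:password@localhost/testdb?charset=utf8')
    df.to_sql('comments', engine, if_exists='append', index=False)

# save_comments_to_mysql('G:/comments/好评.txt', 1)   # positive reviews
# save_comments_to_mysql('G:/comments/差评.txt', 0)   # negative reviews
```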
The full program:

```python
# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # plotting
import pandas as pd
from numpy import array
import jieba
import xlwt
from pylab import mpl
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine  # used when writing DataFrames to MySQL (see the sketch above)


def getHtml(url):
    """Fetch the page at url."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                             ' (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    req = urllib.request.urlopen(req)
    content = req.read().decode('utf-8')
    return content


def getComment(url):
    """Parse one HTML page and return the comments on it."""
    html = getHtml(url)
    soupComment = BeautifulSoup(html, 'html.parser')
    comments = soupComment.find_all('span', 'short')
    onePageComments = []
    for comment in comments:
        onePageComments.append(comment.getText() + '\n')
    return onePageComments


def save_image_to_db(path):
    """Store an image file in MySQL as a BLOB (factored out of the three plotting functions)."""
    with open(path, 'rb') as fp:
        img = fp.read()
    db = pymysql.connect(host='localhost', user='root', password='liutaiqing',
                         database='testdb', charset='utf8')
    cursor = db.cursor()  # get a cursor with cursor()
    # let the driver escape the binary data via the %s placeholder
    sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES (%s)"
    cursor.execute(sql, img)
    db.commit()  # commit, otherwise the insert is not persisted
    cursor.close()
    db.close()


# Scrape Douban movie comments
def function1():
    print("Enter the URL of the movie whose comments you want to scrape (Douban)")
    url1 = str(input())
    ur = url1[33:41]  # slice the subject id out of a URL like .../subject/26752088/
    # write into G:/comments/ so the later steps find the files (the original mixed two paths)
    f = open('G:/comments/好评.txt', 'w', encoding='utf-8')
    for page in range(200):  # scraping this many pages on Douban requires a logged-in session
        # percent_type=h requests positive comments; start advances 20 per page (limit=20)
        url = ('https://movie.douban.com/subject/' + ur + '/comments?start='
               + str(20 * page) + '&limit=20&sort=new_score&status=P&percent_type=h')
        print('Comments on page %s:' % (page + 1))
        print(url + '\n')
        for i in getComment(url):
            f.write(i)
            print(i)
        print('\n')
    f2 = open('G:/comments/差评.txt', 'w', encoding='utf-8')
    for page2 in range(200):
        # percent_type=l requests negative comments; reuse the subject id instead of hard-coding one
        url = ('https://movie.douban.com/subject/' + ur + '/comments?start='
               + str(20 * page2) + '&limit=20&sort=new_score&status=P&percent_type=l')
        print('Comments on page %s:' % (page2 + 1))
        print(url + '\n')
        for i in getComment(url):
            f2.write(i)
            print(i)
        print('\n')


# Text files to Excel
def function2():
    file = open('G:/comments/好评.txt', 'r', encoding='UTF-8')
    a1 = file.readlines()
    workbook1 = xlwt.Workbook(encoding='UTF-8')
    worksheet1 = workbook1.add_sheet('ltq')
    for i in range(len(a1)):
        worksheet1.write(i, 0, a1[i])
        print(a1[i])
    workbook1.save('G:/comments/好评.xls')

    file = open('G:/comments/差评.txt', 'r', encoding='UTF-8')
    a2 = file.readlines()
    workbook2 = xlwt.Workbook(encoding='UTF-8')
    worksheet2 = workbook2.add_sheet('ltq')
    for i in range(len(a2)):
        worksheet2.write(i, 0, a2[i])
        print(a2[i])
    workbook2.save('G:/comments/差评.xls')


# Classify the comments with naive Bayes
def function3():
    # Step 1: read the data and segment the text
    data = pd.read_csv('G:/comments/comments.csv')
    print("Reading the comment column")
    col = data.iloc[:, 1]  # all values in column 1 of the table
    arrs = col.values
    # load the stop-word list
    stopwords = [line.strip() for line in
                 open('G:/comments/中文停用词表.txt', encoding='utf-8').readlines()]
    corpus = []
    for a in arrs:
        seglist = jieba.cut(a, cut_all=False)  # accurate mode
        final = ''
        for seg in seglist:
            if seg not in stopwords:  # keep only words that are not stop words
                final += seg
        seg_list = jieba.cut(final, cut_all=False)
        output = ' '.join(list(seg_list))  # join with spaces
        corpus.append(output)

    # Step 2: compute term frequencies
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()  # turn the corpus into a term-frequency matrix
    X = vectorizer.fit_transform(corpus)  # count each word's occurrences
    # all keywords in the vocabulary (get_feature_names() on scikit-learn < 1.0)
    word = vectorizer.get_feature_names_out()
    for w in word:  # inspect the vocabulary
        print(w)
    print('')
    print(X.toarray())

    # Step 3: train and predict
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import classification_report
    # train on the first 4000 rows, predict on the rest
    print(u"\n\nAnalysis:")
    X = X.toarray()
    x_train = X[:4000]
    x_test = X[4000:]
    # 1 means positive, 0 means negative
    y_all = data['Fraction'].tolist()
    y_train = array(y_all[:4000])
    print(y_train)
    y_test = array(y_all[4000:])  # true labels of the held-out rows
    clf = MultinomialNB().fit(x_train, y_train)
    pre = clf.predict(x_test)
    print("1 means positive, 0 means negative")
    print("Predictions:" + '_' * 80)
    com_list2 = data['comment'].tolist()[4000:]
    for j, comment in enumerate(com_list2):
        print(comment, " :", pre[j])

    # print the positive comments
    print("Print all positive comments? 1: yes 0: no")
    if int(input()) == 1:
        print("All positive comments" + '_' * 80)
        for j, comment in enumerate(com_list2):
            if pre[j] == 1:
                print(comment)
    # print the negative comments
    print("Print all negative comments? 1: yes 0: no")
    if int(input()) == 1:
        print("All negative comments" + '-' * 80)
        for j, comment in enumerate(com_list2):
            if pre[j] == 0:
                print(comment)
    print(classification_report(y_test, pre))


# Histogram of the most frequent words in the negative comments
def function4():
    mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font, so Chinese labels render
    mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box in saved images
    plt.rcParams['font.sans-serif'] = ['SimHei']
    txt = open('G:/comments/差评.txt', encoding='utf-8').read()
    # load the stop-word list
    stopwords = [line.strip() for line in
                 open('G:/comments/中文停用词表.txt', encoding='utf-8').readlines()]
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        if word not in stopwords:  # skip stop words
            if len(word) == 1:  # skip single-character words
                continue
            counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    for i in range(min(50, len(items))):  # guard against short corpora
        word, count = items[i]
        print("{:<10}{:>7}".format(word, count))
    label = [x[0] for x in items[:10]]
    value = [y[1] for y in items[:10]]
    plt.bar(range(len(value)), value, tick_label=label)
    plt.savefig('G:/filename.png')
    # plt.show()
    save_image_to_db('G:/filename.png')


# Word cloud of the negative comments
def function5():
    path_txt2 = 'G:/comments/差评.txt'
    f2 = open(path_txt2, 'r', encoding='UTF-8').read()
    # segment with jieba first; wordcloud cannot build a correct Chinese cloud from raw text
    cut_text2 = ' '.join(jieba.cut(f2))
    wordcloud2 = WordCloud(
        # set a font, otherwise Chinese characters render as boxes; any Chinese font path works
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white', width=1100, height=1000).generate(cut_text2)
    plt.imshow(wordcloud2, interpolation='bilinear')
    plt.axis('off')
    # plt.show()
    wordcloud2.to_file('G:/词云图片.jpg')
    save_image_to_db('G:/词云图片.jpg')


# Word cloud of the positive comments
def function6():
    path_txt1 = 'G:/comments/好评.txt'
    f1 = open(path_txt1, 'r', encoding='UTF-8').read()
    cut_text1 = ' '.join(jieba.cut(f1))
    wordcloud1 = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white', width=1100, height=1000).generate(cut_text1)
    plt.imshow(wordcloud1, interpolation='bilinear')
    plt.axis('off')
    # plt.show()
    wordcloud1.to_file('G:/词云图片.jpg')
    save_image_to_db('G:/词云图片.jpg')
```
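The listing defines `function1` through `function6` but its entry point is not shown. A minimal, hypothetical console menu that could drive them, assuming the functions above are defined in the same module (the prompt text and numbering are my assumptions):

```python
# Hypothetical driver; the original entry point is not shown above, so this
# menu is only an assumption about how the six functions were invoked.
if __name__ == '__main__':
    actions = {
        '1': function1,  # scrape Douban comments
        '2': function2,  # text files to Excel
        '3': function3,  # naive Bayes classification
        '4': function4,  # negative-comment histogram
        '5': function5,  # negative-comment word cloud
        '6': function6,  # positive-comment word cloud
    }
    while True:
        choice = input('Choose a step (1-6, q to quit): ').strip()
        if choice == 'q':
            break
        if choice in actions:
            actions[choice]()
```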
This is a project I wrote in my second year of university; going from zero Python experience to the finished project took about a month. If you have questions, please leave a comment, and if you found this helpful, please like, favorite, and share. Thanks!!!