• Python Scrapy Douban crawler and word cloud


    This was just something I put together for fun. It crawls the Douban comments for the film "A Street Cat Named Bob". The page claims over 10,000 comments, but only around 500 are actually retrievable; I suspect the larger number also counts reposts and quotes.

    I used the Python Scrapy framework; the installation steps are skipped here.

    import time

    import scrapy
    from scrapy.selector import Selector
    from ..items import DoubanItem

    # Request headers that mimic a normal browser (the Request below also sets its own headers)
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}


    class QuotesSpider(scrapy.Spider):
        name = "quotes"
        allowed_domains = ['movie.douban.com']
        start_urls = ['https://movie.douban.com/subject/26685451/comments?start=500&limit=20&status=P&sort=new_score']
        number = 0

        def parse(self, response):
            for v in response.xpath(
                    '//div[@class="comment-item "]/div[@class="comment"]'):
                item = DoubanItem()  # create a fresh item for every comment
                item['name'] = v.xpath('h3/span[@class="comment-info"]/a/text()').get()
                item['time'] = v.xpath(
                    'h3/span[@class="comment-info"]/span[@class="comment-time "]/text()').get().strip()
                item['evaluate'] = v.xpath('p[@class=" comment-content"]/span[@class="short"]/text()').get()
                item['star'] = v.css("span").xpath('@title').get()
                yield item
            # Follow the "next page" link as long as one exists
            next_link_end = response.xpath("//div[@class='center']/a[@class='next']/@href").get()
            next_link = response.xpath("//div[@class='center']/a[@class='next']/text()").get()
            if next_link == '后页 >':
                time.sleep(1)
                self.number = self.number + 20
                next_url = 'https://movie.douban.com/subject/26685451/comments' + next_link_end
                yield scrapy.Request(url=next_url, callback=self.parse, headers={
                    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                    "Referer": "https://movie.douban.com/subject/26685451/comments?start={}&limit=20&status=P&sort=new_score".format(
                        self.number),
                    "Cookie": 'your cookie'})  # Douban limits anonymous access: without logging in you can only crawl about 200 comments

    The items.py file:
    import scrapy


    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        time = scrapy.Field()
        star = scrapy.Field()
        evaluate = scrapy.Field()


     Run the crawl with scrapy crawl quotes -o list.csv, which saves the results directly to a list.csv file.


    Afterwards I loaded the scraped data into a MySQL database and read it back from there for the word cloud.
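
    A rough sketch of that import step with pandas and pymysql (the table schema, column types and connection details here are assumptions, not the code actually used):

    #! /usr/bin/env python
    # -*- coding:utf-8 -*-
    # Sketch: load the scraped csv into MySQL so the word cloud script below can read it
    import pymysql
    import pandas as pd

    df = pd.read_csv('list.csv').dropna(subset=['evaluate'])  # drop rows without comment text

    conn = pymysql.connect(host='*', user='root', passwd='*', db='demo', port=3306, charset='utf8')
    cur = conn.cursor()
    # Assumed schema matching the four item fields
    cur.execute("""
        CREATE TABLE IF NOT EXISTS douban (
            `name` VARCHAR(255),
            `time` VARCHAR(64),
            `star` VARCHAR(64),
            `evaluate` TEXT
        ) DEFAULT CHARSET = utf8mb4
    """)
    for _, row in df.iterrows():
        cur.execute(
            "INSERT INTO douban (`name`, `time`, `star`, `evaluate`) VALUES (%s, %s, %s, %s)",
            (row['name'], row['time'], row['star'], row['evaluate']))
    conn.commit()
    cur.close()
    conn.close()

    The script below then reads everything back from that table: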
    #! /usr/bin/env python
    # -*- coding:utf-8 -*-
    import pymysql, jieba, re
    import pandas as pd
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    conn = pymysql.connect(host='*', user='root', passwd="*", db='demo', port=3306, charset='utf8')
    cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
    sql = "select * from douban"
    cur.execute(sql)
    rows = cur.fetchall()  # fetch the rows before closing the connection
    cur.close()
    conn.close()

    # Concatenate every comment into a single string
    allComment = ''
    for v in rows:
        allComment = allComment + v['evaluate'].strip(" ")

    # Keep only Chinese characters, which also strips punctuation, digits and Latin letters
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, allComment)
    cleaned_comments = ''.join(filterdata)

    # Tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    comment = pd.DataFrame({'segment': segment})

    # Drop stopwords; download chineseStopWords.txt from the internet yourself
    stopwords = pd.read_csv("./chineseStopWords.txt", index_col=False, quoting=3, sep=" ",
                            names=['stopword'], encoding='GBK')

    comment = comment[~comment.segment.isin(stopwords.stopword)]

    # Count word frequencies
    comment_fre = comment.groupby(by='segment').agg(
        计数=pd.NamedAgg(column='segment', aggfunc='size')).reset_index().sort_values(
        by='计数', ascending=False)

    # Render the word cloud from the 1000 most frequent words
    wordcloud = WordCloud(
        font_path="your-font-path/simhei.ttf",  # a font that supports Chinese, e.g. SimHei
        background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in comment_fre.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
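
    To also keep the picture as a file rather than only displaying it, two extra lines (the output filename is just an example) can go right before plt.show():

    plt.axis("off")  # hide the matplotlib axes around the word cloud
    wordcloud.to_file("douban_wordcloud.png")  # write the rendered image to disk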


    The final result:
