• Trying out Scrapy on the Weibo hot-search list (first draft)


      First, the item fields I want:

    import scrapy


    class WeiboItem(scrapy.Item):

        rank = scrapy.Field()          # hot-search rank
        title = scrapy.Field()         # hot-search title
        hot_totle = scrapy.Field()     # heat value shown next to the title
        tag_pic = scrapy.Field()       # tag icon text of the entry
        watch = scrapy.Field()         # read count on the search page
        talk = scrapy.Field()          # discussion count on the search page
        weibo_detail = scrapy.Field()  # post texts collected from the search page
        bozhu = scrapy.Field()         # post authors
        biaoqian = scrapy.Field()      # label of the pinned/recommended card
        time = scrapy.Field()          # time the record was crawled

    The main spider. Remember the Referer!!! The search detail page checks where the request came from.

    import scrapy
    import logging
    import json
    import datetime

    from weibo.items import WeiboItem
    # from weibo.settings import MYSQL_HOST

    logger = logging.getLogger(__name__)

    """
    class=icon-top marks a pinned entry. To tell whether it was bought, look at the
    icon-txt that follows: icon-txt-recommend means a promoted ("recommended") entry.
    """
    class WeiboResouSpider(scrapy.Spider):
        name = 'weibo-resou'                    # spider name
        allowed_domains = ['weibo.com']         # domains the spider may crawl
        start_urls = ['http://s.weibo.com/top/summary/']                       # hot-search ranking page
        detail_url = 'https://s.weibo.com/weibo?q=%23{}%23&Refer=top'          # search (detail) page
        tag_url = 'https://s.weibo.com/top/summary/summary?cate=socialevent'   # tag page

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
            "Referer": "https://s.weibo.com/"
        }

        # placeholders -- fill in the cookies of your own logged-in session
        Cookie = {
            'SCF': 'XXXX',
            'SUB': 'XXXX',
            'SUBP': 'XXXX',
            'SUHB': 'XXXX',
            'ALF': 'XXXX',
            'SSOLoginState': 'XXXXX'}

        def parse(self, response):
            # .extract() returns a list of strings; .extract_first() returns the first match
            # a yield here may produce a Request, a BaseItem, a dict or None
            # yielding lets the pipeline process whatever the generator produces
            # with yield, one item/request is emitted on every loop iteration
            # use the log to record errors (mainly to locate problems)
            # note: if image links lack the site's domain, join it onto the extracted
            # img src afterwards (list comprehension)
            # for recursive crawling, yield scrapy.Request back into this parse with an updated url
            # self.settings['MYSQL_HOST']
            # self.settings.get('MYSQL_HOST')
            logger.warning('this is spider warning')
            tr_list = response.xpath('//*[@id="pl_top_realtimehot"]/table//tr')
            for tr in tr_list[1:]:
                item = WeiboItem()
                # item = {}
                # rank
                if tr.xpath('./td[1]/text()').extract_first() is None:
                    item['rank'] = '置顶向'
                else:
                    item['rank'] = tr.xpath('./td[1]/text()').extract_first()
                # hot-search title
                if tr.xpath('./td[2]/a//text()').extract_first() is None:
                    item['title'] = '找不到热搜哦'
                else:
                    item['title'] = tr.xpath('./td[2]/a//text()').extract_first()

                # does this entry carry a heat value?
                if tr.xpath('./td[2]/span/text()').extract_first() is None:
                    item['hot_totle'] = '找不到热度哟'
                else:
                    item['hot_totle'] = tr.xpath('./td[2]/span/text()').extract_first()
                # hot-search tag
                if not tr.xpath('./td[3]//i/text()').extract_first():
                    item['tag_pic'] = '没有热度标签'
                else:
                    item['tag_pic'] = tr.xpath('./td[3]//i/text()').extract_first()
                # print(item)
                title_search = item['title']
                # print(item)
                yield scrapy.Request(
                    url=self.detail_url.format(title_search),
                    meta={'item': item},
                    headers=self.headers,   # crucial! the Referer must say the visit comes from Weibo!
                    cookies=self.Cookie,
                    callback=self.search_detail
                )

        def search_detail(self, response):
            item = response.meta['item']

            if response.xpath('//div[@class="total"]/span[1]/text()').extract_first() is None:
                item['watch'] = '阅读量爆炸'
            else:
                item['watch'] = response.xpath('//div[@class="total"]/span[1]/text()').extract_first()
            # print(item)
            if response.xpath('//div[@class="total"]/span[2]/text()').extract_first() is None:
                item['talk'] = '讨论量爆炸'
            else:
                item['talk'] = response.xpath('//div[@class="total"]/span[2]/text()').extract_first()
            # print(item)
            # read the individual posts; the page mixes pinned, involved-party and hot cards
            page_row = response.xpath('//div[@class="content"]')
            # print(page_row)
            for detail in page_row:
                # note: these XPaths are absolute (they start with //), so they collect
                # texts from the whole result page rather than from the current card
                item['weibo_detail'] = detail.xpath('//p[@class="txt"]/text()').extract()
                item['bozhu'] = detail.xpath('//div[@class="content"]//a[@class="name"]/text()').extract()
                # print(item['bozhu'])
                # pinned/recommended label
                if detail.xpath('//div[@class="card-wrap"]//h4[@class="title"]/a/text()').extract_first() is None:
                    item['biaoqian'] = '普通微博哟'
                else:
                    item['biaoqian'] = detail.xpath('//div[@class="card-wrap"]//h4[@class="title"]/a/text()').extract_first()
                # print(item['biaoqian'])

                item['time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

            yield item
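
    If you would rather not pass headers= on every Request, the same User-Agent and Referer can also be set project-wide. A minimal sketch for settings.py using Scrapy's built-in USER_AGENT and DEFAULT_REQUEST_HEADERS settings (this is an addition, not part of the original project):

    # settings.py -- sketch: project-wide defaults instead of per-request headers
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')

    # every request from this project now carries the Weibo Referer by default
    DEFAULT_REQUEST_HEADERS = {
        'Referer': 'https://s.weibo.com/',
    }

    Per-request headers=, as used in the spider above, still overrides these defaults for the keys it sets.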

    The pipeline:

    This part is still rough: the post text is decorated by JS, so the posts cannot yet be split out one by one; to be improved.

    For now the data is saved as CSV for analysis; it could also go into a database.

    import csv


    class WeiboPipeline:
        # spider tells you which crawler the item came from; an if on spider.name lets you
        # route items separately, e.g. some to MongoDB and some to MySQL
        # the method name process_item must not be changed
        # the content passed in here can be cleaned, e.g.:
        #   content = [i.replace('<text to remove>', "") for i in content]
        #   content = [i.strip() for i in content]          # strip surrounding whitespace
        #   content = [i for i in content if len(i) > 0]    # drop empty strings
        # the same cleaning can also be done with a regex:
        #   content = [re.sub(r'\xa0|\s', "", i) for i in content]
        #   content = [i for i in content if len(i) > 0]
        def __init__(self):
            self.file = open('resou.csv', 'w', encoding='utf-8-sig', newline='')
            self.writer = csv.writer(self.file)
            self.head = ['热搜名称', '热度', '讨论', '阅读量', '记录时间', '热搜标签', '排名']
            self.writer.writerow(self.head)

        def process_item(self, item, spider):
            # spider.settings.get('MYSQL_HOST')
            item['weibo_detail'] = self.parse_content(item['weibo_detail'])
            print(item)
            self.writer.writerow([item['title'], item['hot_totle'], item['talk'], item['watch'],
                                  item['time'], item['tag_pic'], item['rank']])
            return item

        def parse_content(self, content):
            # remove zero-width spaces and newlines, strip whitespace, drop empty strings
            content = [i.replace('\u200b', "") for i in content]
            content = [i.replace('\n', "") for i in content]
            content = [i.strip() for i in content]
            content = [i for i in content if len(i) > 0]
            # print(content)
            return content

        def close_spider(self, spider):
            self.file.close()
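
    The comments above mention routing items to MySQL or MongoDB depending on the spider. A minimal sketch of such a MySQL pipeline, assuming pymysql is installed, a resou table already exists, and MYSQL_* keys are defined in settings.py (none of this is part of the original project):

    # pipelines.py -- hedged sketch of the MySQL variant hinted at in the comments
    import pymysql


    class WeiboMysqlPipeline:
        def open_spider(self, spider):
            # read connection info from the crawler settings, as the commented-out lines suggest
            s = spider.settings
            self.conn = pymysql.connect(
                host=s.get('MYSQL_HOST', 'localhost'),
                user=s.get('MYSQL_USER', 'root'),
                password=s.get('MYSQL_PASSWORD', ''),
                database=s.get('MYSQL_DB', 'weibo'),
                charset='utf8mb4',
            )
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            if spider.name != 'weibo-resou':    # route by spider, as the comment describes
                return item
            self.cursor.execute(
                'INSERT INTO resou (title, hot, talk, watch, crawl_time, tag, `rank`) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s)',
                (item['title'], item['hot_totle'], item['talk'], item['watch'],
                 item['time'], item['tag_pic'], item['rank']),
            )
            self.conn.commit()
            return item

        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()

    Like the CSV pipeline, it would also have to be enabled in ITEM_PIPELINES before Scrapy calls it.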

    PS:!! You can use a separate process and call the crawl in a loop! Emm, one problem found: the CSV header should be written outside of the repeated crawl calls, otherwise the header row gets written again on every run!!
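
    One way to avoid the duplicated header row is to open the CSV in append mode and only write the header while the file is still empty. A sketch of an alternative __init__ for the pipeline (same resou.csv file assumed; this is not the original code):

    import csv
    import os


    class WeiboPipeline:
        def __init__(self):
            # append instead of overwrite, so repeated crawl runs keep adding rows
            need_header = not os.path.exists('resou.csv') or os.path.getsize('resou.csv') == 0
            self.file = open('resou.csv', 'a', encoding='utf-8-sig', newline='')
            self.writer = csv.writer(self.file)
            if need_header:
                self.writer.writerow(['热搜名称', '热度', '讨论', '阅读量', '记录时间', '热搜标签', '排名'])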

    This script already handles both a timed stop and running in an endless loop; if the crawler is to run on a server, it is best to run this file. The loop here is limited to 300 s.

    from multiprocessing import Process
    from scrapy import cmdline
    import time
    import logging
    import datetime

    # just configure these parameters: spider name and run frequency (seconds)
    confs = [
        {
            "spider_name": "weibo-resou",
            "frequency": 120,
        },
    ]

    def start_spider(spider_name, frequency):
        args = ["scrapy", "crawl", spider_name]
        start_time = time.time()

        while True:
            start = time.time()
            # run `scrapy crawl <spider_name>` in a child process so it can be started again
            p = Process(target=cmdline.execute, args=(args,))
            p.start()
            p.join()
            logging.debug("### use time: %s" % (time.time() - start))
            during_time = time.time() - start_time
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "正在爬取微博热搜哟")
            if during_time > 300:
                print("爬取结束哟")
                break
            else:
                time.sleep(frequency)
                print(during_time)


    if __name__ == '__main__':
        for conf in confs:
            process = Process(target=start_spider, args=(conf["spider_name"], conf["frequency"]))
            process.start()

            # during_time is measured right after start(), so it is ~0 here and this
            # terminate branch will not normally be reached
            start_time = time.time()
            during_time = time.time() - start_time
            if during_time > 300:
                process.terminate()
                break
            # time.sleep(86400)

    Stop time: set it in settings:

    CLOSESPIDER_TIMEOUT = <number of seconds>
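
    CLOSESPIDER_TIMEOUT belongs to Scrapy's built-in CloseSpider extension. For example, to close the spider about 300 seconds after it opens:

    # settings.py
    CLOSESPIDER_TIMEOUT = 300    # close the spider ~300 seconds after it opens

    # or only for this spider, inside WeiboResouSpider:
    # custom_settings = {'CLOSESPIDER_TIMEOUT': 300}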

    Ha, next up is processing the data.

  • Original post: https://www.cnblogs.com/sdosun/p/13305701.html