• Scrapy distributed spider - RedisSpider


    Crawl book information from dangdang.com.

    Multiple machines crawl at the same time, sharing one Redis instance for coordination via scrapy_redis.

    The requests waiting to be crawled are stored in Redis; each machine pops a request object (removing its record from Redis) and then crawls it. This is how the distributed spider is implemented.
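    For the spider below to share its request queue and duplicate filter across machines, the project's settings.py has to enable the scrapy_redis components. A minimal sketch, assuming a local Redis instance (point REDIS_URL at the shared Redis server on every machine):

    # settings.py - scrapy_redis configuration (sketch)
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # queue requests in Redis
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # dedup request fingerprints in Redis
    SCHEDULER_PERSIST = True                                     # keep the queue when a spider stops
    REDIS_URL = "redis://127.0.0.1:6379"                         # assumed address of the shared Redis
    ITEM_PIPELINES = {
        "scrapy_redis.pipelines.RedisPipeline": 400,             # optional: store scraped items in Redis
    }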

    import scrapy
    from scrapy_redis.spiders import RedisSpider
    from copy import deepcopy
    
    
    class DangdangSpider(RedisSpider):
        name = 'dangdang'
        allowed_domains = ['dangdang.com']
        # On startup the spider reads its start URL(s) from this Redis key.
        redis_key = "dangdang" # lpush dangdang 'http://book.dangdang.com/'
    
        def parse(self, response):
            # Top-level categories
            div_list = response.xpath("//div[@class='con flq_body']/div")[:-4]
            print(len(div_list), 'top-level category divs')
            for div in div_list:
                item = {}
                item['b_cate'] = div.xpath("./dl/dt//text()").extract()
                item['b_cate'] = [i.strip() for i in item['b_cate'] if len(i.strip()) > 0]  # drop empty strings
                print('b_cate:', item['b_cate'])
                # Mid-level categories
                if item['b_cate'] == ['创意文具']:
                    print(item['b_cate'], "pass......")
                    item['m_cate'] = None
                    item['s_cate_url'] = div.xpath("./dl/dt/a/@ddt-src").extract_first()
                    print('s_cate_url:', item['s_cate_url'])
                    # yield scrapy.Request(
                    #     item['s_cate_url'],
                    #     callback=self.parse_special,
                    #     meta={'item': deepcopy(item)}
                    # )
                else:
                    dl_list = div.xpath(".//dl[@class='inner_dl']")
                    for dl in dl_list:
                        item['m_cate'] = dl.xpath("./dt//text()").extract()
                        item['m_cate'] = [i.strip() for i in item['m_cate'] if len(i.strip())>0]
                        # Sub-categories
                        dd_list = dl.xpath("./dd")
                        for dd in dd_list:
                            item['s_cate'] = dd.xpath("./a/@title").extract_first()
                            item['s_cate_url'] = dd.xpath("./a/@ddt-src").extract_first()
                            # All books under this sub-category
                            if item['s_cate_url'] is not None:
                                yield scrapy.Request(
                                    item['s_cate_url'],
                                    callback=self.parse_books,
                                    meta={'item': deepcopy(item)}
                                )
    
        def parse_special(self, response):
            ''' Stationery info (not implemented). '''
            pass
    
        def parse_books(self, response):
            item = response.meta['item']
            # Books under the current sub-category
            li_list = response.xpath("//ul[@class='list_aa ']/li")
            if li_list:
                for li in li_list:
                    try:
                        # 'num' is the integer part of the price, 'tail' is the decimal part
                        item['book_price'] = (li.xpath(".//span[@class='num']/text()").extract_first() +
                                              li.xpath(".//span[@class='tail']/text()").extract_first())
                    except TypeError:
                        # one of the price spans is missing, so extract_first() returned None
                        item['book_price'] = 'Unknown'
                    item['book_url'] = li.xpath("./a/@href").extract_first()
                    if item['book_url'] is not None:
                        yield scrapy.Request(
                            item['book_url'],
                            callback=self.parse_book_detail,
                            meta={'item': deepcopy(item)}
                        )
    
        def parse_book_detail(self, response):
            item = response.meta['item']
            item['book_name'] = response.xpath("//div[@class='name_info']/h1/img/text()").extract_first()
            item['book_desc'] = response.xpath("//span[@class='head_title_name']/text()").extract_first()
            # Publication details for this book
            span_list = response.xpath("//div[@class='messbox_info']/span")
            item['book_author'] = span_list.xpath("./span[1]/a/text()").extract()  # there may be multiple authors
            item['publisher'] = span_list.xpath("./span[2]/a/text()").extract_first()
            item['pub_date'] = span_list.xpath("./span[3]/text()").extract_first()
            print(item)
            # yield item
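
    To run the distributed crawl, start the same spider on every machine with scrapy crawl dangdang; each instance blocks until a start URL appears under the redis_key. The key can be seeded from redis-cli (lpush dangdang 'http://book.dangdang.com/', as noted in the spider) or, equivalently, with a small redis-py script. A sketch, assuming the redis package and the same Redis server as above:

    # seed_start_url.py - push the start URL into the key the spiders are listening on
    import redis

    r = redis.Redis(host="127.0.0.1", port=6379)      # assumed shared Redis address
    r.lpush("dangdang", "http://book.dangdang.com/")  # idle spider instances will now pick up work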
  • Original article: https://www.cnblogs.com/tangpg/p/10845174.html