• Incremental crawlers


     - When you browse the web you will notice that some sites periodically add new data on top of their existing pages: a movie site may publish a batch of recently popular films, and a novel site may post new chapters as the author writes them.

     - An incremental crawler is a crawler program that monitors a site for updates so that only the newly added data is fetched.

     - Where to perform the incremental check:

       - 1) Before sending a request, check whether the URL has been crawled before.

       - 2) After parsing the response, check whether that content has been crawled before.

       - 3) When writing to the storage medium, check whether the content already exists there.

     - The core of incremental crawling: deduplication. A minimal sketch of the idea follows this list.
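     All three checks come down to the same primitive: a persistent set of things already seen. A Redis set is a natural fit because sadd returns 1 only when the member is new. A minimal sketch (the key name seen_urls is illustrative; assumes a Redis server on 127.0.0.1:6379):

    from redis import Redis

    conn = Redis(host="127.0.0.1", port=6379)

    def is_new(url):
        # sadd returns 1 if the member was added (not seen before),
        # 0 if it was already in the set
        return conn.sadd("seen_urls", url) == 1

    for url in ["https://example.com/a", "https://example.com/a"]:
        if is_new(url):
            print("new, crawl it:", url)
        else:
            print("already seen, skip:", url)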

    - Crawling movie data (URL-level dedup)

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from redis import Redis
    from increment1_pro.items import Increment1ProItem
    class MovieSpider(CrawlSpider):
        name = 'movie'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']
    
        rules = (
            Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            conn = Redis(host="127.0.0.1", port=6379)
            detail_url_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
            for url in detail_url_list:
                url = 'https://www.4567tv.tv' + url
                # sadd returns 1 only when the URL is new to the set,
                # i.e. it has not been crawled before
                ex = conn.sadd("movies_url", url)
                if ex == 1:
                    yield scrapy.Request(url=url, callback=self.parse_detail)
                else:
                    print("No new data on the site")
    
        def parse_detail(self, response):
            item = Increment1ProItem()
            item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
            item['actor'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()
    
            yield item
    movie.py
    import json
    from redis import Redis
    class Increment1ProPipeline(object):
        conn = None
        def open_spider(self, spider):
            self.conn = Redis(host="127.0.0.1", port=6379)
        def process_item(self, item, spider):
            dic = {
                "name": item["name"],
                "actor": item["actor"]
            }
            print("Storing newly crawled data")
            # push the serialized dict; redis-py cannot store an Item or dict directly
            self.conn.lpush("movie_data", json.dumps(dic))
            return item
    pipelines.py
    import scrapy
    
    class Increment1ProItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        actor = scrapy.Field()
    items.py
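    To verify what the spider has stored, you can read the set and the list back out of Redis. A quick inspection sketch (key names match the code above; assumes the JSON-serializing pipeline shown in pipelines.py):

    import json
    from redis import Redis

    conn = Redis(host="127.0.0.1", port=6379)

    # number of distinct detail-page URLs seen so far
    print(conn.scard("movies_url"))

    # the most recently stored movie records (lpush puts new items at the head)
    for raw in conn.lrange("movie_data", 0, 4):
        print(json.loads(raw))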

      - Crawling Qiushibaike jokes with a homemade data fingerprint (content-level dedup)

    # -*- coding: utf-8 -*-
    import scrapy
    import hashlib
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from redis import Redis
    from increment2_pro.items import Increment2ProItem
    
    
    class QiubaiSpider(CrawlSpider):
        name = 'qiubai'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.qiushibaike.com/text/']
    
        rules = (
            Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            div_list = response.xpath(
                '//div[@class="article block untagged mb15 typs_hot"] | //div[@class="article block untagged mb15 typs_old"]')
            conn = Redis(host="127.0.0.1", port=6379)
            for div in div_list:
                item = Increment2ProItem()
                content = div.xpath('.//div[@class="content"]/span/text()').extract()
                item["content"] = "".join(content)
                item["author"] = div.xpath('./div/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
                source = item["author"] + item["content"]

                # homemade data fingerprint: hash the whole record so that
                # dedup works at the content level rather than the URL level
                hashValue = hashlib.sha3_256(source.encode()).hexdigest()

                ex = conn.sadd("qiubai_hash", hashValue)
                if ex == 1:
                    yield item
                else:
                    print("No new data to crawl")
    qiubai.py
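    One design note on the fingerprint: hashing the raw author + content string means any whitespace or formatting change yields a "new" record. A sketch of a slightly more robust fingerprint helper (the normalization policy is an assumption, not from the original post):

    import hashlib

    def fingerprint(*fields):
        # normalize each field: strip surrounding whitespace and collapse
        # internal runs, so cosmetic edits do not defeat deduplication
        # (assumed policy); join with a separator so field boundaries matter
        normalized = "|".join(" ".join(f.split()) for f in fields if f)
        return hashlib.sha3_256(normalized.encode("utf-8")).hexdigest()

    # the same logical record with different whitespace gets one fingerprint
    assert fingerprint("author", "some  content ") == fingerprint("author", "some content")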
    import json
    from redis import Redis
    class Increment2ProPipeline(object):
        conn = None

        def open_spider(self, spider):
            self.conn = Redis(host='127.0.0.1', port=6379)
        def process_item(self, item, spider):
            dic = {
                "author": item["author"],
                "content": item["content"]
            }
            print("Storing crawled data...")
            # serialize to JSON; redis-py cannot store a dict directly
            self.conn.lpush("qiubai_data", json.dumps(dic))
            return item
    pipelines.py
    import scrapy
    
    class Increment2ProItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        content = scrapy.Field()
        author = scrapy.Field()
    items.py
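    The two examples above implement checks 1) and 2); the third option, deduplicating at write time, fits naturally into a pipeline. A sketch of that variant (the class name, key names, and JSON encoding are illustrative choices, not from the original post):

    import json
    import hashlib
    from redis import Redis

    class DedupStoragePipeline(object):
        """Drops a record at write time if an identical one is already stored."""
        conn = None

        def open_spider(self, spider):
            self.conn = Redis(host="127.0.0.1", port=6379)

        def process_item(self, item, spider):
            # canonical JSON form of the record, stable across field order
            record = json.dumps(dict(item), sort_keys=True)
            fp = hashlib.sha3_256(record.encode("utf-8")).hexdigest()
            # sadd returns 0 when the fingerprint already exists,
            # i.e. this exact record was stored on an earlier run
            if self.conn.sadd("stored_fingerprints", fp) == 1:
                self.conn.lpush("stored_data", record)
            return item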