• Build a Distributed Crawler in 21 Days - Crawling Qiushibaike with the Spider Class (7)


    7.1. Qiushibaike

    Installation

    pip install pypiwin32

    pip install Twisted-18.7.0-cp36-cp36m-win_amd64.whl   # locally downloaded wheel for Python 3.6 on 64-bit Windows

    pip install scrapy

    Create and run the project

    scrapy startproject qsbk   # create the project
    
    scrapy genspider qsbk_spider "qiushibaike.com"   # create the spider; the domain becomes allowed_domains
    
    scrapy crawl qsbk_spider         # run the spider

    Code

    qsbk_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from qsbk.items import QsbkItem
    
    class QsbkSpiderSpider(scrapy.Spider):
        name = 'qsbk_spider'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
        base_domain = "https://www.qiushibaike.com"
    
        def parse(self, response):
            duanzidivs = response.xpath("//div[@id='content-left']/div")
            for duanzidiv in duanzidivs:
                author = duanzidiv.xpath(".//h2/text()").get().strip()
                content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
                content = "".join(content).strip()
                item = QsbkItem(author=author,content=content)
                yield item
            # crawl the data on the following pages
            next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
            if not next_url:
                return
            else:
                yield scrapy.Request(self.base_domain+next_url,callback=self.parse)
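
    Note that .get() returns None when an XPath matches nothing, so the .get().strip() call on the author line will raise AttributeError for a div without an h2 tag (list pages occasionally contain non-joke blocks). A defensive sketch of that one line, assuming such divs can appear:

    author = duanzidiv.xpath(".//h2/text()").get()      # may be None
    author = author.strip() if author else 'anonymous'  # 'anonymous' is a hypothetical fallback value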

    items.py

    import scrapy
    
    class QsbkItem(scrapy.Item):
        author = scrapy.Field()
        content = scrapy.Field()
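
    scrapy.Item behaves much like a dict, but only declared fields can be assigned, which catches typos early. A quick illustration (not part of the project code):

    item = QsbkItem(author='test', content='some text')
    print(dict(item))     # {'author': 'test', 'content': 'some text'}
    item['title'] = 'x'   # raises KeyError: QsbkItem does not support field: title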

    pipelines.py

    # -*- coding: utf-8 -*-
    
    import json
    
    # 1. Manually convert the dict to JSON format
    
    # class QsbkPipeline(object):
    #     def __init__(self):
    #         self.fp = open('duanzi.json','w',encoding='utf-8')
    #
    #     def open_spider(self,spider):
    #         print('spider started')
    #
    #     def process_item(self, item, spider):
    #         item_json = json.dumps(dict(item),ensure_ascii=False)
    #         self.fp.write(item_json+'\n')
    #         return item
    #
    #     def close_spider(self,spider):
    #         self.fp.close()
    #         print('spider finished')
    
    # 2. Use JsonItemExporter - writes all items as a single JSON array; suitable when the data volume is small
    # from scrapy.exporters import JsonItemExporter
    # class QsbkPipeline(object):
    #     def __init__(self):
    #         self.fp = open('duanzi.json','wb')
    #         self.exporter = JsonItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
    #         self.exporter.start_exporting()
    #
    #     def open_spider(self,spider):
    #         print('spider started')
    #
    #     def process_item(self, item, spider):
    #         self.exporter.export_item(item)
    #         return item
    #
    #     def close_spider(self,spider):
    #         self.exporter.finish_exporting()
    #         self.fp.close()
    #         print('spider finished')
    
    
    # 3. JsonLinesItemExporter - writes one JSON object per line; suitable when the data volume is large
    from scrapy.exporters import JsonLinesItemExporter
    class QsbkPipeline(object):
        def __init__(self):
            self.fp = open('duanzi.json','wb')
            self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
    
        def open_spider(self,spider):
            print('spider started')
    
        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item
    
        def close_spider(self,spider):
            self.fp.close()
            print('spider finished')
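
    Because JsonLinesItemExporter writes one JSON object per line, duanzi.json can be read back incrementally instead of being loaded in one piece. A minimal sketch, assuming the file produced by the pipeline above:

    import json

    with open('duanzi.json', encoding='utf-8') as fp:
        for line in fp:
            duanzi = json.loads(line)
            print(duanzi['author'], duanzi['content'][:20])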

    settings.py

    ROBOTSTXT_OBEY = False
    
    DOWNLOAD_DELAY = 1
    
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    
    ITEM_PIPELINES = {
        'qsbk.pipelines.QsbkPipeline': 300,
    }
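
    These keys apply project-wide. Scrapy also accepts the same keys per spider through the custom_settings class attribute; a hypothetical sketch, in case only qsbk_spider should be throttled:

    class QsbkSpiderSpider(scrapy.Spider):
        name = 'qsbk_spider'
        # overrides the values in settings.py for this spider only
        custom_settings = {
            'DOWNLOAD_DELAY': 1,
        }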

    start.py

    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl qsbk_spider".split())
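
    cmdline.execute is equivalent to running scrapy crawl qsbk_spider in a shell inside the project directory, which makes the crawl easy to launch from an IDE. An alternative sketch using Scrapy's in-process API, assuming the default project layout:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from qsbk.spiders.qsbk_spider import QsbkSpiderSpider

    process = CrawlerProcess(get_project_settings())  # loads settings.py
    process.crawl(QsbkSpiderSpider)
    process.start()                                   # blocks until the crawl finishes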
     
     
     