• scrapy 爬取小说


    QiushuSpider

    # -*- coding: utf-8 -*-
    import scrapy
    import time
    from qiushu.items import QiushuItem
    
    class QiushuspiderSpider(scrapy.Spider):
        """Spider for www.qiushu.cc: front-page categories -> per-category
        book lists (with pagination) -> one QiushuItem per book page."""
        name = 'QiushuSpider'
        allowed_domains = ['www.qiushu.cc']
        start_urls = ['http://www.qiushu.cc/']

        def parse(self, response):
            '''Parse the category list on the front page and follow each category link.'''
            links = response.xpath('//p[@class="hot_tips"]/a/@href').extract()
            for link in links:
                # response.urljoin handles both relative and absolute hrefs,
                # unlike naive string concatenation with the site root.
                yield scrapy.Request(response.urljoin(link),
                                     callback=self.parse_books, dont_filter=True)

        def parse_books(self, response):
            '''Parse one page of a category's book list; follow each book, then the next page.'''
            for row in response.xpath('//*[@id="main"]/div[1]/div/div/ul/li'):
                # extract_first() returns None when the node is missing, and
                # ''.join(None) would raise TypeError; default='' lets us skip safely.
                book_url = row.xpath('.//span[@class="t1"]/a/@href').extract_first(default='')
                if book_url:
                    yield scrapy.Request(response.urljoin(book_url),
                                         callback=self.parse_section, dont_filter=True)
            # Follow pagination when a "next" link is present.
            next_url = ''.join(response.xpath('//*[@class="next"]/@href').extract())
            if next_url:
                yield scrapy.Request(response.urljoin(next_url),
                                     callback=self.parse_books, dont_filter=True)

        def parse_section(self, response):
            '''Parse a single book detail page into a QiushuItem.'''
            item = QiushuItem()
            # Book title
            item['name'] = ''.join(response.xpath('//div[@class="title"]/h1/text()').extract())
            # Author
            item['author'] = ''.join(response.xpath('//div[@class="title"]/span/text()').extract())
            # The breadcrumb text is assumed to look like "... > category"; guard the
            # split so a layout change yields '' instead of raising IndexError.
            breadcrumb = ''.join(response.xpath('//*[@id="main"]/div[2]/text()[2]').extract())
            parts = breadcrumb.split('>')
            item['booktype'] = parts[1] if len(parts) > 1 else ''
            # Publication state of the book
            item['state'] = ''.join(response.xpath('//*[@id="main"]/div[2]/span/text()').extract())
            # Canonical URL of the book page
            item['showUrl'] = response.url
            # Book description / synopsis
            item['describe'] = ''.join(response.xpath('//div[@class="intro"]/p/text()').extract())
            yield item

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class QiushuItem(scrapy.Item):
        """Container for one scraped book record from www.qiushu.cc."""
        # Book title
        name = scrapy.Field()
        # Author name
        author = scrapy.Field()
        # Category / genre label
        booktype = scrapy.Field()
        # Publication state of the book
        state = scrapy.Field()
        # Canonical URL of the book's page
        showUrl = scrapy.Field()
        # Description / synopsis text
        describe = scrapy.Field()

  • 相关阅读:
    Android签名详解(debug和release)
    Java反射机制的学习
    Android应用开发中如何使用隐藏API(转)
    asp.net购物车,订单以及模拟支付宝支付(二)---订单表
    asp.net购物车,订单以及模拟支付宝支付(一)---购物车表及添加购物车流程
    asp.net权限控制的方式
    .Net使用程序发送邮件时的问题
    Word2016“此功能看似已中断 并需要修复”问题解决办法
    C#字符串来袭——因为爱,所以爱
    C#时间的味道——任时光匆匆我只在乎你
  • 原文地址:https://www.cnblogs.com/sxqfuture/p/10256485.html
Copyright © 2020-2023  润新知