• Scrapy spider: spider.py


import scrapy
from scrapy.selector import Selector

# Assumption: XiaoshuoItem lives in the project's items.py (sketched below).
from xiaoshuo.items import XiaoshuoItem


class xiaoshuoSpider(scrapy.Spider):
    name = "freenovel"
    headers = {
        'Upgrade-Insecure-Requests': '1',
    }

    def start_requests(self):
        # Listing page of completed, free novels
        start_url = ["<start URL>"]  # placeholder in the original post
        for url in start_url:
            yield scrapy.Request(url=url, headers=self.headers, callback=self.first_parse)

    def first_parse(self, response):
        # Collect the category names and links from the filter bar on the listing page
        sel = Selector(response)
        category = sel.css('div[class="select-list"] div ul[type="category"] li a::text').extract()
        category_url = sel.css('div[class="select-list"] div ul[type="category"] li a::attr(href)').extract()
        for i in range(1, len(category_url)):  # skip index 0, the "all categories" link
            item = XiaoshuoItem()
            item['category'] = category[i]
            item['category_url'] = "https:" + category_url[i]
            yield scrapy.Request(url=item['category_url'], meta={"category": item['category']},
                                 callback=self.second_parse, headers=self.headers)

    def second_parse(self, response):
        # Follow every novel on the category page to its catalog ("#Catalog") page
        sel = Selector(response)
        novel_url = sel.css('div[class="book-mid-info"] h4 a::attr(href)').extract()
        category = response.meta['category']
        for url in novel_url:
            yield scrapy.Request(url="https:" + url + "#Catalog", meta={"category": category},
                                 callback=self.article_parse, headers=self.headers)

    def article_parse(self, response):
        # From the catalog page, grab the novel title and the first chapter link
        sel = Selector(response)
        article_name = sel.xpath('//h1/em/text()').extract_first()
        article_url = sel.css(
            'div[id="j-catalogWrap"] div[class="volume-wrap"] div[class="volume"] ul li a::attr(href)').extract_first()
        article_url = "https:" + article_url
        yield scrapy.Request(url=article_url,
                             meta={'article_name': article_name, "category": response.meta['category']},
                             callback=self.detail_parse, headers=self.headers)

    def detail_parse(self, response):
        # Extract the chapter text, emit one item per chapter, then follow "next chapter"
        sel = Selector(response)
        content_list = sel.css(
            'div[id="j_chapterBox"] div[class="text-wrap"] div[class="main-text-wrap"] '
            'div[class="read-content j_readContent"] p::text').extract()
        item = XiaoshuoItem()
        item['content'] = "".join(content_list)
        item['content_name'] = sel.css('h3[class="j_chapterName"]::text').extract_first()
        item['article_name'] = response.meta['article_name']
        item['category'] = response.meta['category']
        yield item
        next_page = sel.css('a[id="j_chapterNext"]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(url="https:" + next_page,
                                 meta={'article_name': item['article_name'], "category": item['category']},
                                 callback=self.detail_parse, headers=self.headers)
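
The spider fills five fields on XiaoshuoItem (category, category_url, article_name, content_name, content). The original post does not show items.py; a minimal definition matching those fields might look like this (the xiaoshuo.items module path is an assumption):

# items.py - a minimal sketch inferred from the fields the spider sets;
# this file is not shown in the original post.
import scrapy


class XiaoshuoItem(scrapy.Item):
    category = scrapy.Field()      # novel category name
    category_url = scrapy.Field()  # category listing URL
    article_name = scrapy.Field()  # novel title
    content_name = scrapy.Field()  # chapter title
    content = scrapy.Field()       # chapter body text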
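
Since the spider only yields items, persisting the chapters is left to an item pipeline. As an illustrative sketch (not part of the original post), a pipeline could append each chapter to a per-novel text file:

# pipelines.py - illustrative sketch, not in the original post: append each
# chapter to a text file named after the novel, grouped by category.
import os


class XiaoshuoPipeline:
    def process_item(self, item, spider):
        out_dir = os.path.join("output", item['category'])
        os.makedirs(out_dir, exist_ok=True)
        path = os.path.join(out_dir, item['article_name'] + ".txt")
        with open(path, "a", encoding="utf-8") as f:
            f.write(item['content_name'] + "\n" + item['content'] + "\n\n")
        return item

To try it, register the pipeline in settings.py with ITEM_PIPELINES = {"xiaoshuo.pipelines.XiaoshuoPipeline": 300} (module path assumed) and run the spider with scrapy crawl freenovel.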
• Original post: https://www.cnblogs.com/ShadowXie/p/9699921.html