• Scrapy spider: spider.py


import scrapy
from scrapy.selector import Selector

# Assumption: XiaoshuoItem lives in the project's items.py (sketched below).
from xiaoshuo.items import XiaoshuoItem


class xiaoshuoSpider(scrapy.Spider):
    name = "freenovel"
    headers = {
        'Upgrade-Insecure-Requests': '1',
    }

    def start_requests(self):
        # Listing page of completed, free novels
        start_url = ["<start URL>"]  # placeholder in the original post
        for url in start_url:
            yield scrapy.Request(url=url, headers=self.headers, callback=self.first_parse)

    def first_parse(self, response):
        # Collect the category names and links from the filter bar on the listing page
        sel = Selector(response)
        category = sel.css('div[class="select-list"] div ul[type="category"] li a::text').extract()
        category_url = sel.css('div[class="select-list"] div ul[type="category"] li a::attr(href)').extract()
        for i in range(1, len(category_url)):  # skip index 0, the "all categories" link
            item = XiaoshuoItem()
            item['category'] = category[i]
            item['category_url'] = "https:" + category_url[i]
            yield scrapy.Request(url=item['category_url'], meta={"category": item['category']},
                                 callback=self.second_parse, headers=self.headers)

    def second_parse(self, response):
        # Follow every novel on the category page to its catalog ("#Catalog") page
        sel = Selector(response)
        novel_url = sel.css('div[class="book-mid-info"] h4 a::attr(href)').extract()
        category = response.meta['category']
        for url in novel_url:
            yield scrapy.Request(url="https:" + url + "#Catalog", meta={"category": category},
                                 callback=self.article_parse, headers=self.headers)

    def article_parse(self, response):
        # From the catalog page, grab the novel title and the first chapter link
        sel = Selector(response)
        article_name = sel.xpath('//h1/em/text()').extract_first()
        article_url = sel.css(
            'div[id="j-catalogWrap"] div[class="volume-wrap"] div[class="volume"] ul li a::attr(href)').extract_first()
        article_url = "https:" + article_url
        yield scrapy.Request(url=article_url,
                             meta={'article_name': article_name, "category": response.meta['category']},
                             callback=self.detail_parse, headers=self.headers)

    def detail_parse(self, response):
        # Extract the chapter text, emit one item per chapter, then follow "next chapter"
        sel = Selector(response)
        content_list = sel.css(
            'div[id="j_chapterBox"] div[class="text-wrap"] div[class="main-text-wrap"] '
            'div[class="read-content j_readContent"] p::text').extract()
        item = XiaoshuoItem()
        item['content'] = "".join(content_list)
        item['content_name'] = sel.css('h3[class="j_chapterName"]::text').extract_first()
        item['article_name'] = response.meta['article_name']
        item['category'] = response.meta['category']
        yield item
        next_page = sel.css('a[id="j_chapterNext"]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(url="https:" + next_page,
                                 meta={'article_name': item['article_name'], "category": item['category']},
                                 callback=self.detail_parse, headers=self.headers)
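
The spider fills five fields on XiaoshuoItem (category, category_url, article_name, content_name, content). The original post does not show items.py; a minimal definition matching those fields might look like this (the xiaoshuo.items module path is an assumption):

# items.py - a minimal sketch inferred from the fields the spider sets;
# this file is not shown in the original post.
import scrapy


class XiaoshuoItem(scrapy.Item):
    category = scrapy.Field()      # novel category name
    category_url = scrapy.Field()  # category listing URL
    article_name = scrapy.Field()  # novel title
    content_name = scrapy.Field()  # chapter title
    content = scrapy.Field()       # chapter body text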
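
Since the spider only yields items, persisting the chapters is left to an item pipeline. As an illustrative sketch (not part of the original post), a pipeline could append each chapter to a per-novel text file:

# pipelines.py - illustrative sketch, not in the original post: append each
# chapter to a text file named after the novel, grouped by category.
import os


class XiaoshuoPipeline:
    def process_item(self, item, spider):
        out_dir = os.path.join("output", item['category'])
        os.makedirs(out_dir, exist_ok=True)
        path = os.path.join(out_dir, item['article_name'] + ".txt")
        with open(path, "a", encoding="utf-8") as f:
            f.write(item['content_name'] + "\n" + item['content'] + "\n\n")
        return item

To try it, register the pipeline in settings.py with ITEM_PIPELINES = {"xiaoshuo.pipelines.XiaoshuoPipeline": 300} (module path assumed) and run the spider with scrapy crawl freenovel.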
• Original post: https://www.cnblogs.com/ShadowXie/p/9699921.html