以爬取阳光热线问政平台网站为例,进行详情页的爬取。
# -*- coding: utf-8 -*-
import scrapy
from yanguang.items import YanguangItem


class SunSpider(scrapy.Spider):
    """Crawl the question list pages and follow every row to its detail page.

    ``parse`` handles a list page: it builds one item per table row, requests
    the row's detail page, and then requests the next list page.
    ``parse_detail`` completes the item with the detail page's text and
    image URLs and yields it to the pipelines.
    """

    name = 'sun'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    def parse(self, response):
        """Yield a detail-page request per list row, then the next-page request."""
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YanguangItem()
            item['title'] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            # A row without a link cannot be followed; a Request with a
            # None URL would raise instead of being skipped.
            if item["href"] is None:
                continue

            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the current page.
                response.urljoin(item["href"]),
                callback=self.parse_detail,
                # Carry the partially filled item over to the detail callback.
                meta={"item": item},
            )

        # Pagination: follow the ">" link when the page has one.
        next_url = response.xpath(".//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                # Pass the method itself; calling it here (``self.parse()``)
                # raises TypeError because ``response`` is missing.
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Fill in the content text and image URLs for the item, then yield it."""
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Resolve image paths against the page URL so that already-absolute
        # src values are not corrupted by prefixing the host a second time.
        item["content_img"] = [response.urljoin(i) for i in item["content_img"]]
        yield item
下面为pipelines.py文件中对爬取的数据处理操作。
import re


class YanguangPipeline(object):
    """Item pipeline that cleans the ``content`` field of each scraped item."""

    def process_item(self, item, spider):
        """Replace ``item["content"]`` with its cleaned form, print and return the item."""
        item["content"] = self.process_content(item["content"])
        print(item)
        return item

    def process_content(self, content):
        """Return the strings of ``content`` with whitespace removed and empties dropped.

        ``content`` is an iterable of strings. Every non-breaking space
        (``\\xa0``) and whitespace character is deleted from each string, and
        strings that end up empty are left out of the returned list.
        """
        # The escapes are essential: without the backslashes the pattern
        # would delete the literal text "xa0" and every letter "s".
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        # Drop the strings that became empty after cleaning.
        content = [i for i in content if len(i) > 0]
        return content
在settings.py文件中修改USER_AGENT的内容,使对方服务器无法一眼看出我们的请求是爬虫。
默认settings.py文件中的USER_AGENT为:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'tencent (+http://www.yourdomain.com)'
将settings.py文件中的USER_AGENT修改为:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'