- CrawlSpider is a subclass of the Spider crawler class
Usage workflow
- Create a CrawlSpider-based spider file: `scrapy genspider -t crawl spider_name www.xxx.com`
- Construct a link extractor and a rule parser (see the sketch after this list)
    - Link extractor:
        - Purpose: extracts links from the page according to the specified rule
        - Extraction rule: allow = "regular expression"
        - It first matches every URL on the page, then keeps only the links that match the allow regex
    - Rule parser:
        - Purpose: sends a request to each link the link extractor extracted, then parses the page source of the response according to the specified callback
        - The follow=True parameter: keeps applying the link extractor to the pages fetched from the extracted links (so, for example, pagination links discovered on page 2 are followed as well)
    - Note:
        - Link extractors and rule parsers are in one-to-one correspondence
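A minimal sketch of how a link extractor and a rule parser pair up. The spider name, start URL, allow regex, and XPath here are placeholders for illustration, not taken from a real site:

```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DemoSpider(CrawlSpider):
    name = 'demo'
    start_urls = ['https://www.example.com/list?page=1']

    rules = (
        # The link extractor collects every URL on the page and keeps only
        # those matching the allow regex (here: pagination links).
        # The rule parser requests each extracted link and hands the response
        # to the callback; follow=True re-applies the link extractor to those
        # response pages as well, so all pagination pages get visited.
        Rule(LinkExtractor(allow=r'list\?page=\d+'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Parse the page source of each requested page here.
        title = response.xpath('//title/text()').get()
        yield {'title': title}
```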
Example code
Deep (multi-level) data crawling with CrawlSpider
- Spider file
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunspider.items import SunspiderItem, SunspiderItemSecond


class SunSpiderSpider(CrawlSpider):
    name = 'sun_spider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Two levels of data to crawl, so two link extractors; link extractors
    # and rule parsers are in one-to-one correspondence.
    link = LinkExtractor(allow=r'type=4&page=\d+')
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # Instantiate the Rule (rule parser) objects
        Rule(link, callback='parse_item', follow=True),
        # Detail pages need no further link extraction, so follow=False
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/@title').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunspiderItem()
            item['title'] = title
            item['status'] = status
            item['num'] = num
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]/tbody/tr[1]//text()').extract()
        content = ''.join(content)
        num = response.xpath('/html/body/div[9]/table[1]/tbody/tr/td[2]/span[2]/text()').extract_first()
        if num:
            num = num.split(':')[-1]
            item = SunspiderItemSecond()
            item['content'] = content
            item['num'] = num
            yield item
```
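Once the project is configured, the spider runs with `scrapy crawl sun_spider`. Each list page yields `SunspiderItem` objects and each detail page yields `SunspiderItemSecond` objects; both flow into the same pipeline, which tells them apart by class (see pipelines.py below).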
- items.py file
```python
import scrapy


# Define two item classes; the shared `num` field ties a detail
# record back to its corresponding list record.
class SunspiderItem(scrapy.Item):
    title = scrapy.Field()
    status = scrapy.Field()
    num = scrapy.Field()


class SunspiderItemSecond(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
```
- pipelines.py file
- Store the data
```python
class SunspiderPipeline(object):
    def process_item(self, item, spider):
        # Check which item class this object was packed into
        if item.__class__.__name__ == "SunspiderItemSecond":
            content = item['content']
            num = item['num']
            print(content, num)
        else:
            title = item['title']
            status = item['status']
            num = item['num']
            print(title, status, num)
        return item
```
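For the pipeline to receive items, it must be enabled in settings.py. A minimal sketch, assuming the project is named sunspider (the priority value 300 is just the conventional default):

```python
# settings.py (excerpt)
ITEM_PIPELINES = {
    'sunspider.pipelines.SunspiderPipeline': 300,
}
```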