# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DgSpider(CrawlSpider):
    """Crawl the dygod.net movie listing and scrape each movie detail page."""

    name = 'dg'

    # Must be bare domain names, NOT URLs: with the 'https://' scheme included
    # the offsite middleware filters every request ("Filtered offsite request").
    allowed_domains = ['dygod.net']

    start_urls = ['https://www.dygod.net/html/gndy/dyzz/index.html']

    # Export scraped text as UTF-8 instead of \uXXXX ASCII escape sequences.
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    rules = (
        # Pagination pages: index_2.html, index_3.html, ...
        # The original patterns used a literal 'd+' instead of '\d+' and
        # unescaped '.', so they never matched the intended URLs.
        Rule(LinkExtractor(allow=r'https://www\.dygod\.net/html/gndy/dyzz/index_\d+\.html')),
        # Detail pages: /html/gndy/dyzz/<year>/<id>.html
        Rule(LinkExtractor(allow=r'https://www\.dygod\.net/html/gndy/dyzz/\d+/\d+\.html'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the movie name from one detail page.

        :param response: scrapy Response for a single movie page.
        :returns: dict with key ``name`` (None when the selector matches nothing).
        """
        item = {}
        item['name'] = response.css('div[id*=Zoom] p:nth-child(3)::text').get()
        return item
刚开始报错,因为 start_urls 的 https://www.dygod.net/html/gndy/dyzz/index.html 最后多写了一个 /。
后来继续报错 "Filtered offsite request to 'www.dygod.net'",根本原因是 allowed_domains 写成了带 https:// 前缀的完整 URL;当时没搞清楚就直接把 allowed_domains 注释掉了,正确做法应该是改成 allowed_domains = ['dygod.net']。
但是扒下来的汉字都是 \u25ce\u7247\u3000\u3000\u540d\u3000 这样的 Unicode 转义序列——在 settings 里加上 FEED_EXPORT_ENCODING = 'utf-8' 即可正常输出中文。