#数据源:糗事百科
爬虫代码:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QiubaiSpider(CrawlSpider):
    """Crawl the picture section of qiushibaike.com across its paginated pages.

    Starts from ``/pic/`` and follows every link matched by the two link
    extractors below, handing each fetched page to :meth:`parse_item`.
    """

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # URLs of all pages after the first (/pic/page/<number>).
    # The digit class needs its backslash: the previous pattern 'd+?' matched
    # a literal letter "d", so numeric pagination links were never extracted.
    link = LinkExtractor(allow=r'/pic/page/\d+')
    # URL of the first page.
    link1 = LinkExtractor(allow=r'/pic/$')

    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Print the first matching text node of one post on the fetched page.

        Args:
            response: The response object Scrapy passes to the rule callback.
        """
        # NOTE(review): the element id is hard-coded to a single post, so
        # presumably most pages yield no match here -- confirm whether a
        # general selector over all posts was intended.
        text = response.xpath(
            '//*[@id="qiushi_tag_121352783"]/a/div/span/text()'
        ).extract_first()
        print(text)