# Full-site crawl: pagination is followed mainly by using `yield scrapy.Request`.
import scrapy
class XiaohuaSpider(scrapy.Spider):
    """Spider that walks the paginated ``/daxue/`` listing on xiaohuar.com.

    The first page is fetched from ``start_urls``; every later page is
    requested manually from :meth:`parse` via ``index_<num>.html`` URLs.
    """

    name = 'xiaohua'
    # Entry URL for the crawl.
    start_urls = ['http://www.xiaohuar.com/daxue/']
    # Page number of the next follow-up request. It starts at 2 because,
    # per the original author's note, requesting page 1 this way fails.
    num = 2

    def parse(self, response):
        """Extract the names on one listing page and queue the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``{'name': <text>}`` for every entry whose name text was
                found on the page.
            scrapy.Request: A request for the next listing page, handled by
                this same callback, while ``self.num`` does not exceed the
                computed page count.
        """
        total_text = response.xpath(
            '//*[@id="wrap"]/div/nav/ul/a[1]/b/text()'
        ).extract_first()
        try:
            # NOTE(review): presumably this is a total entry count shown 25
            # per page; floor division ignores a trailing partial page --
            # confirm against the site.
            page_total = int(total_text) // 25
        except (TypeError, ValueError):
            # extract_first() returns None when the node is missing, and the
            # text may not be numeric. Keep scraping this page, but do not
            # follow pagination with an unknown page count.
            self.logger.warning(
                'Could not read the page count from %s; '
                'pagination will not be followed',
                response.url,
            )
            page_total = 0

        # One <div> per listed entry; emit the name instead of discarding it.
        for div in response.xpath('//*[@id="wrap"]/div/div/div'):
            name = div.xpath('./div[1]/div/a/text()').extract_first()
            if name is not None:
                yield {'name': name}

        # Runs once per page (not once per entry) so the counter advances by
        # exactly one page for each response handled.
        if self.num <= page_total:
            # Build the URL of the next listing page.
            url = f'http://www.xiaohuar.com/daxue/index_{self.num}.html'
            print('当前url', url)
            # Advance the counter before yielding so the next callback
            # requests the following page.
            self.num += 1
            # Printed after the increment, so this shows the page number the
            # next callback will request.
            print('当前页数是', self.num)
            # Manually issue the request; this method parses the result too.
            yield scrapy.Request(url=url, callback=self.parse)