Scrapy：多页（page）爬取、POST 请求、通过爬取到的 URL 继续发送请求爬取页面

Scrapy 多页（page）爬取

import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    """Scrape job listings from zhipin.com across several result pages.

    The first response comes from ``start_urls``; ``parse`` then schedules
    the following pages itself by filling a page number into ``url``.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = [
        'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    # Template for follow-up pages; ``%d`` receives the page number.
    # NOTE(review): ``ka=page-2`` is hard-coded and does not track the page
    # number - confirm whether the site depends on it.
    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    # Page number most recently scheduled (presumably the start URL is page 1).
    page = 1
    # Highest page number that will be requested.
    max_page = 4

    def parse(self, response):
        """Yield one ``BossproItem`` per job entry, then request the next page.

        Fields whose XPath matches nothing are stored as ``None``.
        """
        for li in response.xpath('//div[@class="job-list"]/ul/li'):
            item = BossproItem()
            item['job_name'] = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
            item['salary'] = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            item['company'] = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
            # Hand the item over to the item pipeline.
            yield item

        if self.page < self.max_page:
            self.page += 1
            new_url = self.url % self.page
            self.logger.info('Requesting page %d: %s', self.page, new_url)
            # Manually issue the request for the next page; it is parsed by
            # this same callback.
            yield scrapy.Request(url=new_url, callback=self.parse)

Scrapy 发送 POST 请求

import scrapy
from scrapy1.items import Scrapy1Item



class MyspiderSpider(scrapy.Spider):
    """Send a form POST to each start URL and store the raw response text."""

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    # Form fields sent as the POST body (presumably ``kw`` is the word to
    # look up - confirm against the endpoint).
    data = {'kw': 'cat'}

    def start_requests(self):
        """Issue a ``FormRequest`` (POST) for every start URL.

        Overrides the default ``start_requests`` so the start URLs are
        requested with the form data in ``data``.
        """
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, formdata=self.data, callback=self.parse)

    def parse(self, response):
        """Yield a single item holding the keyword and the response body text."""
        item = Scrapy1Item()
        # Read the title from the submitted form data so it cannot drift from
        # the keyword that was actually sent.
        item['title'] = self.data['kw']
        item['content'] = response.text

        yield item

Scrapy 通过爬取到的 URL 继续发送请求，爬取详情页面

import scrapy
from scrapy1.items import Scrapy1Item


class MyspiderSpider(scrapy.Spider):
    """Scrape a listing page, then follow each entry's link for more detail.

    ``parse`` creates an item per list entry and passes it to ``get_detail``
    through the request's ``meta`` dict, where the item is completed.
    """

    name = 'qiubai'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    def get_detail(self, response):
        """Fill ``content`` on the item carried in ``response.meta`` and yield it."""
        item = response.meta['item']
        item['content'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        yield item

    def parse(self, response):
        """Yield one detail-page request per list entry that has a link."""
        for li in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]'):
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # extract_first() returns None when the entry has no link;
                # there is nothing to follow, so skip it.
                continue

            item = Scrapy1Item()
            item['title'] = li.xpath('./div/a/@title').extract_first()
            # urljoin resolves the href against the page URL, so relative and
            # absolute links both produce a valid request URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.get_detail,
                meta={'item': item},
            )

相关阅读:
R语言 ggplot2包
 C++实现景区信息管理系统
 linux系统目录介绍
 Python中的赋值、深拷贝与浅拷贝（内存地址）
三大相关系数: pearson, spearman, kendall（python示例实现）
Xshell删除键不好使：删除显示退格^H
Spark SQL中出现 CROSS JOIN 问题解决
 Python apply函数
 Python Dataframe 分组排序和 Modin
Python 中的时间处理包datetime和arrow
原文地址：https://www.cnblogs.com/NachoLau/p/10472664.html