• Scrapy的基本使用


    爬取:http://quotes.toscrape.com

    单页面

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class QuoteSpider(scrapy.Spider):
        """Print the text, author and tag links of each quote on the start page.

        Selector features demonstrated here:
            1. ``text()`` selects the text content of a node.
            2. ``@attr`` selects the value of an attribute.
            3. ``extract()`` returns every match, while ``extract_first()``
               returns only the first one.
        """

        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Walk every quote container in ``response`` and print its fields."""
            separator = '=' * 20
            # All XPaths below the first one are relative to a single quote node.
            quote_xpath = '//div[@class="col-md-8"]/div[@class="quote"]'
            text_xpath = './/span[@class="text"]/text()'
            author_xpath = './/span/small[@class="author"]/text()'
            tag_href_xpath = './/div[@class="tags"]/a[@class="tag"]/@href'

            for node in response.xpath(quote_xpath):
                print(separator)
                # Single-valued fields: take only the first match.
                print(node.xpath(text_xpath).extract_first())
                print(node.xpath(author_xpath).extract_first())
                # A quote can carry several tags: collect every match.
                print(node.xpath(tag_href_xpath).extract())

    所有页面

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class QuoteSpider(scrapy.Spider):
        """Print every quote on the site by following the pager's "next" link.

        Selector and request features demonstrated here:
            1. ``text()`` selects the text content of a node.
            2. ``@attr`` selects the value of an attribute.
            3. ``extract()`` returns every match, while ``extract_first()``
               returns only the first one.
            4. ``response.urljoin()`` joins a link found on the page with the
               URL of the current response.
            5. ``scrapy.Request(url=_next, callback=self.parse)`` schedules the
               next page and handles it with the same callback.
        """

        name = 'quote'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Print the quotes on this page, then request the next page if any.

            Yields:
                scrapy.Request: a request for the following page, handled by
                this same method. Nothing is yielded when the page has no
                "next" link.
            """
            quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
            for quote in quotes:
                print('=' * 20)
                # extract_first(): take only the first match.
                text = quote.xpath('.//span[@class="text"]/text()').extract_first()
                print(text)
                author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
                print(author)
                # extract(): collect every match.
                tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
                print(tags)

            print('>' * 40)
            next_url = response.xpath(
                '//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href'
            ).extract_first()
            print(next_url)
            if next_url is None:
                # No "next" link on this page: stop here instead of building
                # a request from a missing link.
                return

            # Join the link with the current response URL.
            _next = response.urljoin(next_url)
            print(_next)
            # Re-enter parse() for the next page.
            yield scrapy.Request(url=_next, callback=self.parse)

    补充

    from scrapy import Spider, FormRequest
    # Signature sketch for submitting a form: fill in the target URL, the
    # callback and the form fields (the empty values below are placeholders).
    FormRequest(url='', callback='', formdata='')
  • 相关阅读:
    stat命令的实现-mysate 2075310
    实现mypwd
    2019-2020-1 20175310 20175317 20175320 实验五 通讯协议设计
    2019-2020-1 20175310 20175317 20175320 实验四 外设驱动程序设计
    课上测试
    2019-2020-1 20175310 《信息安全系统设计基础》第九周学习总结
    2019-2020-1 20175310 20175317 20175320 实验三 实时系统
    2019-2020-1 20175310 20175317 20175320 实验二 固件程序设计
    2019-2020-1 20175310 20175317 20175320 实验一 开发环境的熟悉
    2019-2020-1 20175310 《信息安全系统设计基础》第6周学习总结
  • 原文地址:https://www.cnblogs.com/wt7018/p/11729534.html
Copyright © 2020-2023  润新知