• Scrapy解析库之XPath(Selectors)


    #1 //与/
    #2 text
    #3、extract与extract_first:从selector对象中解出内容
    #4、属性:xpath的属性加前缀@
    #4、嵌套查找
    #5、设置默认值
    #4、按照属性查找
    #5、按照属性模糊查找
    #6、正则表达式
    #7、xpath相对路径
    #8、带变量的xpath
    response.selector.css()
    response.selector.xpath()
    可简写为
    response.css()
    response.xpath()
    
    #1 //与/
    response.xpath('//body/a')
    response.css('div a::text')
    
    >>> response.xpath('//body/a') #开头的//代表从整篇文档中寻找,body之后的/代表body的儿子
    []
    >>> response.xpath('//body//a') #开头的//代表从整篇文档中寻找,body之后的//代表body的子子孙孙
    [<Selector xpath='//body//a' data='<a href="image1.html">Name: My image 1 <'>, <Selector xpath='//body//a' data='<a href="image2.html">Name: My image 2 <'>, <Selector xpath='//body//a' data='<a href="image3.html">Name: My image 3 <'>, <Selector xpath='//body//a' data='<a href="image4.html">Name: My image 4 <'>, <Selector xpath='//body//a' data='<a href="image5.html">Name: My image 5 <'>]
    
    #2 text
    >>> response.xpath('//body//a/text()')
    >>> response.css('body a::text')
    
    #3、extract与extract_first:从selector对象中解出内容
    >>> response.xpath('//div/a/text()').extract()
    ['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
    >>> response.css('div a::text').extract()
    ['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
    
    >>> response.xpath('//div/a/text()').extract_first()
    'Name: My image 1 '
    >>> response.css('div a::text').extract_first()
    'Name: My image 1 '
    
    #4、属性:xpath的属性加前缀@
    >>> response.xpath('//div/a/@href').extract_first()
    'image1.html'
    >>> response.css('div a::attr(href)').extract_first()
    'image1.html'
    
    #4、嵌套查找
    >>> response.xpath('//div').css('a').xpath('@href').extract_first()
    'image1.html'
    
    #5、设置默认值
    >>> response.xpath('//div[@id="xxx"]').extract_first(default="not found")
    'not found'
    
    #4、按照属性查找
    response.xpath('//div[@id="images"]/a[@href="image3.html"]/text()').extract()
    response.css('#images a[href="image3.html"]::text').extract()
    
    #5、按照属性模糊查找
    response.xpath('//a[contains(@href,"image")]/@href').extract()
    response.css('a[href*="image"]::attr(href)').extract()
    
    response.xpath('//a[contains(@href,"image")]/img/@src').extract()
    response.css('a[href*="image"] img::attr(src)').extract()
    
    response.xpath('//*[@href="image1.html"]')
    response.css('*[href="image1.html"]')
    
    #6、正则表达式
    response.xpath('//a/text()').re(r'Name: (.*)')
    response.xpath('//a/text()').re_first(r'Name: (.*)')
    
    #7、xpath相对路径
    >>> res=response.xpath('//a[contains(@href,"3")]')[0]
    >>> res.xpath('img')
    [<Selector xpath='img' data='<img src="image3_thumb.jpg">'>]
    >>> res.xpath('./img')
    [<Selector xpath='./img' data='<img src="image3_thumb.jpg">'>]
    >>> res.xpath('.//img')
    [<Selector xpath='.//img' data='<img src="image3_thumb.jpg">'>]
    >>> res.xpath('//img') #这就是从头开始扫描
    [<Selector xpath='//img' data='<img src="image1_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image2_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image3_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image4_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image5_thumb.jpg">'>]
    
    #8、带变量的xpath
    >>> response.xpath('//div[@id=$xxx]/a/text()',xxx='images').extract_first()
    'Name: My image 1 '
    >>> response.xpath('//div[count(a)=$yyy]/@id',yyy=5).extract_first() #求有5个a标签的div的id
    'images'

    https://docs.scrapy.org/en/latest/topics/selectors.html

  • 相关阅读:
    grpc 浅谈
    ticket项目所得
    odoo 安装
    Ubuntu 设置系统环境变量和开机自启动
    supervisor 错误集合
    Python之路--前端知识--HTML
    Python之路--Python基础14--MySQL
    Python之路--Python基础13--异步IO、RedisMemcached缓存、RabbitMQ队列
    Python之路--Python基础12--并发编程之协程
    Python之路--Python基础11--并发编程之线程
  • 原文地址:https://www.cnblogs.com/lujiacheng-Python/p/10162619.html
Copyright © 2020-2023  润新知