• 三、Scrapy中选择器用法


    官方示例源码
    <html>
     <head>
      <base href='http://example.com/' />
      <title>Example website</title>
     </head>
     <body>
      <div id='images'>
       <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
       <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
       <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
       <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
       <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
      </div>
     </body>
    </html>

    # scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html

    >>> response.xpath('//title/text()')
    [<Selector (text) xpath=//title/text()>]

    >>> response.css('title::text')
    [<Selector (text) xpath=//title/text()>]

    >>> response.css('title::text').extract()
    [u'Example website']

    >>> response.xpath('//title/text()').extract()
    [u'Example website']

    >>> response.xpath('//base/@href').extract()
    [u'http://example.com/']

    >>> response.css('base::attr(href)').extract()
    [u'http://example.com/']


    >>> response.xpath('//a[contains(@href, "image")]/@href').extract()
    [u'image1.html',
     u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']

    >>> response.css('a[href*=image]::attr(href)').extract()
    [u'image1.html',
     u'image2.html',
     u'image3.html',
     u'image4.html',
     u'image5.html']

    >>> response.xpath('//a/@href').extract()
    ['image1.html',
     'image2.html',
     'image3.html',
     'image4.html',
     'image5.html']

    >>> response.css('a::attr(href)').extract()
    ['image1.html',
     'image2.html',
     'image3.html',
     'image4.html',
     'image5.html']

    >>> response.xpath('//div[@id="images"]').css('img::attr(src)').extract()
    ['image1_thumb.jpg',
     'image2_thumb.jpg',
     'image3_thumb.jpg',
     'image4_thumb.jpg',
     'image5_thumb.jpg']
     
    >>> response.xpath('//div[@id="images"]').css('img::attr(src)').extract_first()
    'image1_thumb.jpg'

    # 默认值,查找不存在的元素,使用默认值
    >>> response.xpath('//div[@id="images"]').css('img::attr(data-src)').extract_first(default='')
    ''

    >>> response.xpath('//a[contains(@href, "image")]/img/@src').extract()
    [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
     u'image3_thumb.jpg',
     u'image4_thumb.jpg',
     u'image5_thumb.jpg']

    >>> response.css('a[href*=image] img::attr(src)').extract()
    [u'image1_thumb.jpg',
     u'image2_thumb.jpg',
     u'image3_thumb.jpg',
     u'image4_thumb.jpg',
     u'image5_thumb.jpg']
     
    >>> links = response.xpath('//a[contains(@href, "image")]')
    >>> links.extract()
    [u'<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
     u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
     u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
     u'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
     u'<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']

    >>> for index, link in enumerate(links):
            args = (index, link.xpath('@href').extract(), link.xpath('img/@src').extract())
            print 'Link number %d points to url %s and image %s' % args

    Link number 0 points to url [u'image1.html'] and image [u'image1_thumb.jpg']
    Link number 1 points to url [u'image2.html'] and image [u'image2_thumb.jpg']
    Link number 2 points to url [u'image3.html'] and image [u'image3_thumb.jpg']
    Link number 3 points to url [u'image4.html'] and image [u'image4_thumb.jpg']
    Link number 4 points to url [u'image5.html'] and image [u'image5_thumb.jpg']


    >>> response.xpath('//a/text()').extract()
    ['Name: My image 1 ',
     'Name: My image 2 ',
     'Name: My image 3 ',
     'Name: My image 4 ',
     'Name: My image 5 ']
     
    >>> response.css('a::text').extract()
    ['Name: My image 1 ',
     'Name: My image 2 ',
     'Name: My image 3 ',
     'Name: My image 4 ',
     'Name: My image 5 ']
     
     
    >>> response.xpath('//a[contains(@href, "image")]/@href').extract()
    ['image1.html',
     'image2.html',
     'image3.html',
     'image4.html',
     'image5.html']
     
    >>> response.css('a[href*=image]::attr(href)').extract()
    ['image1.html',
     'image2.html',
     'image3.html',
     'image4.html',
     'image5.html']

    # 使用正则    
    >>> response.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
    [u'My image 1',
     u'My image 2',
     u'My image 3',
     u'My image 4',
     u'My image 5']

    >>> response.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
    'My image 1'

    >>> response.xpath('//a/text()').re(r'Name:\s*(.*)')
    ['My image 1',
     'My image 2',
     'My image 3',
     'My image 4',
     'My image 5']

    >>> response.xpath('//a/text()').re_first(r'Name:\s*(.*)')
    'My image 1'

    >>> response.css('a::text').re(r'Name:\s*(.*)')
    ['My image 1',
     'My image 2',
     'My image 3',
     'My image 4',
     'My image 5']

    #使用strip()再次处理字符串中的空格,注意跟前面的相比较
    re_first('Name:(.*)').strip()
    re(r'Name:\s*(.*)')
    >>> response.css('a::text').re_first('Name:(.*)').strip()
    'My image 1'

    # 获取所有的a标签超链接
    >>> response.css('a').extract()
    ['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
    '<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
    '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
    '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
    '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']

    >>> response.css('a').extract_first()
    '<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>'

  • 相关阅读:
    intellij IDE 破解 简单 License server 法
    Unsupported major.minor version 52.0错误和 jdbc odbc
    MyEclipse优化攻略搜集
    感兴趣的WebGL ,来自微博的一个全景星空图~
    ie/chorme 清除缓存 刷新js,css
    PLSQL PL/SQL Developer Oracle 使用技巧 常用设置 卡顿问题 病毒防范( 附带:配置文件)
    MyEclipse eclipse console edit packageExplorer 颜色设置、个性化、常用设置
    java Map 四种遍历方法
    Eclipse MyEclipse 反编译.class文件 myeclipse source not found
    打印菱形
  • 原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/10292654.html
Copyright © 2020-2023  润新知