• Xpath解析库的使用


    ### Common XPath rules
    ## nodename     select the nodes with this name
    ## /            select direct children of the current node
    ## //           select descendants of the current node, at any depth
    ## .            select the current node
    ## ..           select the parent of the current node
    ## @            select attributes
    
    ### Sample markup used by the examples.
    ### The third <a> has no closing tag, which lets the examples show how
    ### lxml repairs broken HTML. The <i> element holds the private-use icon
    ### glyph U+E62E, written as an escape so it survives copy/paste.
    text = '''
    <ul id="dmr" name="liebiao">
    <li data-closeper="" aria-label="查看更多" role="menuitem" aria-haspopup="true" data-groupid="104" class="J_Cat a-all">
    <a data-cid="1" data-dataid="222878" >家电</a>
    <a data-cid="1" data-dataid="222908" >数码</a>
    <a data-cid="1" data-dataid="222879" >手机
    
    <i aria-hidden="true" class="tb-ifont service-arrow">\ue62e</i>
    </li>
    </ul>
    '''

    1. etree示例引入

    ## Introducing etree
    from lxml import etree
    
    # Build an element tree from the HTML string so it can be queried with XPath
    html = etree.HTML(text)
    # Build a tree by parsing the file at ./text with an HTML parser
    html2 = etree.parse('./text', etree.HTMLParser())
    # Serialize both trees to bytes; the output shows the repaired HTML,
    # e.g. the </a> tag that is missing from the sample markup
    result = etree.tostring(html)
    result2 = etree.tostring(html2)
    # As the output below shows, etree.HTML() gives an _Element while
    # etree.parse() gives an _ElementTree
    print(html, html2)
    print(type(html), type(html2))
    '''
    输出内容:
    <Element html at 0x2b47848> <lxml.etree._ElementTree object at 0x0000000002B47788>
    <class 'lxml.etree._Element'> <class 'lxml.etree._ElementTree'>
    '''
    # Print the repaired HTML
    print(result.decode('utf-8'))
    print(result2.decode('utf-8'))

    2. 提取页面下的所有节点

    ## Selecting every node in the page
    from lxml import etree
    
    html = etree.HTML(text)
    # '//*' matches every element in the document, including the <html> and
    # <body> wrappers that appear in the output below
    result = html.xpath('//*')
    print(len(result))
    print(result)
    
    '''
    输出结果:
    8
    [<Element html at 0x2b539c8>, <Element body at 0x2b53948>, <Element ul at 0x2b53a08>, <Element li at 0x2b53a48>, <Element a at 0x2b53a88>, <Element a at 0x2b53b08>, <Element a at 0x2b53b48>, <Element i at 0x2b53b88>]
    '''

    3. 提取子节点

    ## Selecting child nodes
    from lxml import etree
    
    html = etree.parse('./text', etree.HTMLParser())
    # '/' selects the <a> elements that are direct children of <li>
    result = html.xpath('//li/a')
    # '//' selects <a> elements anywhere below <ul> (children and deeper descendants)
    result2 = html.xpath('//ul//a')
    print(len(result), len(result2))
    print(result, result2)
    
    '''
    运行结果:
    3 3
    [<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>] [<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>]
    '''

    4. 提取父节点

    ## Selecting the parent node
    from lxml import etree
    
    html = etree.HTML(text)
    # Find the <li> whose role attribute is "menuitem", step up to its parent
    # with '..', and read that parent's name attribute
    result = html.xpath('//li[@role="menuitem"]/../@name')
    print(result)
    
    '''
    输出结果:
    ['liebiao']
    '''

    5. 属性匹配

    ## Matching on an attribute
    html = etree.HTML(text)
    # Match the <a> element whose data-dataid attribute equals 222878
    result = html.xpath('//a[@data-dataid="222878"]')
    print(result)
    
    '''
    输出内容:
    [<Element a at 0x2973c48>]
    '''

    6. 提取文本内容

    ## Extracting text content
    html = etree.HTML(text)
    # text() returns the text of the <a> element whose data-dataid is 222878
    result = html.xpath('//a[@data-dataid="222878"]/text()')
    print(result)
    
    '''
    输出内容:
    ['家电']
    '''

    7. 属性值获取

    ## Reading attribute values
    from lxml import etree
    
    html = etree.HTML(text)
    # Ending the path with '@aria-label' returns the attribute's value, not the element
    result = html.xpath('//li/@aria-label')
    print(result)
    
    '''
    输出内容:
    ['查看更多']
    '''

    8. 属性多值匹配

    ## Matching an attribute that holds several values
    from lxml import etree
    
    html = etree.HTML(text)
    # '=' compares against the whole attribute string, so "J_Cat" alone does
    # not match class="J_Cat a-all" and the result is empty
    result = html.xpath('//li[@class="J_Cat"]')
    # The complete attribute value does match
    result2 = html.xpath('//li[@class="J_Cat a-all"]//text()')
    # contains() matches when the attribute value includes the given substring
    result3 = html.xpath('//li[contains(@class, "J_Cat")]//text()')
    print(result, result2, result3)
    
    '''
    输出结果:
    [] ['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n'] ['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n']
    '''

    9. 多属性匹配

    ## Matching on several attributes
    ## XPath operators
    # or        logical or
    # and       logical and
    # mod       remainder of a division
    # |         union of two node sets
    # +         addition
    # -         subtraction
    # *         multiplication
    # =         equal to
    # !=        not equal to
    # <         less than
    # <=        less than or equal to
    # >         greater than
    # >=        greater than or equal to
    from lxml import etree
    
    html = etree.HTML(text)
    # Both conditions must hold: class contains "J_Cat" and role equals "menuitem"
    result = html.xpath('//li[contains(@class, "J_Cat") and @role="menuitem"]/a/text()')
    print(result)
    
    '''
    输出结果:
    ['家电', '数码', '手机\n\n', '\n']
    '''

    10. 按序选择,通过索引的方式进行选择

    ## Selecting by position (index)
    from lxml import etree
    
    html = etree.HTML(text)
    # Text of the first <a> under <li> (positions start at 1)
    print(html.xpath('//li/a[1]/text()'))
    # Text of the last <a> under <li>
    print(html.xpath('//li/a[last()]/text()'))
    # Text of the <a> elements under <li> whose position is less than 3
    print(html.xpath('//li/a[position()<3]/text()'))
    # Text of the second-to-last <a> under <li>
    print(html.xpath('//li/a[last()-1]/text()'))
    
    '''
    输出结果:
    ['家电']
    ['手机\n\n', '\n']
    ['家电', '数码']
    ['数码']
    '''

    11. 节点轴选择

    ## Selecting along node axes
    # ancestor axis: all ancestors of the node
    # attribute axis: all attribute values of the node
    # child axis: all direct children of the node
    # descendant axis: all descendants of the node
    # following axis: all nodes after the node in document order, excluding its descendants
    # following-sibling axis: the siblings that come after the node
    from lxml import etree
    
    html = etree.HTML(text)
    print(html.xpath('//li/a[1]/ancestor::*'))
    print(html.xpath('//li/a[1]/ancestor::ul'))
    print(html.xpath('//li/a[1]/attribute::*'))
    print(html.xpath('//li[1]/child::*'))
    print(html.xpath('//ul[1]/descendant::a'))
    print(html.xpath('//a[1]/following::*'))
    print(html.xpath('//a[1]/following-sibling::*'))
    
    '''
    输出结果:
    [<Element html at 0x2b53b88>, <Element body at 0x2b53b48>, <Element ul at 0x2b53d88>, <Element li at 0x2b53bc8>]
    [<Element ul at 0x2b53b48>]
    ['1', '222878']
    [<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
    [<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
    [<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element i at 0x2b53bc8>]
    [<Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
    '''

    12. 用Xpath解析爬取豆瓣top250

    ### 用Xpath解析爬取豆瓣top250
    
    from lxml import etree
    import requests, json
    
    def get_page(url, timeout=10):
        '''
        Fetch the HTML source of a page.

        :param url: address of the page to fetch
        :param timeout: seconds to wait for the server before giving up
        :return: the response body as text
        :raises SystemExit: if the server answers with a status other than 200
        :raises requests.exceptions.RequestException: on a network failure or timeout
        '''

        # A browser-like User-Agent is sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            # Same effect as the exit() builtin, without relying on the site module.
            raise SystemExit('get page fail...')
        print('get page success...')
        return response.text
    
    def parse_page(text):
        '''
        Parse one page of the Douban Top 250 movie list.

        :param text: HTML source of the page
        :return: generator yielding one dict per movie with the keys img,
                 details, name, director, actor, time, nation, type, score
                 and introduction
        :raises IndexError: if an entry lacks one of the expected fields
        '''

        html = etree.HTML(text)
        items = html.xpath('//ol[@class="grid_view"]/li/div[@class="item"]')
        for item in items:
            # Evaluate the info paragraph once instead of once per field.
            # Presumably text node [0] is the director/cast line and [1] is the
            # year / country / genre line -- verify against the live page.
            info = item.xpath('.//div[@class="bd"]/p[@class=""]/text()')
            people_tokens = info[0].split()
            facts_tokens = info[1].split()
            quote = item.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
            yield {
                'img': item.xpath('.//div[@class="pic"]//img/@src')[0],
                'details': item.xpath('.//div[@class="hd"]/a/@href')[0],
                'name': item.xpath('.//div[@class="hd"]//span[1]/text()')[0],
                'director': people_tokens[1],
                # Some entries have too few tokens to contain an actor name.
                'actor': people_tokens[5] if len(people_tokens) > 5 else 'None',
                'time': facts_tokens[0],
                'nation': facts_tokens[2],
                'type': facts_tokens[4:],
                'score': item.xpath('.//div[@class="bd"]/div/span[@class="rating_num"]/text()')[0],
                # Not every movie has a quote; fall back to the string 'None'.
                'introduction': quote if quote else 'None',
            }
    
    def save_to_file(data):
        '''
        Append one record to the output file as a single line of JSON.

        :param data: JSON-serialisable object to store
        :return: None
        '''
        with open('豆瓣电影top250.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False keeps Chinese text readable in the file.
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
    
    def main(start):
        '''
        Scrape one list page, printing and storing every movie found on it.

        :param start: offset sent as the ``start`` query parameter
        '''
        base_url = 'https://movie.douban.com/top250?start='
        page_source = get_page(''.join([base_url, str(start)]))
        for movie in parse_page(page_source):
            print(movie)
            save_to_file(movie)
    
    
    
    if __name__ == '__main__':
        # Ten pages of 25 entries each: offsets 0, 25, ..., 225.
        for start in range(0, 250, 25):
            main(start)
    View Code
  • 相关阅读:
    【二分图最大独立集/最小割】P3355 骑士共存问题
    【费用流+正负费用处理】UVA11613 Acme Corporation
    【费用流】P2517 [HAOI2010]订货
    【最小割】P1361 小M的作物
    【最小割】[SHOI2007]善意的投票
    【最小割+割点转换】[USACO5.4]奶牛的电信Telecowmunication
    数据结构学习笔记——ST表
    图论学习笔记——LCA
    基于CNN的手写数字识别程序
    [Atcoder]M-Solutions 题解
  • 原文地址:https://www.cnblogs.com/Caiyundo/p/12503931.html
Copyright © 2020-2023  润新知