• Python 解析 HTML


    XPath

    常用匹配规则:

    符号

    描述

    /

    从当前节点选取直接子节点(位于表达式开头时表示从根节点开始选取)

    //

    从当前节点选取子孙节点(位于表达式开头时匹配文档中任意位置的节点)

    .

    选取当前节点

    ..

    选择当前节点的父节点

    @

    选择属性

    属性获取:

    from lxml import etree
    html = '<div><a class="du" href="http://www.baidu.com">百度</a></div>'
    parser = etree.HTML(html)
    result = parser.xpath('//a[@class="du"]/@href')
    print(result)
    View Code

    文本获取:

    from lxml import etree
    html = '<div><a class="du" href="http://www.baidu.com">百度</a></div>'
    parser = etree.HTML(html)
    result = parser.xpath('//a[@class="du"]/text()')
    print(result)
    View Code

    属性多值匹配(某个属性包含多个值时,用 contains() 匹配其中一个值):

    from lxml import etree
    html = '<div><a class="du baidu" href="http://www.baidu.com">百度</a></div>'
    parser = etree.HTML(html)
    result = parser.xpath('//a[contains(@class,"du")]/text()')
    print(result)
    View Code

    多属性匹配(用 and 连接多个条件,同时匹配多个属性):

    from lxml import etree
    html = '<div><a name="item" class="du baidu" href="http://www.baidu.com">百度</a></div>'
    parser = etree.HTML(html)
    result = parser.xpath('//a[contains(@class,"du") and @name="item"]/text()')
    print(result)
    View Code

    按序选择:

    from lxml import etree
    html = """
            <li>item1</li>
            <li>item2</li>
            <li>item3</li>
            <li>item4</li>
            <li>item5</li>
    """
    parser = etree.HTML(html)
    result = parser.xpath('//li[1]/text()') #匹配第一个
    print(result)
    result = parser.xpath('//li[last()]/text()') #匹配最后一个
    print(result)
    result = parser.xpath('//li[position()<3]/text()') #匹配第一、第二个
    print(result)
    result = parser.xpath('//li[last()-2]/text()') #匹配倒数第三个
    print(result)
    View Code

    更多用法:http://www.w3school.com.cn/xpath/xpath_functions.asp

    Beautiful Soup

    节点选择器:

    from bs4 import BeautifulSoup
    html = """
            <div>
            <li class="d1">item1</li>
            <li class="d2">item2</li>
            <li class="d3">item3</li>
            <li class="d4">item4</li>
            <li class="d5">item5</li>
            </div>
    """
    soup = BeautifulSoup(html,'lxml')
    result = soup.div.children
    print(result)
    for value in result:
        print(value.string)
    View Code

    方法选择器:

    # find_all(name,attrs,recursive,text,**kwargs)
    from bs4 import BeautifulSoup
    html = """
            <div>
            <li class="d1">item1</li>
            <li class="d2">item2</li>
            <li class="d3">item3</li>
            <li class="d4">item4</li>
            <li class="d5">item5</li>
            </div>
    """
    soup = BeautifulSoup(html,'lxml')
    result = soup.find_all(name="div")
    for value in result:
        result = value.find_all(name="li",class_="d3")[0].get_text() # 等价于string
        print(result)
    View Code

    CSS 选择器:

    from bs4 import BeautifulSoup
    html = """
            <div>
            <li class="d1">item1</li>
            <li class="d2">item2</li>
            <li class="d3">item3</li>
            <li class="d4" name="d">item4</li>
            <li class="d5">item5</li>
            </div>
    """
    soup = BeautifulSoup(html,'lxml')
    result = soup.select('div li[name="d"]')
    for value in result:
        print(type(value))
        print(value.get_text())
    View Code

    PyQuery

    初始化

    字符串初始化:

    from pyquery import PyQuery as pq
    html = "<a href='http://www.baidu.com'>百度一下</a>"
    parser = pq(html)
    View Code

    URL初始化:

    from pyquery import PyQuery as pq
    parser = pq(url="http://www.baidu.com")
    print(parser)
    View Code

    文件初始化:

    from pyquery import PyQuery as pq
    parser = pq(filename="demo.html")
    print(parser)
    View Code

    查找节点

    CSS 选择器:

    html = """      <div class="qrcode-text" id="1">
                    我是div标签的文本
                    <p class="title">我是标题<a href="http://www.baidu.com">百度一下</a></p>
                    <p class="content">我是内容</p>
                    </div>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser(".qrcode-text .title a")
    print(result)
    View Code

    children()    查找子节点

    find()        查找子孙节点

    parent()      查找父节点

    parents()     查找祖先节点

    siblings()    查找兄弟节点

    html = """      <body>
                    <div class="qrcode-text" id="1">
                    我是div标签的文本
                    <p class="title">我是标题<a class="du" href="http://www.baidu.com">百度一下</a></p>
                    <p class="content">我是内容
                    <span class="first">第一行</span>
                    </p>
                    </div>
                    </body>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser(".content").children()
    print(result)
    result = parser.find("span")
    print(result)
    result = parser("span").parent()
    print(result)
    result = parser("span").parents("#1")
    print(result)
    result = parser(".title").siblings()
    print(result)
    用法

    获取信息

    获取属性  attr()

    内部文本  text()

    html文本  html()

    html = """      <body>
                    <div class="item_1"><span>1.</span>第一行</div>
                    <div class="item_2"><span>2.</span>第二行</div>
                    <div class="item_3"><span>3.</span>第三行</div>
                    </body>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser("div")
    for value in result.items():
        print(value.attr("class"))
        print(value.text())
        print(value.html())
    用法

    节点操作

    对节点进行动态修改。

    removeClass()

    addClass()

    html = """      <body>
                    <div class="item_1"><span>1.</span>第一行</div>
                    <div class="item_2"><span>2.</span>第二行</div>
                    <div class="item_3"><span>3.</span>第三行</div>
                    </body>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser("div")
    for n,value in enumerate(result.items(),1):
        value.removeClass(value.attr("class"))
        value.addClass(str(n))
        print(value)
    View Code

    attr()

    text()

    html = """      <body>
                    <div class="item_1"><span>1.</span>第一行</div>
                    <div class="item_2"><span>2.</span>第二行</div>
                    <div class="item_3"><span>3.</span>第三行</div>
                    </body>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser("div")
    for n,value in enumerate(result.items(),1):
        value.attr(id=str(n))
        value.text('Hello World')
        print(value)
    View Code

    remove()

    html = """      <body>
                    Hello World!
                    <div class="item_1"><span>1.</span>第一行</div>
                    <div class="item_2"><span>2.</span>第二行</div>
                    <div class="item_3"><span>3.</span>第三行</div>
                    </body>
            """
    
    from pyquery import PyQuery as pq
    parser = pq(html)
    result = parser("body")
    value = result.remove("div")
    print(value.text())
    View Code

    更多用法:http://pyquery.readthedocs.io/en/latest/api.html

  • 相关阅读:
    java 新建文本并写入
    批处理 获取相同进程的所有 pid
    io.netty.handler.codec.DecoderException: javax.net.ssl.SSLHandshakeException: error:
    Linux下“/”和“~”的区别
    Error creating bean with name 'consoleConfig'
    2019 蓝桥杯省赛 B 组模拟赛 结果填空:马的管辖
    # Codeforces Round #663 (Div. 2)
    Codeforces Round #645 (Div. 2) A~D
    迷宫2 NC15196
    Codeforces Round #643 (Div. 2)
  • 原文地址:https://www.cnblogs.com/py-peng/p/12014687.html
Copyright © 2020-2023  润新知