• pyquery解析库


    语法和 jQuery 几乎一致

    安装

    conda install pyquery

    一、初始化

    标准用法

    from pyquery import PyQuery as pq
    import requests
    
    #
    r = requests.get(url='http://www.baidu.com')
    
    html_doc = pq(r.text)
    print(html_doc)
    print(html_doc('#u1 a'))

    1、字符串初始化(最常用)

    from pyquery import PyQuery as pq
    
    html_doc = '''<div>
        <ul id = 'haha'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    print(doc)
    print(type(doc))

    2、url初始化

    from pyquery import PyQuery as pq
    
    #
    
    
    html_doc = pq(url='http://www.baidu.com')
    print(html_doc)
    print(html_doc('#u1 a'))

    注意:一般先通过requests模块或urllib获取网页的html,再交给解析模块去解析

    3、文件初始化

    from pyquery import PyQuery as pq
    
    #
    
    
    doc = pq(filename='test.html')
    print(doc)

    二、基本CSS选择器

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    li_list = doc('div #con li')
    print(li_list)
    
    # id      #
    # class  .
    # tag    tagname

    三、查找节点

    1、子节点

    find() 最常用的方法

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    div = doc('div')
    li_list = div.find('li.active')
    print(li_list)

    children() 查找所有子节点,children('') 查找指定的子节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    div = doc('div')
    # 查找所有子节点
    selector = div.children()
    print(selector)
    # 查找含有item-0类的节点
    li_item_0 = div.children('#con .item-0')
    print(li_item_0)

    2、父节点

    parent() 父节点,parents() 祖先节点,parents('') 匹配指定选择器的祖先节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 所有li节点
    li_list = doc('#con li')
    # li节点的父节点
    ul = li_list.parent()
    # print(ul)
    # 祖辈节点(包含父节点)
    divs = li_list.parents()
    # print(divs)
    # 含有id="con" 的祖节点
    div = li_list.parents('#con')
    print(div)

    3、兄弟节点

    siblings() 所有兄弟姊妹节点,siblings('') 含有指定css选择器的兄弟节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 含有class="item-0 active"的节点
    li = doc('#con li.item-0.active')
    # 查找所有兄弟节点(除了自己本身)
    # print(li.siblings())
    # 查找含有指定css选择器的节点
    print(li.siblings('.item-1.active'))

    四、遍历

    1、单个节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 单个节点
    li = doc('#con li.item-0.active')
    print(li)

    2、多个节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 多个节点,使用items()->生成器
    li_lst = doc('#con li')
    for li in li_lst.items():
        print(li, end='')

    五、获取信息

    1、属性

    获取 设置

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 获取a标签的href属性
    a = doc('li.item-0.active a')
    print(a.attr('href'))
    # 设置属性
    a.attr('href', 'oj8k')
    print(a.attr('href'))

    2、文本

    text() html()

    获取 设置

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 获取text()
    li = doc('li.item-0.active')
    print(li.text())
    # 获取html()
    print(li.html())
    
    # 设置text()
    li.text('Hello World')
    print(li.text())
    # 设置html()
    li.html('<a>打我</a>')
    print(li.html())

    注意:与jQuery的区别:在pyquery中,html() 获取的是节点内部的html,不包含节点本身

    六、操作DOM节点

    1、add_class()和remove_class()

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 获取text()
    li = doc('li.item-0.active')
    print(li.text())
    # 获取html()
    print(li.html())
    
    # 设置text()
    li.text('Hello World')
    print(li.text())
    # 设置html()
    li.html('<a>打我</a>')
    print(li.html())

    2、remove()

    作用:删除节点

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 获取li节点
    li = doc('li.item-0.active')
    print(li)
    # 找到a节点,并删除a节点
    a = li('a')
    a.remove()
    print(li)

    七、伪类选择器

    from pyquery import PyQuery as pq
    
    #
    
    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''
    
    doc = pq(html_doc)
    # 获取li节点
    li = doc('li.item-0.active')
    print(li)
    # 找到a节点,并删除a节点
    a = li('a')
    a.remove()
    print(li)
  • 相关阅读:
    How to run a batch file each time the computer loads Windows
    go.mod file not found in current directory or any parent directory; see 'go help modules'
    xshell 所选的用户密钥未在远程主机上注册;无法加载密钥
    群起Hadoop的一个错误
    ssh: connect to host hadoop102 port 22: No route to host
    VMware下centos7配置静态ip并解决ping不通百度的问题
    虚拟机CentOS 7 网络连接显示"以太网(ens33,被拔出)"
    Rust-线程:使用消息传递在线程间传送数据
    Rust-线程,使用线程同时运行代码
    Rust-智能指针:RefCell<T>和内部可变性模式
  • 原文地址:https://www.cnblogs.com/wt7018/p/11904944.html
Copyright © 2020-2023  润新知