• PyQuery用法详解


    PyQuery是强大而又灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,
    那么PyQuery就是你绝佳的选择。
    
    一、初始化方式,有三种,可以传入字符串,传入url,传入文件。
    字符串初始化
    html = '''
    <div>
        <ul>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)#声明pq对象
    print(doc('li'))#用css选择器来实现,如果要选id前面加#,如果选class,前面加.,如果选标签名,什么也不加
    
    URL初始化
    也可以直接传入URL进行初始化,程序会自动请求该URL,并用获取到的html构造PyQuery对象,之后即可用选择器查找需要的内容
    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')#程序会自动请求url
    print(doc('head'))#返回head标签
    
    文件初始化
    from pyquery import PyQuery as pq
    doc = pq(filename='D://demo.html')#直接传入文件名称及路径,程序会自动读取该文件的内容
    print(doc('li'))
    
    二、基本css选择器
    html = '''
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    print(doc('#container .list li'))#先找id为container的元素,再在其内部找class为list的元素,最后在其内部找li标签;空格只表示层级(后代)关系,后者不一定是前者的直接子元素
    
    查找元素
    html = '''
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    '''
    
    子元素
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')#拿到items
    print(type(items))
    print(items)
    lis = items.find('li')#利用find方法,查找items里面的li标签,得到的lis也可以继续调用find方法往下查找,层层剥离
    print(type(lis))
    print(lis)
    
    也可以用.children()查找直接子元素
    lis = items.children()
    print(type(lis))
    print(lis)
    lis = items.children('.active')
    print(lis)
    
    父元素
    html = '''
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    container = items.parent()#.parent()查找对象的父元素
    print(type(container))
    print(container)
    
    祖先节点
    parents = items.parents()#.parents()祖先节点
    parent = items.parents('.wrap')#当然也可以传入css选择器来筛选祖先节点;注意本节的html中没有class为wrap的节点,需要使用下文"兄弟元素"中带有<div class="wrap">外层的html才能得到结果
    print(parent)
    
    兄弟元素
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')#空格表示后代关系(在.list内部查找),没有空格表示同一个元素同时具有item-0和active两个class
    print(li.siblings())#.siblings()兄弟元素,即同级别的元素,不包括自己
    
    三、遍历
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    lis = doc('li').items()#.items()返回一个生成器,需要遍历才能逐个取出其中的元素
    print(type(lis))
    for li in lis:
        print(li)
    
    四、获取信息
    获取属性
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))#.attr('href')获取a标签的href属性值。a标签的href属性用于指定超链接目标的URL:如果用户选择了a标签中的内容,那么浏览器会尝试检索并显示href属性指定的URL所表示的文档,或者执行JavaScript表达式、方法和函数的列表。
    print(a.attr.href)
    结果:
    <a href="link3.html"><span class="bold">third item</span></a>
    link3.html
    link3.html
    
    获取文本
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.text())#.text()获取文本信息
    
    获取html
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    print(li.html())#.html()获取该元素内部的html内容
    
    五、DOM操作
    
    addClass、removeClass
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')#删除
    print(li)
    li.addClass('active')#增加
    print(li)
    
    attr、css
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')#增加一个属性
    print(li)
    li.css('font-size', '14px')#增加一个css
    print(li)
    结果:
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
    
    remove
    html = '''
    <div class="wrap">
        Hello, World
        <p>This is a paragraph.</p>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    wrap.find('p').remove()#找到p标签然后删除
    print(wrap.text())
    结果:
    Hello, World This is a paragraph.
    Hello, World
    
    其他DOM方法
    http://pyquery.readthedocs.io/en/latest/api.html
    
    六、伪类选择器
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
         </div>
     </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)
    结果:
    <li class="item-0">first item</li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    
    更多CSS选择器可以查看 http://www.w3school.com.cn/css/index.asp
    
    官方文档
    http://pyquery.readthedocs.io/
  • 相关阅读:
    强化学习快速入门
    Spark GraphX图计算简单案例【代码实现,源码分析】
    CDA数据分析【第二章:数据收集与导入】
    CDA数据分析实务【第一章:营销决策分析概述】
    CDA数据分析【第一章:数据分析概述】
    BLAS快速入门
    Tachyon内存文件系统快速入门
    Solr新特性【4.x,5.x,6.x,7.x】
    利用Redis keyspace notification(键空间通知)实现过期提醒
    设计模式 行为型
  • 原文地址:https://www.cnblogs.com/themost/p/6903742.html
Copyright © 2020-2023  润新知