• Python3爬虫(七) 解析库的使用之pyquery


     Infi-chu:

    http://www.cnblogs.com/Infi-chu/

    pyquery是一个解析库,可以使用CSS选择器和类似jQuery的语法来解析、操作HTML文档

    1.初始化
    字符串初始化

    from pyquery import PyQuery as pq
    doc = pq(html)	# 传入html文本
    print(doc('li'))
    

    URL初始化

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')	# URL必须带协议头(http://或https://)
    print(doc('title'))
    # 另一种方法
    from pyquery import PyQuery as pq
    import requests
    doc = pq(requests.get('http://www.baidu.com').text)	# 传入响应的文本内容,而不是Response对象
    print(doc('title'))
    

    文件初始化

    from pyquery import PyQuery as pq
    doc = pq(filename='text.html')
    print(doc('li'))
    

    2.基本CSS选择器

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    print(doc('#head .head_wrapper a'))
    print(type(doc('#head .head_wrapper a')))
    

    3.查找节点
    子节点

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    items = doc('.head_wrapper')
    print(type(items))
    print(items)
    lis = items.find('a')	# find()是查找符合条件的所有子孙节点,只查找子节点的可以使用children()
    print(type(lis))
    print(lis)
    

    父节点
    使用parent()方法获取该节点的父节点
    使用parents()方法获取该节点的祖先节点

    兄弟节点
    使用siblings()方法获取兄弟节点

    4.遍历

    from pyquery import PyQuery as pq
    doc = pq(html)
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li,type(li))
    

    5.获取信息
    获取属性
    使用attr()方法获取属性(值)

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    items = doc('.head_wrapper')
    print(items.attr('href'))
    # 也可以写成
    print(items.attr.href)
    
    # 获取所有a的属性
    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    a = doc('a')
    for i in a.items():	# 需要调用items()才能得到PyQuery对象,直接遍历得到的是lxml的Element
        print(i.attr.href)
    

    获取文本
    使用text()方法获取节点内部的纯文本内容(返回字符串,不包含标签)

    from pyquery import PyQuery as pq
    doc = pq(url = 'http://www.baidu.com')
    a = doc('a')
    print(a.text())    # 无需遍历,多个节点的文本会以空格分隔合并成一个字符串
    

    使用html()方法获取节点内部的HTML文本(保留内部的标签)

    from pyquery import PyQuery as pq
    doc = pq(url = 'http://www.baidu.com')
    a = doc('a')
    for i in a.items():	# html()是PyQuery对象的方法,因此需要通过items()遍历
        print(i)
        print(i.html())
    

    6.节点操作
    addClass和removeClass

    from pyquery import PyQuery as pq
    html = '''
    <div class="wrap">
    <div id="container">
    <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0 active"><a href="link5.html">fifth item</a></li>
    </ul>
    </div>
    </div>
    '''
    doc = pq(html)
    li = doc('.item-0.active')	# 同时具有item-0和active两个class,选择器中间不能有空格
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)
    

    attr、text和html

    from pyquery import PyQuery as pq
    html = '''
    <div class="div">
    <p>ASD</p>
    <ul class="list">
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    </ul>
    </div>
    '''
    doc = pq(html)
    li = doc('.item-0.active')	# 同时具有item-0和active两个class,选择器中间不能有空格
    print(li)
    li.attr('name','link')
    print(li)
    li.text('changed item')
    print(li)
    li.html('<span>changed item</span>')
    print(li)
    

    remove()

    from pyquery import PyQuery as pq
    doc = pq(html)
    res = doc('.div')
    print(res.find('ul').remove().text())
    

    7.伪类选择器
    待完善

  • 相关阅读:
    lr 增强窗格中,如何生成调试信息?
    lr 自带的例子,如何进行关联,通过代码的函数进行实现
    lr11 录制脚本时候,无法自动启动ie,查了网上很多方法都未解决?
    loadrunner11 录制脚步不成功,在录制概要出现“No Events were detected”,浮动窗口总是显示“0 Events”,解决办法
    loadrunner11 安装及破解教程来自百度文库
    安装loadrunner11 ,出现如下错误如何解决?
    回收站数据删除了,如何进行恢复?
    网管工作方面——————打印机删除了然后开机重启他依然存在,如何解决
    Windows 不能在 本地计算机 启动 SQL Server 服务 错误代码126
    Sorry, the page you are looking for is currently unavailable. Please try again later. Nginx
  • 原文地址:https://www.cnblogs.com/Infi-chu/p/8986379.html
Copyright © 2020-2023  润新知