1.了解
pyquery库是jQuery的Python实现，能够以jQuery的语法来操作和解析HTML文档，易用性和解析速度都很好。
2.安装
pip install pyquery
3.引用
from pyquery import PyQuery as pq
4.初始化
1)字符串
# Build a PyQuery document from an HTML string.
# `pq` is the alias imported with `from pyquery import PyQuery as pq`.
html = """
<html lang="en">
    <head>
        simple good
        <title>PyQuery</title>
    </head>
    <body>
        <ul id="container" class="list">
            <li class="object-1">Python</li>
            <li class="object-2">amazing</li>
            <li class="object-3">wonderful</li>
        </ul>
    </body>
</html>
"""
doc = pq(html)
2)url
# Build a PyQuery document by fetching a URL (this performs a network request).
response = pq(url='https://www.baidu.com')
# Select the <head> element of the fetched page and print it.
print(response("head"))
3)文件
# The `filename` argument is the path to an HTML file on disk.
test_html = pq(filename='test.html')
# Show the type of the object returned by pq(), then the parsed document.
print(type(test_html))
print(test_html)
5.使用
# -*- coding: UTF-8 -*-
from pyquery import PyQuery as pq

html = """
<html lang="en">
    <head>
        simple good
        <title>PyQuery</title>
    </head>
    <body>
        <ul id="container" class="list">
            <li class="object-1">Python</li>
            <li class="object-2">amazing</li>
            <li class="object-3">wonderful</li>
        </ul>
    </body>
</html>
"""
doc = pq(html)

# --- Common CSS selectors ---
# Print the whole document.
print(doc)
# Print the element whose id is "container".
print(doc('#container'))
# Print the element whose class is "object-1".
print(doc('.object-1'))
# Print the <body> element.
print(doc('body'))
# Combined selectors: #container as a descendant of <html>.
print(doc('html #container'))
# Descendant selector: <li> inside a .list element that is itself inside
# #container. In this document the <ul> carries BOTH the id and the class,
# so nothing matches and an empty line is printed. To select the <li> items
# of that <ul>, write '#container.list li' (no space) or '.list li'.
print(doc('#container .list li'))
print('-----------------------------------')

# --- Pseudo-class selectors ---
# Print the <li> that is the second child of its parent.
print(doc('li:nth-child(2)'))
# Print the <li> that is the first child of its parent.
print(doc('li:first-child'))
# Print the <li> that is the last child of its parent.
print(doc('li:last-child'))
# Print the <li> whose text contains "Python".
print(doc("li:contains('Python')"))
print('-----------------------------------')

# --- Traversal with find() ---
# Find the element whose id is "container".
print(doc.find('#container'))
# Find all <li> elements.
print(doc.find('li'))
# Children of the element whose id is "container".
print(doc.find('#container').children())
# Parent of the element whose class is "object-2".
print(doc.find('.object-2').parent())
# Siblings of the element whose class is "object-2".
print(doc.find('.object-2').siblings())
print('-----------------------------------')

# --- Attributes and text ---
# Read the "class" attribute of the matched element.
print(doc.find('.object-2').attr('class'))
# Read the text inside the matched element.
print(doc.find('.object-1').text())
# Remove the <li> elements from #container, then print the remaining text
# (empty here). remove() modifies the parsed tree in place, so this is kept
# as the last query on `doc`.
print(doc.find('#container').remove('li').text())
print('----------------------------------')
输出如下:
<html lang="en"> <head> simple good <title>PyQuery</title> </head> <body> <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> </body> </html> <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> <li class="object-1">Python</li> <body> <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> </body> <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> ----------------------------------- <li class="object-2">amazing</li> <li class="object-1">Python</li> <li class="object-3">wonderful</li> <li class="object-1">Python</li> ----------------------------------- <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> <ul id="container" class="list"> <li class="object-1">Python</li> <li class="object-2">amazing</li> <li class="object-3">wonderful</li> </ul> <li class="object-1">Python</li> <li class="object-3">wonderful</li> ----------------------------------- object-2 Python ----------------------------------