0. 安装及导入
- 安装
pip install beautifulsoup4
- 导入
from bs4 import BeautifulSoup
- 如果选择
lxml
解析器的话还需要安装pip install lxml
,这个解析器的优点是效率更高
1. 访问结构化数据
假设我们有下面这段 HTML 代码,
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
那么用 BeautifulSoup 解析后就可以得到结构化的输出。
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
然后,我们可以直接通过 HTML 中标签的名字来对其进行访问
print(soup.title) # 直接访问某一个标签 <title>The Dormouse's story</title>
print(soup.title.name) # 标签的名字 title
print(soup.title.string) # 标签的文字内容 The Dormouse's story
print(soup.title.text) # 标签的文字内容 The Dormouse's story
print(soup.title.parent.name) # 标签的父标签名字 head
.string
和 .text
的不同:
- 如果一个标签下面有多个子节点(例如文字和子标签混排),
.string
不知道返回哪一个所以返回 None,而.text
则返回该标签及其所有子标签的文字。如果标签下面只有一个子标签,.string 会继续取该子标签的文字。我们以上面例子的第二个 <p> 标签(class="story")为例,
print(soup.find(class_='story').string) # None
print(soup.find(class_='story').text)
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
- 如果一个标签下面有换行标签 <br />
(它本身也是一个子标签,会把文字分成多个子节点),那么.string
也会返回 None,而.text
可以正确返回该标签的文字。我们把原始 HTML 文档中的其中一句修改为<p class="title"><b>The Dormouse's <br /> story</b></p>
,
print(soup.find(class_='title').string) # None
print(soup.find(class_='title').text) # The Dormouse's story
如果有多个同名标签则返回第一个,并且还支持访问标签的属性。
print(soup.p) # <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class']) # ['title']
print(soup.a['href']) # http://example.com/elsie
print(soup.a['id']) # link1
print(soup.a.attrs) # {'class': ['sister'], 'id': 'link1', 'href': 'http://example.com/elsie'}
如果一个标签下面还有子标签,也可以对它们进行遍历访问。
print(soup.p.b) # <b>The Dormouse's story</b>
print(soup.p.b.text) # The Dormouse's story
2. find_all() 方法
find_all()
方法可以根据字符串、标签属性、方法等来进行搜索
- 输入字符串则会查找到和字符串相等的标签
print(soup.find_all('a')) # 查找所有的标签 <a>,以列表方式返回
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(['head', 'b'])) # 查找标签 <head> 和 <b>
# [<head><title>The Dormouse's story</title></head>, <b>The Dormouse's story</b>]
- 输入一个方法则会找到返回值为 True 的标签
def has_class_and_id(tag):
    """Return True when *tag* has both a ``class`` and an ``id`` attribute.

    Meant to be passed to ``find_all()`` as a filter; tags for which it
    returns a true value are kept in the result.
    """
    return tag.has_attr('class') and tag.has_attr('id')
# 查找同时具有 class 和 id 属性的标签
print(soup.find_all(has_class_and_id))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
- 输入标签属性则会返回满足条件的标签
print(soup.find(id='link2')) # 查找 id='link2' 的标签
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
def not_link2(id):
    """Attribute filter: truthy for any non-empty id other than ``'link2'``.

    The parameter receives the tag's ``id`` value, or ``None`` when the tag
    has no ``id`` attribute; in that case the falsy value itself is returned
    so the tag is rejected.
    """
    # NOTE: the parameter name shadows the builtin ``id``; it is kept unchanged
    # so the function's signature stays the same.
    return id and id != 'link2'
# 查找具有 id 属性且其值不等于 'link2' 的标签
print(soup.find_all(id=not_link2))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
- 输入正则表达式则会根据 search() 方法来匹配,注意!!!
re.compile("t").search()
不是从头开始匹配的,只要标签名中任意位置包含 t 即可匹配
import re  # required by re.compile() below

for tag in soup.find_all(re.compile("t")):  # find tags whose name contains "t"
    print(tag.name)
# html
# title
- 其它常见用法
print(soup.find_all("p", "title")) # 查找 class 为 title 的 <p> 标签
# [<p class="title"><b>The Dormouse's story</b></p>]
# 下面四个的输出结果相同
print(soup.find_all(id=True)) # 查找有 id 属性的标签
print(soup.find_all("a", class_="sister")) # 查找有属性 class="sister" 的 <a> 标签
def has_six_characters(css_class):
    """Return True when *css_class* is not None and is exactly six characters long.

    Used as a ``class_`` filter for ``find_all()``; the ``None`` check guards
    the ``len()`` call.
    """
    return css_class is not None and len(css_class) == 6
# 查找属性 class 有六个字符的标签
print(soup.find_all(class_=has_six_characters))
print(soup("a")) # 直接调用
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
3. find() 方法
find()
方法与find_all()
方法类似,但是只返回查找到的第一个结果
print(soup.find("a", class_="sister"))
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find("a", class_="sister").get('href')) # 获取标签的属性值
# http://example.com/elsie
4. CSS 选择器
通过在 Tag 或 BeautifulSoup 对象的 .select()
方法中传入字符串参数,即可使用CSS 选择器的语法找到某一个标签
print(soup.select('title')) # 找到 <title> 标签
# [<title>The Dormouse's story</title>]
print(soup.select("p:nth-of-type(1)")) # 找到第一个 <p> 标签
# [<p class="title"><b>The Dormouse's story</b></p>]
print(soup.select("html body p b")) # 通过标签逐层查找
# [<b>The Dormouse's story</b>]
print(soup.select('p > b')) # 查找直接子标签
# [<b>The Dormouse's story</b>]
print(soup.select('body > a'))
# []
print(soup.select('p > #link1'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# 还可以通过类名进行查找
print(soup.select('.title'))
print(soup.select("[class~=title]"))
# [<p class="title"><b>The Dormouse's story</b></p>]
print(soup.select_one(".sister")) # 返回查找到的第一个
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 还可以通过标签的属性进行查找
print(soup.select("#link1"))
print(soup.select('a[id="link1"]'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.select('a[id]'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
获取更多精彩,请关注「seniusen」!