1.1 BeautifulSoup介绍
1、BeautifulSoup作用
1、BeautifulSoup是一个模块,该模块用于接收一个HTML或XML字符串,然后将其进行格式化
2、之后便可以使用它提供的方法进行快速查找指定元素,从而使得在HTML或XML中查找指定元素变得简单
2、安装
pip3 install beautifulsoup4
pip install lxml #lxml是BeautifulSoup可以调用的第三方解析器,解析性能比内置的html.parser更好(直接用pip即可安装成功)
3、lxml与html.parser比较
1. 两者都是把文本转成对象的方法,lxml是第三方库,但是性能好(生产用这个),html.parser 是python内置模块无需安装
2. soup = BeautifulSoup(response.text,features='lxml') #lxml是第三方库,但是性能好(生产用这个)
3. soup = BeautifulSoup(response.text,features='html.parser') # html.parser 是python内置模块无需安装
4、lxml结合BeautifulSoup举例
from bs4 import BeautifulSoup

# Sample markup used by the three lookups below.
html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a</a> <a class="c1" id="link2" name="ha">i am a</a> </body> </html> """

# Parse the markup with the lxml parser.
soup = BeautifulSoup(html_doc, features="lxml")

# 1. find() returns only the first <a> tag.
first_anchor = soup.find(name='a')
print(first_anchor)
# <a class="c1" id="i1" name="ha">i am a</a>

# 2. find_all() returns every <a> tag.
all_anchors = soup.find_all(name='a')
print(all_anchors)
# [<a class="c1" id="i1" name="ha">i am a</a>, <a class="c1" id="link2" name="ha">i am a</a>]

# 3. select() takes a CSS selector; '#link2' targets the tag with id="link2".
link2_matches = soup.select('#link2')
print(link2_matches)
# [<a class="c1" id="link2" name="ha">i am a</a>]
1.2 BeautifulSoup常用方法
1、name,标签名称(tag.name)
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a</a> <a class="c1" id="link2" name="ha">i am a</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# Grab the first <a> tag and read its tag name ('a' for an <a> tag).
first_link = soup.find('a')
print(first_link.name)

# Assigning to .name renames the tag in place: the <a> becomes a <span>.
first_link.name = 'span'
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <span class="c1" id="i1" name="ha">i am a</span>
# <a class="c1" id="link2" name="ha">i am a</a>
# </body>
# </html>
2、attrs,标签属性(tag.attrs)
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a</a> <a class="c1" id="link2" name="ha">i am a</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

first_link = soup.find('a')

# .attrs exposes all attributes of the tag as a dict.
current_attrs = first_link.attrs
print(current_attrs)
# Shape recorded in the notes: {'name': 'ha', 'class': ['c1'], 'id': 'i1'}

# Replace the whole attribute set with ik="123" ...
first_link.attrs = {'ik':123}
# ... then add an id="iiiii" attribute on top of it.
first_link.attrs['id'] = 'iiiii'
print(soup)
# The first link is now rendered as: <a id="iiiii" ik="123">
3、children,所有子标签
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a</a> <a class="c1" id="link2" name="ha">i am a</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

body = soup.find('body')

# .children iterates over the direct children of <body>.
for child in body.children:
    print(child)
# Tags recorded in the notes:
# <a class="c1" id="i1" name="ha">i am a</a>
# <a class="c1" id="link2" name="ha">i am a</a>
4、descendants,所有子子孙孙标签
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

body = soup.find('body')

# .descendants walks every nested node below <body>, not just direct children.
for node in body.descendants:
    print(node)
# Output recorded in the notes:
# <a class="c1" id="i1" name="ha">i am a1</a>
# i am a1
# <a class="c1" id="link2" name="ha">i am a2</a>
# i am a2
5、clear,将标签的所有子标签全部清空(保留标签名)
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# clear() empties the tag: <body> itself stays, everything inside it is removed.
body = soup.find('body')
body.clear()
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body></body>
# </html>
6、decompose,递归地删除当前标签及其内部的所有标签(不保留当前标签)
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# decompose() removes <body> entirely; unlike clear(), the tag itself is not kept.
target = soup.find('body')
target.decompose()
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# </html>
7、extract,递归地删除当前标签及其内部的所有标签,并返回被删除的标签
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# extract() removes <body> from the tree and hands the removed tag back.
body = soup.find('body')
v = body.extract()

print(v)  # v is the removed <body> tag together with its contents
# <body>
# <a class="c1" id="i1" name="ha">i am a1</a>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>

print(soup)  # what is left of the tree; no <body> tag (not even an empty one) remains
# <html><head><title>The Dormouse's story</title></head>
# </html>
8、find,获取匹配的第一个标签
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# find() returns the first matching tag.
tag = soup.find('a')
print(tag)
# <a class="c1" id="i1" name="ha">i am a1</a>

# recursive=False would search only the direct children instead of all descendants.
# string='Lacie' requires the tag's text to be exactly 'Lacie'
# (`string` is the documented name of the keyword formerly spelled `text`).
tag1 = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, string='Lacie')
tag2 = soup.find(name='a', class_='sister', recursive=True, string='Lacie')
print(tag1)  # None
print(tag2)  # None
9、find_all,获取匹配的所有标签
import re

from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# 1. Every <a> tag.
tags = soup.find_all('a')
print(tags)
# [<a class="c1" id="i1" name="ha">i am a1</a>, <a class="c1" id="link2" name="ha">i am a2</a>]

# 2. limit caps the number of matches returned (here: one).
tags = soup.find_all('a', limit=1)
print(tags)
# [<a class="c1" id="i1" name="ha">i am a1</a>]

# 3. A list of names matches any of them: all <a> and <div> tags.
v = soup.find_all(name=['a', 'div'])
print(v)
# [<a class="c1" id="i1" name="ha">i am a1</a>, <a class="c1" id="link2" name="ha">i am a2</a>]

# 4. Every tag whose class is 'sister0' or 'sister'.
v = soup.find_all(class_=['sister0', 'sister'])
print(v)
# []

# 5. Every tag whose id is 'link1' or 'link2'. Other attributes work the same
#    way, e.g. soup.find_all(href=['link1', 'link2']).
v = soup.find_all(id=['link1', 'link2'])
print(v)
# [<a class="c1" id="link2" name="ha">i am a2</a>]

# ----- regular expressions -----
# Tag names that start with "p".
rep = re.compile(r'^p')
v = soup.find_all(name=rep)

rep = re.compile(r'sister.*')
v = soup.find_all(class_=rep)
print(v)

# Dots in the host name are escaped so they only match literal dots.
rep = re.compile(r'http://www\.oldboy\.com/static/.*')
v = soup.find_all(href=rep)
print(v)


# ----- filter function -----
def func(tag):
    """Return True when the tag has both a class and an id attribute."""
    return tag.has_attr('class') and tag.has_attr('id')


v = soup.find_all(name=func)
print(v)

# get() reads a single attribute of a tag.
tag = soup.find('a')
v = tag.get('id')
print(v)
10、decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
作用:将body这个对象转换成字符串类型
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

body = soup.find('body')

# 1. decode() renders the tag as a string, including the <body> tag itself.
with_tag = body.decode()
print(with_tag)
# Output recorded in the notes:
# <body>
# <a class="c1" id="i1" name="ha">i am a1</a>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>

# 2. decode_contents() renders only what is inside <body>.
inner_only = body.decode_contents()
print(inner_only)
# Output recorded in the notes:
# <a class="c1" id="i1" name="ha">i am a1</a>
# <a class="c1" id="link2" name="ha">i am a2</a>
11、encode,转换为字节(含当前标签);encode_contents(不含当前标签)
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

body = soup.find('body')

# 1. encode() renders the tag as bytes, including the <body> tag itself.
with_tag = body.encode()
print(with_tag)
# b'<body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body>'

# 2. encode_contents() renders only what is inside <body>, again as bytes.
inner_only = body.encode_contents()
print(inner_only)
# b' <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> '
12、has_attr,检查标签是否具有该属性
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# has_attr() reports whether the tag carries the given attribute.
first_link = soup.find('a')
has_id = first_link.has_attr('id')
print(has_id)  # True
13、get_text,获取标签内部文本内容
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

tag = soup.find('a')
# get_text() returns the text inside the tag. Its first positional argument
# is a separator placed between text fragments, not an attribute name, so it
# is called without arguments here.
v = tag.get_text()
print(v)  # i am a1
14、index,检查标签在某标签中的索引位置
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# index() gives the position of a child within its parent tag.
body = soup.find('body')
first_link = body.find('a')
position = body.index(first_link)
print(position)  # 1
15、is_empty_element,是否是空标签(是否可以是空)或者自闭合标签
作用:判断是否是如下标签:'br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base'
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

first_link = soup.find('a')
print(first_link)
# <a class="c1" id="i1" name="ha">i am a1</a>

# is_empty_element is an attribute (not a method); an <a> tag is not an
# empty/self-closing element.
is_void = first_link.is_empty_element
print(is_void)  # False
16、append在当前标签内部追加一个标签
from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

tag = soup.find('body')

# 1. Appending a tag that is already in the tree moves it: the first <a>
#    ends up as the last child of <body>.
tag.append(soup.find('a'))
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <a class="c1" id="link2" name="ha">i am a2</a>
# <a class="c1" id="i1" name="ha">i am a1</a></body>
# </html>

# 2. Create a brand-new <i> tag and append it to the end of <body>.
obj = Tag(name='i',attrs={'id': 'it'})  # new <i id="it"> tag
obj.string = '我是一个新来的'  # text content of the new <i> tag
tag = soup.find('body')
tag.append(obj)  # the new <i> tag becomes the last child of <body>
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <a class="c1" id="link2" name="ha">i am a2</a>
# <a class="c1" id="i1" name="ha">i am a1</a><i id="it">我是一个新来的</i></body>
# </html>
17、insert在当前标签内部指定位置插入一个标签
from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# 1. Build a new <i> tag and insert it at position 2 inside <body>.
new_tag = Tag(name='i', attrs={'id': 'it'})
new_tag.string = '我是一个新来的'

body = soup.find('body')
body.insert(2, new_tag)
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <a class="c1" id="i1" name="ha">i am a1</a>
# <i id="it">我是一个新来的</i>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>
# </html>
18、insert_after,insert_before 在当前标签后面或前面插入
from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

new_tag = Tag(name='i', attrs={'id': 'it'})
new_tag.string = '我是一个新来的'

first_link = soup.find('a')

# 1. Place the new <i> tag right after the first <a> tag.
#    (insert_before(new_tag) would place it in front of the <a> instead.)
first_link.insert_after(new_tag)
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <a class="c1" id="i1" name="ha">i am a1</a>
# <i id="it">我是一个新来的</i>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>
# </html>
19、replace_with 将当前标签替换为指定标签
from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# 1. Replace the first <a> tag with a newly created <i> tag.
obj = Tag(name='i', attrs={'id': 'it'})
obj.string = '我是一个新来的'

tag = soup.find('a')
tag.replace_with(obj)  # the new <i> tag takes the place of the first <a> tag
print(soup)
20、wrap,用指定标签把当前标签包裹起来
from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

wrapper = Tag(name='div', attrs={'id': 'it'})
wrapper.string = '我是一个新来的'

# 1. Wrap the first <a> tag inside the newly created <div>.
first_link = soup.find('a')
wrapped = first_link.wrap(wrapper)
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <div id="it">我是一个新来的<a class="c1" id="i1" name="ha">i am a1</a></div>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>
# </html>
21、unwrap,去掉当前标签,保留其内部包裹的内容
from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <a class="c1" id="i1" name="ha">i am a1</a> <a class="c1" id="link2" name="ha">i am a2</a> </body> </html> """
soup = BeautifulSoup(html_doc, features="lxml")

# 1. unwrap() strips the first <a> tag itself and leaves only its content.
first_link = soup.find('a')
removed = first_link.unwrap()
print(soup)
# Output recorded in the notes:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# i am a1
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>
# </html>
22、查找某标签的关联标签
# Lookups for tags related to `tag`.
# Per the notes, each call takes the same arguments as find_all.
tag.find_next(...)
tag.find_all_next(...)
tag.find_next_sibling(...)
tag.find_next_siblings(...)
tag.find_previous(...)
tag.find_all_previous(...)
tag.find_previous_sibling(...)
tag.find_previous_siblings(...)
tag.find_parent(...)
tag.find_parents(...)
23、select,select_one, CSS选择器
from bs4.element import Tag

# NOTE: `soup` is not created in this snippet; it is expected to be an already
# parsed BeautifulSoup document.

# Tag-name and descendant selectors.
soup.select("title")  # the <title> tag
soup.select("p:nth-of-type(3)")  # pseudo-classes need the leading colon
soup.select("body a")
soup.select("html head title")  # the <title> inside <html> <head>
tag = soup.select("span,a")

# Direct-child selectors.
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")

# Sibling selectors.
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")

# Class and id selectors.
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")

# Attribute selectors.
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')


def default_candidate_generator(tag):
    """Yield every descendant Tag of *tag* that has an href attribute."""
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child


# NOTE(review): the notes passed the generator above to select() through a
# private `_candidate_generator` keyword, which is not part of the documented
# select() signature - confirm against the installed bs4 version. The
# `a[href]` selector applies the same "only <a> tags that have an href"
# filter through the public API.
tags = soup.find('body').select("a[href]")

# limit=1 stops after the first match.
tags = soup.find('body').select("a[href]", limit=1)
print(type(tags), tags)
24、标签的内容(不仅可以读取,还可以修改)
# NOTE: `soup` is not created in this snippet; it is expected to be an already
# parsed BeautifulSoup document that contains a <span> tag.

# tag.string reads the text inside the tag; assigning to it replaces the text.
tag = soup.find('span')
tag.string = 'new content'  # the span's content becomes 'new content'
print(soup)

tag = soup.find('body')
print(tag.string)
tag.string = 'xxx'
print(soup)

# stripped_strings is a generator over the text found inside the tag
# (recursively); materialise it so the strings are printed rather than the
# generator object.
tag = soup.find('body')
v = tag.stripped_strings
print(list(v))