一、BeautifulSoup模块
二、博文简介
三、过滤恶意标签
一、BeautifulSoup模块
pip install bs4 # 安装bs4
from bs4 import BeautifulSoup # 导入BeautifulSoup
二、博文简介
from bs4 import BeautifulSoup

# Sample post body: an anchor wrapping plain text and an <i> element.
content = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(content, 'html.parser')

# Flatten the markup to its text content, then keep only the first nine
# characters to use as the post's short summary.
plain_text = soup.get_text()
overview = plain_text[:9]
print(overview)
三、过滤恶意标签
from bs4 import BeautifulSoup

# Sample submitted markup that includes a <script> element to be stripped.
content = '<a href="http://example.com/">I linked to <i>example.com</i></a><div><img src=""></img>image</div><a>link</a><script>alert(123)</script>'
soup = BeautifulSoup(content, 'html.parser')
print(soup)  # at this point the <script> element is still present

# Walk every tag in the parsed document and remove the ones whose name is
# on the denylist, together with everything inside them.
# NOTE(review): filtering by tag name alone does not touch inline event-handler
# attributes or javascript: URLs -- confirm whether more sanitising is needed.
for tag in soup.find_all():
    if tag.name in ['script', 'link']:
        tag.decompose()

print(soup)  # the <script> element has now been removed