爬虫——bs4 的搜索文档树

概要

代码

# Tutorial script: searching a parsed document tree with BeautifulSoup's
# find() / find_all(). Only the import, the sample document and the `soup`
# object below are live code; every example is left commented out so that
# individual ones can be enabled and run one at a time.
from bs4 import BeautifulSoup
# Sample document used by all examples. Note that the markup is left unclosed
# (there is no </body> or </html>), and the three <a> tags differ in class:
# link1 and link3 use "sister", link2 uses "sister1".
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_pp' name='lqz'>asdfasdf<b>asdfas</b><span>span<b>bbb</b></span></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister1" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the sample document; the second argument selects the parser
# (presumably requires the third-party lxml package to be installed).
soup=BeautifulSoup(html_doc,'lxml')

# find vs. find_all: they are used in exactly the same way, except that find
# returns only the first match while find_all returns every match.

# Five kinds of filter: string, regular expression, list, True, function.

# String filter. Keyword meanings: name = tag name, class_ = CSS class,
# id = id attribute, href = href attribute.
# Any BeautifulSoup or Tag object can be searched again with find, or
# traversed further with dot (.) access.
# res=soup.find(name='body').p
# res=soup.find(name='body').find(name='p')
# print(type(res))
# print(res)

# Several keyword filters can be combined in one call (last example below).
# res=soup.body.find(id='link2')
# res=soup.body.find(href='http://example.com/lacie')
# res=soup.body.find(name='a',href='http://example.com/lacie')
# print(res)


# List filter: a list of candidate values is given instead of a single string.

# res=soup.find_all(name=['a','p'])
# res=soup.find_all(id=['link2','link3'],class_='sister')
# print(res)


# Regular-expression filter: a compiled pattern is given instead of a string.
# import re
# # res=soup.find_all(name=re.compile('^b'))
# res=soup.find_all(class_=re.compile('^s'),name='a')
# print(res)

# True filter: presumably matches any tag that has the given attribute at all
# (or, for name=True, any tag) -- confirm against the bs4 documentation.
# res=soup.find_all(name=True)
# res=soup.find_all(class_=True)
# res=soup.find_all(id=True)

# Collect every tag that has an href attribute and print each URL.
# res=soup.find_all(href=True)
# for i in res:
#     url=i['href']
#     print(url)

# print(res)



# Function filter (for reference only): a callable that receives a tag and
# returns a boolean; here it is passed as the name argument.
# def aaa(tag):
#     # return tag.has_attr('class') and not tag.has_attr('id')
#     return tag.has_attr('class') and  tag.has_attr('id')
#
# res=soup.find_all(name=aaa)
# print(res)

相关阅读:
Python中的赋值与深浅拷贝
Python面试题解析之前端、框架和其他
Python面试题解析之数据库与缓存
Python面试题解析之网络编程与并发
Python面试题解析之Python基础篇
2、使用rpm包安装grafana
1、在Centos上安装Grafana
MySQL所学所思所想
运维感悟(信息大爆炸的时代，该学习什么来保持着我们的竞争力)
C#.NET 中的 Timer 计时器及 3 种使用方法

原文地址：https://www.cnblogs.com/guojieying/p/14309219.html