111.requests.get 方法的流程
r = requests.get('https://www.baidu.com/').content.decode('utf-8')
从响应对象取二进制内容(.content),再以 utf-8 解码为字符串
112.对 soup 对象进行美化
html = soup.prettify()
<title>
百度一下,你就知道
</title>
113.将内容 string 化
html.xpath('string(//*[@id="cnblogs_post_body"])')
114.获取属性
soup.p['name']
115.嵌套选择
soup.head.title.string
116.获取父节点和祖先节点
soup.a.parent
list(enumerate(soup.a.parents))
117.获取兄弟节点
soup.a.next_siblings
list(enumerate(soup.a.next_siblings))
soup.a.previous_siblings
list(enumerate(soup.a.previous_siblings))
118.按照特定值查找标签
查找 id 为 list-1 的标签
soup.find_all(attrs={'id': 'list-1'})
soup.find_all(id='list-1')
119.返回父节点
find_parents()返回所有祖先节点
find_parent()返回直接父节点
120.返回后面兄弟节点
find_next_siblings()返回后面所有兄弟节点
find_next_sibling()返回后面第一个兄弟节点。
121.返回前面兄弟节点
find_previous_siblings()返回前面所有兄弟节点
find_previous_sibling()返回前面第一个兄弟节点。
122.返回节点后符合条件的节点
find_all_next()返回节点后所有符合条件的节点
find_next()返回第一个符合条件的节点
123.返回节点前符合条件的节点
find_all_previous()返回节点前所有符合条件的节点
find_previous()返回第一个符合条件的节点
124.requests 的请求方式
requests.post(url)
requests.put(url)
requests.delete(url)
requests.head(url)
requests.options(url)
125.GET请求
response = requests.get(url)
print(response.text)
126.解析 json
response.json()
json.loads(response.text)
127.发送 post 请求
response = requests.post(url, data=data, headers=headers)
response.json()
128.文件上传
在 post 方法内部添加参数 files 字典参数
import requests
files = {'file': open('favicon.ico', 'rb')}
response = requests.post("http://httpbin.org/post", files=files)
print(response.text)
129.获取 cookie
response.cookies
返回值是 类字典的 RequestsCookieJar 对象
for key, value in response.cookies.items():
print(key + '=' + value)
130.模拟登录
requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
131.带有 Session 的登录
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
132.证书验证
urllib3.disable_warnings()
response = requests.get('https://www.12306.cn', verify=False)
response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
133.超时设置
from requests.exceptions import ReadTimeout
response = requests.get("http://httpbin.org/get", timeout=0.5)
response = urllib.request.urlopen(url, timeout=1)
134.认证设置
from requests.auth import HTTPBasicAuth
r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
r = requests.get('http://120.27.34.24:9001', auth=('user', '123'))
135.异常处理
超时 ReadTimeout
连接出错 ConnectionError
错误 RequestException
136.URL 解析
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',allow_fragments=False)
137.urllib.parse.urlunparse
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
http://www.baidu.com/index.html;user?a=6#comment
138.合并 url
urllib.parse.urljoin
urljoin('http://www.baidu.com', 'FAQ.html')
http://www.baidu.com/FAQ.html
urljoin('www.baidu.com#comment', '?category=2')
www.baidu.com?category=2
2020-05-06