aiohttp支持异步操作的网络请求的模块
1.一个简单异步协程爬取
- read()
- text(encoding=编码) 比如:await r.text(encoding="utf-8")
import asyncio
import aiohttp
async def request(url):
    """Fetch ``url`` with a one-off aiohttp request and print the raw body.

    Args:
        url: Address the GET request is sent to.
    """
    print("当前url:",url)
    # aiohttp.request() issues a single request without an explicit session.
    async with aiohttp.request("GET", url) as r:
        # read() does no decoding: the body comes back as bytes.
        response = await r.read()
        print("返回reponse:", response)
urls = [
    'https://www.baidu.com',
    'https://www.sogou.com',
    'https://www.qq.com',
]
# Create the event loop explicitly before scheduling anything: calling
# ensure_future()/get_event_loop() with no loop in place relies on implicit
# loop creation, which is deprecated and fails on the newest Python releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Task list: one task object per URL.
stasks = []
for url in urls:
    c = request(url)  # coroutine object; nothing has run yet
    task = asyncio.ensure_future(c, loop=loop)
    stasks.append(task)
# The task list has to be wrapped in wait() so the loop can drive all of them.
loop.run_until_complete(asyncio.wait(stasks))
2.发起session请求
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Xu Junkai
"""
import requests
import asyncio
import time
import aiohttp
start_time = time.time()
urls = [
    'https://blog.csdn.net/',
    'https://www.sogou.com',
    'http://www.renren.com/',
]


async def get_page(url):
    """Fetch ``url`` through a ClientSession and print status, charset and body.

    Args:
        url: Address the GET request is sent to.
    """
    print(url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            print(res.status)  # response status code
            print(res.charset)  # page encoding
            response = await res.text()  # body as text
            print(response)


# Create the loop explicitly: scheduling tasks with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
tasks = []
for url in urls:
    c = get_page(url)
    task = asyncio.ensure_future(c, loop=loop)
    tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
end_time = time.time()
print('总耗时:',end_time-start_time)
- session.put
async with session.put(url,data=b"data")
注意:
不要为每次的连接都创建一次session,一般情况下只需要创建一个session,然后使用这个session执行所有的请求。
每个session对象,内部包含了一个连接池,并且将会保持连接和连接复用(默认开启)可以加快整体的性能
3.url中传递参数
import asyncio
import time
import aiohttp
start_time = time.time()
urls = [
    'https://blog.csdn.net/',
    'https://www.sogou.com',
    'http://www.renren.com/',
]
data = {"name":"foo"}


async def get_page(url, data):
    """Fetch ``url`` with query parameters and print status, body and charset.

    Args:
        url: Address the GET request is sent to.
        data: Mapping passed as ``params=`` to ``session.get``.
    """
    print(url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, params=data) as res:
            print(res.status)
            # Reading the body is awaited so other coroutines can run
            # while this one waits for the data.
            response = await res.text()
            print(response)
            print(res.charset)


# Create the loop explicitly: scheduling tasks with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
tasks = []
for url in urls:
    c = get_page(url, data)  # arguments are bound here, but nothing runs yet
    task = asyncio.ensure_future(c, loop=loop)
    tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
end_time = time.time()
print('总耗时:',end_time-start_time)
注意
当使用res.text()、res.read()获取响应内容时,必须在调用前加上await(由于获取响应内容是一个阻塞耗时过程,所以我们使用await实现协程切换)
正确写法
await res.text()
await res.read() #获取是字节
await res.json() 可以设置编码,设置处理函数
注意:
res.json()为aiohttp响应对象内置的JSON解码方法(用法与Requests中的res.json()类似,但需要await)
其中只有response返回为json格式时,才能用await res.json()得到响应的内容.
如果response返回不为json格式,使用res.json()会报错
4.StreamResponse
- 因为text(),read()方法是把整个响应体读入内存,如果你是获取大量的数据,请考虑使用”字节流“(StreamResponse)
#字节流形式获取数据
import asyncio
import aiohttp
urls ='https://blog.csdn.net/'


async def get_page(url):
    """Fetch ``url`` and print only the first chunk read from the body stream.

    Args:
        url: Address the GET request is sent to.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            # Ask the stream for 100 bytes instead of loading the whole body.
            print(await res.content.read(100))


# Create the loop explicitly: scheduling a task with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = get_page(urls,)  # coroutine object (the function body has not run yet)
task = asyncio.ensure_future(c, loop=loop)  # wrap the coroutine in a task
loop.run_until_complete(task)
#获取100个字节数据
- 字节流形式读取数据,保存文件
import asyncio
import aiohttp
urls ='https://blog.csdn.net/'


async def get_page(url):
    """Stream the body of ``url`` into the file ``cnds.text`` chunk by chunk.

    Args:
        url: Address the GET request is sent to.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            with open("cnds.text","wb") as fp:
                # Read 100 bytes at a time and append each chunk to the file.
                while True:
                    chunk = await res.content.read(100)
                    if not chunk:
                        # An empty chunk ends the copy loop.
                        break
                    fp.write(chunk)


# Create the loop explicitly: scheduling a task with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = get_page(urls,)
task = asyncio.ensure_future(c, loop=loop)
loop.run_until_complete(task)
注意
async with session.get(url) as res:#异步上下文管理器
with open("cnds.text","wb") as fp:#普通上下文管理器
#因为异步上下文管理器在enter和exit方法处能够暂停执行上下文管理器
#为了实现此功能,加入了2个新方法:__aenter__ 和__aexit__这两个方法都要返回一个 awaitable类型的值。
详见:
https://www.jb51.net/article/163540.htm
异步迭代器
5.自定义请求头
#与requests方法一样,headers放User-agent比较多。
async def get_page(url):
    """Request ``url`` with custom headers and stream the body to ``cnds.text``.

    Args:
        url: Address the GET request is sent to.
    """
    async with aiohttp.ClientSession() as session:
        # Custom request headers are passed per request via ``headers=``.
        headers = {'Content-Type':'text/html; charset=utf-8'}
        async with session.get(url, headers=headers) as res:
            with open("cnds.text","wb") as fp:
                # Read 100 bytes at a time and append each chunk to the file.
                while True:
                    chunk = await res.content.read(100)
                    if not chunk:
                        break
                    fp.write(chunk)
6.自定义cookie
- 注意:对于自定义cookie,我们需要设置在ClientSession(cookies=自定义cookie字典),而不是session.get()中
#源码显示
class ClientSession:
"""First-class interface for making HTTP requests."""
ATTRS = frozenset([
'_source_traceback', '_connector',
'requote_redirect_url', '_loop', '_cookie_jar',
'_connector_owner', '_default_auth',
'_version', '_json_serialize',
'_requote_redirect_url',
'_timeout', '_raise_for_status', '_auto_decompress',
'_trust_env', '_default_headers', '_skip_auto_headers',
'_request_class', '_response_class',
'_ws_response_class', '_trace_configs'])
_source_traceback = None
_connector = None
def __init__(self, *, connector: Optional[BaseConnector]=None,
loop: Optional[asyncio.AbstractEventLoop]=None,
cookies: Optional[LooseCookies]=None,
headers: Optional[LooseHeaders]=None,
skip_auto_headers: Optional[Iterable[str]]=None,
auth: Optional[BasicAuth]=None,
json_serialize: JSONEncoder=json.dumps,
request_class: Type[ClientRequest]=ClientRequest,
response_class: Type[ClientResponse]=ClientResponse,
ws_response_class: Type[ClientWebSocketResponse]=ClientWebSocketResponse, # noqa
version: HttpVersion=http.HttpVersion11,
cookie_jar: Optional[AbstractCookieJar]=None,
connector_owner: bool=True,
raise_for_status: bool=False,
read_timeout: Union[float, object]=sentinel,
conn_timeout: Optional[float]=None,
timeout: Union[object, ClientTimeout]=sentinel,
auto_decompress: bool=True,
trust_env: bool=False,
requote_redirect_url: bool=True,
trace_configs: Optional[List[TraceConfig]]=None) -> None:
- 使用
cookies = {"cookies":"xxxxxxxxxx"}
async with ClientSession(cookies=cookies) as session:
...
7.获取网站响应状态码
-
res.status
async with session.get(url) as res: print(res.status)
8.查看响应头
- res.headers 查看响应头,得到的值是一个类似dict的多值字典(CIMultiDictProxy)
- res.raw_headers 查看原生响应头,字节类型
import asyncio
import aiohttp
async def get_page(url):
    """Request ``url`` and print every response header name/value pair.

    Args:
        url: Address the GET request is sent to.
    """
    async with aiohttp.ClientSession() as session:
        headers = {'Content-Type':'text/html; charset=utf-8'}
        async with session.get(url, headers=headers) as res:
            for item, values in res.headers.items():
                print(item,"*******",values)


# Create the loop explicitly: scheduling a task with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# NOTE(review): `urls` is not defined in this snippet; presumably it is the
# URL string from the earlier streaming examples. Confirm before running.
c = get_page(urls,)
task = asyncio.ensure_future(c, loop=loop)
loop.run_until_complete(task)
9.查看重定向的响应头
- res.history
10.超时处理
-
默认IO操作都有5分钟响应时间,但是时间太长,我们可以自己设置timeout
-
如果timeout=None或timeout=0将不进行超时检查。也就不限时长。
async with session.get("https://baidu.com",timeout=60) as res: pass
11.ClientSession用于多个连接之间(同一个网站)共享cookie.
import aiohttp
import asyncio
async def request():
    """Issue two requests on one session and print the cookies kept by its jar."""
    # Cookies set on the session itself.
    cookies = {"my_cookie":"my_set_cookies"}
    async with aiohttp.ClientSession(cookies=cookies) as session:
        async with session.get("https://www.csdn.net/") as res:
            print(session.cookie_jar.filter_cookies("https://www.csdn.net/nav/python"))
        print("*******************************************")
        async with session.get("https://www.csdn.net/") as res:
            print(session.cookie_jar.filter_cookies("https://www.csdn.net/nav/java"))


# Create the loop explicitly: scheduling a task with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = request()
task = asyncio.ensure_future(c, loop=loop)
loop.run_until_complete(task)
#Set-Cookie: dc_session_id=10_1562499942692.566280
#Set-Cookie: my_cookie=my_set_cookies
#Set-Cookie: uuid_tt_dd=10_20709428800-1562499942692-906566
#*******************************************
#Set-Cookie: dc_session_id=10_1562499942692.566280
#Set-Cookie: my_cookie=my_set_cookies
#Set-Cookie: uuid_tt_dd=10_20709428800-1562499942692-906566
-
最好使用session.cookie_jar.filter_cookies()获取网站cookie,不同于requests模块,虽然我们可以使用res.cookies有可能获取到cookie,但似乎并未获取到所有的cookies。
-
总结
1.当我们使用res.cookies时,只会获取到当前url下设置的cookie,不会维护整站的cookie
2.而session.cookie_jar.filter_cookies(url)会一直保留这个网站的所有设置cookies,含有我们在会话时设置的cookie,并且会根据响应修改更新cookie。这个才是我们需要的
3.而我们设置cookie,也是需要在aiohttp.ClientSession(cookies=cookies)中设置
4.ClientSession 还支持 请求头,keep-alive连接和连接池(connection pooling)
12.cookie的安全性
-
默认ClientSession使用的是严格模式的 aiohttp.CookieJar。RFC 2109 明确禁止接受来自以IP地址(而不是DNS域名)形式的URL的cookie,例如 http://127.0.0.1:80/cookie。如果需要接受这类cookie,可以通过设置aiohttp.CookieJar 的 unsafe=True 来配置
# unsafe=True lets the jar accept cookies from URLs that use an IP address.
jar = aiohttp.CookieJar(unsafe=True)
session = aiohttp.ClientSession(cookie_jar=jar)
13.控制连接数量
-
TCPConnector维持链接池,限制并行连接的总量,当池满了,有请求退出再加入新请求
async def request():
    """Open a session whose connector caps the number of parallel connections."""
    cookies = {"my_cookies":"my_cookies"}
    # Limit the number of parallel connections.
    conn = aiohttp.TCPConnector(limit=5)
    async with aiohttp.ClientSession(cookies=cookies, connector=conn) as session:
        pass


# Create the loop explicitly: scheduling a task with no loop in place relies on
# implicit loop creation, which is deprecated and fails on the newest Python.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = request()
task = asyncio.ensure_future(c, loop=loop)
loop.run_until_complete(task)
-
限制同时打开连接到同一端点的数量,可以通过设置 limit_per_host 参数:
limit_per_host: 同一端点的最大连接数量。同一端点即(host, port, is_ssl)完全相同情况。
conn = aiohttp.TCPConnector(limit_per_host=30)#limit_per_host默认是0,即不限制
14.一个小例子
import asyncio
import aiohttp
# Request headers sent with every request in this example.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}
def callback(task):
    """Done-callback: print the length of the finished task's result.

    A real callback could parse the page here; this example only prints
    the length to keep things short.

    Args:
        task: Finished task or future whose ``result()`` supports ``len()``.
    """
    print(len(task.result()))
async def res(url):
    """Fetch ``url`` with the module-level ``headers`` and return the body as text.

    Args:
        url: Address the GET request is sent to.

    Returns:
        The response body decoded as ISO-8859-1.
    """
    async with aiohttp.request('GET', url, headers=headers) as fp:
        response = await fp.read()
        # The three sites use different encodings. ISO-8859-1 maps every byte
        # to a character, so this decode never raises, though text in other
        # encodings will not be readable.
        response = response.decode('iso-8859-1')
        # The returned value is what the done-callback sees via task.result().
        return response
# Pages fetched concurrently by the example below.
urls = [
'https://www.baidu.com',
'https://www.sogou.com',
'https://www.qq.com',
]
#proxy="http://some.proxy.com"
if __name__ == '__main__':
    # Create the event loop first: scheduling tasks with no loop in place
    # relies on implicit loop creation, which is deprecated and fails on the
    # newest Python releases.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    stasks = []
    for url in urls:
        # Create the coroutine object.
        c = res(url)
        # Wrap it in a task object.
        task = asyncio.ensure_future(c, loop=loop)
        # Bind the callback that runs once the task has finished.
        task.add_done_callback(callback)
        stasks.append(task)
    # Register the task list with the loop and run until all tasks are done.
    loop.run_until_complete(asyncio.wait(stasks))
- 源文来自于https://www.jb51.net/article/163537.htm