Coroutines
1.1 My understanding
Coroutines switch between functions: when some code needs to do I/O, execution immediately switches to another function. The most important piece is the event loop, which checks which I/O operations are ready; ready ones get run, and while nothing is ready it runs other functions, switching back and forth whenever I/O comes up. Switching between functions within a single thread costs almost nothing.
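A minimal sketch of that interleaving, using asyncio.sleep to stand in for I/O (the names are illustrative):

import asyncio

async def worker(name):
    for i in range(2):
        print(name, 'step', i)
        await asyncio.sleep(0.1)  # "hits I/O": control returns to the event loop, which runs the other worker

loop = asyncio.get_event_loop()
# both workers run interleaved within a single thread
loop.run_until_complete(asyncio.gather(worker('a'), worker('b')))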
1.2 asyncio
1. A modular event loop with various system-specific implementations
2. Transport and protocol abstractions (TCP, UDP, etc.)
3. A Future class modeled on the futures module but adapted for use with the event loop
4. Concurrent code written in a synchronous style
5. When a call that performs blocking I/O is unavoidable, there is an interface for submitting that call to a thread pool.
asyncio is a complete solution for asynchronous programming.
Related frameworks: tornado (which also implements a web server), gevent, twisted, sanic.
1.2.1 Basic asyncio usage

import asyncio
import time

# event loop + callbacks (driving generators/coroutines) + epoll
async def test():
    print('start get url')
    # time.sleep(2)  # a blocking sleep here would serialize the coroutines
    await asyncio.sleep(2)
    print('end get url')

start_time = time.time()
loop = asyncio.get_event_loop()
tasks = [test() for i in range(10)]
loop.run_until_complete(asyncio.wait(tasks))
print(time.time() - start_time)  # about 2 seconds, not 20
done, pending = await asyncio.wait(task_list)  # done holds the finished tasks, pending the unfinished ones; wait accepts a timeout, and tasks still unfinished when it expires end up in pending
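A small sketch of that done/pending split, assuming two sleeps of different lengths and a 1-second timeout:

import asyncio

async def sleep_for(s):
    await asyncio.sleep(s)
    return s

async def main():
    tasks = [asyncio.ensure_future(sleep_for(s)) for s in (0.5, 2)]
    done, pending = await asyncio.wait(tasks, timeout=1)
    print(len(done), len(pending))  # 1 1: the 2-second task has not finished yet
    for task in pending:
        task.cancel()

asyncio.get_event_loop().run_until_complete(main())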
1.2.2 Getting a coroutine's return value

# v1
async def test():
    print('start get url')
    # time.sleep(2)
    await asyncio.sleep(2)
    print('end get url')
    return 1

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(test())
loop.run_until_complete(future)
print(future.result())
# v2
async def test():
    print('start get url')
    # time.sleep(2)
    await asyncio.sleep(2)
    print('end get url')
    return 1

loop = asyncio.get_event_loop()
# future = asyncio.ensure_future(test())
task = loop.create_task(test())
loop.run_until_complete(task)
print(task.result())
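For several coroutines at once, gather (covered below) collects the return values directly, in the order the coroutines were passed; a minimal sketch:

import asyncio

async def test(n):
    await asyncio.sleep(0.1)
    return n

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(test(1), test(2)))
print(results)  # [1, 2]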
1.2.3 Callbacks

from functools import partial  # partial function application

async def test():
    print('start get url')
    # time.sleep(2)
    await asyncio.sleep(2)
    print('end get url')
    return 1

def callback(name, future):
    print(name, future.result())

loop = asyncio.get_event_loop()
# future = asyncio.ensure_future(test())
task = loop.create_task(test())
# partial pins the name argument; the finished future is always passed as the last argument
task.add_done_callback(partial(callback, "lqw"))
loop.run_until_complete(task)
1.2.4 wait and gather
1.2.4.1 wait
async def test(time):
    print('start get url')
    # time.sleep(2)
    await asyncio.sleep(time)
    print('end get url')
    return 1

def callback(name, future):
    print(name, future.result())

import time
import concurrent

start_time = time.time()
loop = asyncio.get_event_loop()
tasks = [test(i) for i in range(1, 3)]
loop.run_until_complete(asyncio.wait(tasks, return_when=concurrent.futures.FIRST_COMPLETED))
print(time.time() - start_time)  # about 1.x seconds: returns as soon as the first task finishes
1.2.4.2 gather
async def test(time):
    print('start get url')
    # time.sleep(2)
    await asyncio.sleep(time)
    print('end get url')
    return 1

def callback(name, future):
    print(name, future.result())

import time
import concurrent

start_time = time.time()
loop = asyncio.get_event_loop()
tasks = [test(i) for i in range(1, 3)]
# task.add_done_callback(partial(callback, "lqw"))
loop.run_until_complete(asyncio.gather(*tasks))  # * unpacks the task list
print(time.time() - start_time)
gather is more high-level than wait and supports grouping:

group1 = [task1, task2]
group2 = [task3, task4]
group1 = asyncio.gather(*group1)
group2 = asyncio.gather(*group2)
loop.run_until_complete(asyncio.gather(group1, group2))
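Since gather returns a future, a whole group can be cancelled in one call; a sketch, assuming the jobs are plain asyncio.sleep coroutines:

import asyncio

async def job(name, s):
    await asyncio.sleep(s)
    print(name, 'done')

loop = asyncio.get_event_loop()
group1 = asyncio.gather(job('a', 1), job('b', 1))
group2 = asyncio.gather(job('c', 10), job('d', 10))
loop.call_later(2, group2.cancel)  # cancel the entire second group after 2 seconds
try:
    loop.run_until_complete(asyncio.gather(group1, group2))
except asyncio.CancelledError:
    print('group2 cancelled')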
1.2.4.3 Cancelling tasks (futures)
async def get_html(time):
    print('get ')
    await asyncio.sleep(time)
    print(' end')

task1 = get_html(2)
task2 = get_html(3)
task3 = get_html(3)
tasks = [task1, task2, task3]
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(asyncio.wait(tasks))
except KeyboardInterrupt:  # e.g. Ctrl-C while the tasks are still running
    all_tasks = asyncio.Task.all_tasks()
    for task in all_tasks:
        task.cancel()
    loop.stop()
    loop.run_forever()  # required: the loop must run once more so the cancellations are processed
finally:
    loop.close()
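Cancellation works by raising CancelledError inside the coroutine, which it may catch for cleanup; a minimal sketch of cancelling a single task:

import asyncio

async def job():
    try:
        await asyncio.sleep(10)
    except asyncio.CancelledError:
        print('cleaning up')
        raise  # re-raise so the task is actually marked as cancelled

loop = asyncio.get_event_loop()
task = loop.create_task(job())
loop.call_later(1, task.cancel)
try:
    loop.run_until_complete(task)
except asyncio.CancelledError:
    print('task was cancelled')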
1.2.5 call_soon, call_later, call_at, call_soon_threadsafe
def callback(sleep_times):
    print('time %s' % sleep_times)

def stop(loop):
    loop.stop()

loop = asyncio.get_event_loop()
now = loop.time()
loop.call_at(now + 1, callback, 2)  # scheduled against loop.time(), not wall-clock time
loop.call_soon(callback, 2)
loop.call_soon(stop, loop)  # note: stop runs right after callback(2), so the two timed callbacks never get to fire
loop.call_later(2, callback, 1)  # run after a delay
# loop.call_soon_threadsafe(callback, 2)  # thread-safe variant of call_soon, for scheduling from other threads
loop.run_forever()
1.2.6 Combining with a thread pool

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

# run blocking calls on worker threads
def get_url(s):
    time.sleep(s)
    print('end', s)
    return 111

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor()
    tasks = [loop.run_in_executor(executor, get_url, i) for i in range(1, 3)]
    loop.run_until_complete(asyncio.wait(tasks))
1.2.7 Simulating an HTTP request with asyncio (using as_completed)

import asyncio
from urllib.parse import urlparse

async def get_url(url):
    url_obj = urlparse(url)
    host = url_obj.netloc
    address = url_obj.path if url_obj.path else '/'
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write(f"GET {address} HTTP/1.1\r\nHost:{host}\r\nConnection:close\r\n\r\n".encode('utf8'))
    content_list = []
    async for content in reader:  # StreamReader implements __anext__, so it can be iterated line by line
        data = content.decode('utf8')
        content_list.append(data)
    # data = await reader.read()  # reading everything at once also works
    # print(data.decode('utf8'))
    writer.close()
    return '\n'.join(content_list)

async def main(loop):
    tasks = [asyncio.ensure_future(get_url('http://www.baidu.com')) for i in range(10)]
    for task in asyncio.as_completed(tasks):  # yields futures in completion order, not submission order
        result = await task
        print(result)

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
1.2.8 task and future

Task unifies coroutines with thread-pool futures: Task is a subclass of Future. A thread doesn't need send(None), but a coroutine must be primed before it runs, so the Task class was abstracted out; Task is the bridge between a coroutine and a Future. From a design standpoint it smooths over the places where coroutines and threads differ.
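A tiny sketch of the priming mentioned above: a coroutine object does nothing until it is driven with send(None), which is exactly what a Task does for you through the event loop:

async def coro():
    return 1

c = coro()
try:
    c.send(None)  # drive the coroutine one step, the way a Task/event loop would
except StopIteration as e:
    print(e.value)  # 1: the return value travels inside the StopIteration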
1.2.9 Coroutine locks

import asyncio
import aiohttp

cache = {}
lock = asyncio.Lock()
queue = asyncio.Queue()  # even within a single thread, a queue provides message passing

async def get_stuff(url):
    async with lock:  # only one coroutine at a time fills the cache for a given url
        if url in cache:
            return cache[url]
        stuff = await aiohttp.request('get', url)
        cache[url] = stuff
        return stuff

async def test1():
    res = await get_stuff('dada')

async def test2():
    res1 = await get_stuff('dada')
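The Queue declared above is never exercised in the snippet; a minimal producer/consumer sketch of the single-thread message passing it enables (names are illustrative):

import asyncio

async def producer(queue):
    for i in range(3):
        await queue.put(i)
        await asyncio.sleep(0.1)
    await queue.put(None)  # sentinel telling the consumer to stop

async def consumer(queue):
    while True:
        item = await queue.get()
        if item is None:
            break
        print('got', item)

loop = asyncio.get_event_loop()
q = asyncio.Queue()
loop.run_until_complete(asyncio.gather(producer(q), consumer(q)))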
1.3 Async crawler
import aiohttp
import asyncio
import re
import aiomysql
# crawler: fetch, dedupe, store in the database
from pyquery import PyQuery

waiting_urls = []
seen_urls = set()
stopping = False

async def fetch(url, session):
    async with session.get(url) as response:
        if response.status in [200, 201]:
            html = await response.text()
            return html

def parse_content(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        print(url)
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls

async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waiting_urls) == 0:
                await asyncio.sleep(0.5)
                continue
            url = waiting_urls.pop()
            print('popped:', url)
            if re.match(r'http://.*?jobbole.com.*?\d+.html', url):
                if url not in seen_urls:
                    # schedule a coroutine to handle this article
                    asyncio.ensure_future(article_handle(url, session, pool))
            else:
                asyncio.ensure_future(init_urls(url, session))

async def article_handle(url, session, pool):
    # fetch the article detail page
    html = await fetch(url, session)
    seen_urls.add(url)
    parse_content(html)
    pq = PyQuery(html)
    title = pq('title').text()
    print('title:', title)
    # async with pool.acquire() as conn:
    #     async with conn.cursor() as cur:
    #         # insert into the database
    #         await cur.execute('')

async def init_urls(start_url, session):
    html = await fetch(start_url, session)
    parse_content(html)

async def main(loop):
    # wait until the MySQL pool is ready
    start_url = 'http://www.jobbole.com/'
    # pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
    #                                   user='root', password='', db='mysql',
    #                                   loop=loop, charset='utf8', autocommit=True)
    pool = 'xx'
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        parse_content(html)
    asyncio.ensure_future(consumer(pool))

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
1.4 Using set_result

import asyncio

loop = asyncio.get_event_loop()

def set_res(fur):
    fur.set_result("test")

async def main():
    fur = loop.create_future()
    loop.call_later(3, set_res, fur)
    res = await fur  # waits until the future has a result; without the call_later above this would block forever
    print(res)

loop.run_until_complete(main())
1.5 Future objects
asyncio can wrap a thread-pool future object into an asyncio future object.
import time

loop = asyncio.get_event_loop()

def mysleep(s):
    time.sleep(s)

async def demo():
    print('start')
    # defaults to a thread pool; a process pool can be passed instead; the result is an awaitable future
    fur = loop.run_in_executor(None, mysleep, 3)
    await fur
    print('end')

tasks = [demo(), demo(), demo()]
loop.run_until_complete(asyncio.wait(tasks))
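The conversion run_in_executor performs can also be done by hand with asyncio.wrap_future, which turns a concurrent.futures future into an awaitable asyncio future; a sketch:

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

def blocking():
    time.sleep(1)
    return 'done'

async def demo():
    with ThreadPoolExecutor() as pool:
        cf_future = pool.submit(blocking)              # a concurrent.futures.Future
        result = await asyncio.wrap_future(cf_future)  # now awaitable inside the event loop
        print(result)

asyncio.get_event_loop().run_until_complete(demo())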
1.6 Async crawler: downloading images
import asyncio
import aiofiles
import aiohttp
import time

img_urls = [
    'https://img.lianzhixiu.com/uploads/allimg/202010/9999/bfed842d57.jpg',
    'https://img.lianzhixiu.com/uploads/allimg/202010/9999/7d0c02fd74.jpg',
    'https://img.lianzhixiu.com/uploads/allimg/202010/9999/edb377d1b3.jpg',
]

async def get_images(url, session):
    async with session.get(url) as response:
        if response.status in [200, 201]:
            content = await response.content.read()
            async with aiofiles.open(f'{url.split("/")[-1]}', "wb") as fp:
                await fp.write(content)

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(get_images(url, session)) for url in img_urls]
        await asyncio.wait(tasks)

import requests

def request_get(url):
    content = requests.get(url).content
    with open(f'{url.split("/")[-1]}', "wb") as fp:
        fp.write(content)

def main_v1():
    for url in img_urls:
        request_get(url)

if __name__ == '__main__':
    # start_time = time.time()
    # loop = asyncio.get_event_loop()
    # loop.run_until_complete(main())
    # print(time.time() - start_time)
    start_time = time.time()
    main_v1()
    print(time.time() - start_time)
    # async version: about 1 second
    # sync version: about 2 seconds
1.7 Async iterators
import random
import asyncio

class MyAsyncIter():
    def __init__(self):
        self.count = 0

    def __aiter__(self):
        return self

    async def __anext__(self):
        await asyncio.sleep(random.randint(1, 3))
        self.count += 1
        print(self.count)
        if self.count > 10:
            raise StopAsyncIteration
        return self.count

async def test():
    await asyncio.sleep(1)
    print('asda')
    await asyncio.sleep(2)
    print('asda')

async def main():
    asyncio.ensure_future(test())
    async for i in MyAsyncIter():
        print(i)

if __name__ == '__main__':
    asyncio.run(main())
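Since Python 3.6 the same iterator can be written more compactly as an async generator; a sketch of the equivalent:

import asyncio
import random

async def my_async_gen():
    count = 0
    while count < 10:
        await asyncio.sleep(random.randint(1, 3))
        count += 1
        yield count

async def main():
    async for i in my_async_gen():
        print(i)

asyncio.run(main())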
1.8 Async context managers
import asyncio

class MyEnter():
    def __init__(self):
        pass

    async def __aenter__(self):
        await asyncio.sleep(3)
        print('entered')
        return self

    async def close(self):
        await asyncio.sleep(3)
        print('close')

    async def __aexit__(self, *args):
        await self.close()

async def test():
    await asyncio.sleep(4)
    print('end')

async def main():
    asyncio.ensure_future(test())
    async with MyEnter() as d:
        await d.close()  # note: __aexit__ will call close() again when the block exits

if __name__ == '__main__':
    asyncio.run(main())
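The same enter/exit pair can also be produced with contextlib.asynccontextmanager (Python 3.7+); a minimal sketch:

import asyncio
from contextlib import asynccontextmanager

@asynccontextmanager
async def my_enter():
    await asyncio.sleep(1)      # plays the role of __aenter__
    try:
        yield 'resource'
    finally:
        await asyncio.sleep(1)  # plays the role of __aexit__
        print('closed')

async def main():
    async with my_enter() as r:
        print('using', r)

asyncio.run(main())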