一 . 线程池和进程池
可以适当地使用;在大量 IO 的情况下有更好的方法(见下文的"单线程+异步协程")
import time
from multiprocessing.dummy import Pool
def request(url, delay=2):
    """Simulate downloading ``url`` by blocking the calling thread.

    Args:
        url: Address being "downloaded"; only used in the progress messages.
        delay: Seconds to block in place of a real network round trip.
            Defaults to 2, the value that was previously hard-coded.
    """
    print('正在下载->', url)
    # Stand-in for blocking network I/O.
    time.sleep(delay)
    print('下载完毕->', url)
start = time.time()
urls = [
    'www.baidu.com',
    'www.taobao.com',
    'www.sougou.com',
]
# Three worker threads, one per URL, so the blocking downloads overlap.
# Using the pool as a context manager shuts it down once map() has returned,
# so no idle worker threads are left behind.
with Pool(3) as pool:
    pool.map(request, urls)
print('总耗时->', time.time() - start)
二 . 单线程+异步协程(高性能的异步爬虫)
event_loop:事件循环,相当于一个无限循环,我们可以把特殊的函数注册到这个事件循环上,异步执行
coroutine:协程,就是被async修饰的函数
task:任务,它是对协程进一步封装,包含了协程的各个状态
future:将要执行的任务
async/await,这两个是需要重点了解的
事件循环
import asyncio
async def hello(name):
    """Print a greeting for ``name``.

    Calling ``hello(...)`` only creates a coroutine object; the body runs
    once that object is driven by an event loop.

    Args:
        name: String appended to the ``'hello->'`` prefix.
    """
    print('hello->' + name)
# Get a coroutine object (nothing runs yet).
c = hello('attila')
# Create an event loop explicitly and install it as the current loop.
# Relying on asyncio.get_event_loop() to create one implicitly is deprecated
# and fails on recent Python versions when no loop has been set.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Register the coroutine with the event loop and run the loop until it completes.
loop.run_until_complete(c)
task
import asyncio
async def hello(name):
    """Print a greeting for ``name``.

    Calling ``hello(...)`` only creates a coroutine object; the body runs
    once that object is driven by an event loop.

    Args:
        name: String appended to the ``'hello->'`` prefix.
    """
    print('hello->' + name)
# Get a coroutine object (nothing runs yet).
c = hello('attila')
# Create an event loop explicitly and install it as the current loop.
# Relying on asyncio.get_event_loop() to create one implicitly is deprecated
# and fails on recent Python versions when no loop has been set.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Wrap the coroutine in a Task scheduled on that loop.
task = loop.create_task(c)
print(task)  # Task pending
# Run the event loop until the task completes.
loop.run_until_complete(task)
print(task)  # Task finished
future
import asyncio
async def hello(name):
    """Print a greeting for ``name``.

    Calling ``hello(...)`` only creates a coroutine object; the body runs
    once that object is driven by an event loop.

    Args:
        name: String appended to the ``'hello->'`` prefix.
    """
    print('hello->' + name)
# Get a coroutine object (nothing runs yet).
c = hello('attila')
# This snippet needs its own event loop: create one and install it as the
# current loop so the example does not depend on a `loop` left over from an
# earlier snippet.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Wrap the coroutine in a task (future) bound to that loop. Passing the loop
# explicitly avoids the implicit current-loop lookup inside ensure_future().
task = asyncio.ensure_future(c, loop=loop)
# Run the event loop until the task completes.
loop.run_until_complete(task)
绑定回调
import asyncio
def call_back(task):
    """Done-callback that prints the result of a finished task.

    Args:
        task: The completed task/future this callback was attached to;
            its ``result()`` is printed after a ``'---->'`` prefix.
    """
    print('---->', task.result())
async def hello(name):
    """Print a greeting for ``name`` and hand the name back.

    Args:
        name: String appended to the ``'hello->'`` prefix.

    Returns:
        ``name`` unchanged, which becomes the result of the wrapping task.
    """
    print('hello->' + name)
    return name
# Get a coroutine object (nothing runs yet).
c = hello('attila')
# This snippet needs its own event loop: create one and install it as the
# current loop so the example does not depend on a `loop` left over from an
# earlier snippet.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Wrap the coroutine in a task bound to that loop.
task = asyncio.ensure_future(c, loop=loop)
# Attach a done-callback; call_back receives this same task as its argument.
task.add_done_callback(call_back)
# Run the event loop until the task completes.
loop.run_until_complete(task)
多任务异步协程(这里需要用到一个新模块aiohttp,一定不能是requests,因为requests是一个非异步模块)
pip install aiohttp
import aiohttp
import asyncio
async def get_page(url):
    """Fetch ``url`` with aiohttp and print the response body.

    Args:
        url: Address to request with HTTP GET.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager, so it is entered
        # directly with `async with`; the response is released on exit.
        async with session.get(url=url) as response:
            # Reading the body is I/O, so it must be awaited.
            page_text = await response.text()
            print(page_text)
start = time.time()
# These URLs point at a self-hosted local test server; per the original
# note, each endpoint does time.sleep(2) before responding.
urls = [
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
]
tasks = []
# Create the event loop explicitly and install it as the current loop.
# Relying on asyncio.get_event_loop() to create one implicitly is deprecated
# and fails on recent Python versions when no loop has been set.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
for url in urls:
    c = get_page(url)
    # Bind every task to the loop that will run it.
    task = asyncio.ensure_future(c, loop=loop)
    tasks.append(task)
# Run all tasks concurrently on the single loop and wait until every one is done.
loop.run_until_complete(asyncio.wait(tasks))
print('总耗时->', time.time() - start)  # sample run: 总耗时-> 2.053046464920044