废话不多说,直接上代码,不懂的看注释。
先安装依赖:pip install aiohttp requests
"""Asynchronously download the images of a Huaban (huaban.com) board."""

# Third-party dependencies: pip install aiohttp requests
import requests
import aiohttp
import asyncio
import time
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    "X-Request": "JSON",
    "Accept": "application/json",
    "X-Requested-With": "XMLHttpRequest"
}

# Upper bound on simultaneous downloads.  Keeping it modest keeps the number
# of open sockets well below per-process limits (for example, select() on
# Windows refuses to watch more than 512 descriptors).
MAX_CONCURRENCY = 100


def get_image_urls(board_id=41743806, start_pin_id=2551285279, pages=25):
    """Collect the image URLs of a board by walking its paginated JSON API.

    Args:
        board_id: Numeric id of the board to read.
        start_pin_id: Pin id used as the ``max`` cursor of the first page.
        pages: Maximum number of pages (20 pins each) to request.

    Returns:
        A list of image URLs, in the order the API returned the pins.
    """
    print('开始获取图片链接,请耐心等待......')
    found = []
    image_id = start_pin_id
    for _ in range(pages):
        url = ("https://huaban.com/boards/" + str(board_id)
               + "/?jzwfs8ej&max=" + str(image_id) + "&limit=20&wfl=1")
        # A timeout keeps one stalled request from hanging the whole script.
        response = requests.get(url, headers=headers, timeout=30).json()
        pins = response['board']['pins']
        if not pins:
            # The board ran out of pins before `pages` pages were read.
            break
        found.extend('http://hbimg.huabanimg.com/' + pin['file']['key']
                     for pin in pins)
        # The id of the last pin is the cursor for the next page.
        image_id = pins[-1]['pin_id']
    return found


async def get_audio_data(url, session=None):
    """Download ``url`` and return its raw bytes together with the URL.

    Calling this function does not run its body; it returns a coroutine
    object that must be awaited or wrapped in a task.

    Args:
        url: Address of the resource to download.
        session: Optional ``aiohttp.ClientSession`` to reuse.  When omitted,
            a temporary session is created for this single request.

    Returns:
        ``{'data': <bytes>, 'url': url}``.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await get_audio_data(url, own_session)
    async with session.get(url=url, headers=headers) as response:
        # read() returns the response body as bytes.
        audio_data = await response.read()
    return {'data': audio_data, 'url': url}


def saveData(task):
    """Task done-callback: write a finished download into ``images/``.

    Args:
        task: A completed task whose result is the dict returned by
            ``get_audio_data``.  Failed or cancelled tasks are reported and
            skipped instead of raising inside the event loop callback.
    """
    if task.cancelled():
        print('下载任务已取消')
        return
    error = task.exception()
    if error is not None:
        print('下载失败: {!r}'.format(error))
        return
    dic_obj = task.result()
    name = dic_obj['url'].split('/')[-1]
    data = dic_obj['data']
    image_dir = 'images'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(image_dir, exist_ok=True)
    with open(os.path.join(image_dir, name) + '.jpg', 'wb') as fp:
        fp.write(data)
    print(name + '下载完毕!')


async def _bounded_fetch(url, session, semaphore):
    """Run ``get_audio_data`` while holding one slot of ``semaphore``."""
    async with semaphore:
        return await get_audio_data(url, session)


async def _download_all(image_urls):
    """Download every URL concurrently and return the list of finished tasks.

    All requests share one session, and at most ``MAX_CONCURRENCY`` of them
    are in flight at any moment.
    """
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
    pending = []
    async with aiohttp.ClientSession() as session:
        for url in image_urls:
            # Wrap the coroutine in a task and attach the persistence callback.
            task = asyncio.ensure_future(_bounded_fetch(url, session, semaphore))
            task.add_done_callback(saveData)
            pending.append(task)
        # gather() accepts an empty list (asyncio.wait() does not), and
        # return_exceptions=True lets one failed download not abort the rest.
        await asyncio.gather(*pending, return_exceptions=True)
    return pending


if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start_time = time.perf_counter()
    urls = get_image_urls()
    tasks = asyncio.run(_download_all(urls))
    end_time = time.perf_counter()
    print('抓取{}张图片,共计用时{}秒'.format(len(tasks), end_time - start_time))
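注:Windows 上基于 select() 的事件循环最多只能同时监听 512 个套接字(这是文件描述符数量的限制,并非线程数限制),所以同时进行的请求数不要超过这个值,否则会抛出异常(ValueError: too many file descriptors in select())。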