通过生产者-消费者模式实现多线程爬取图片:
1、生产者不断爬取网页中图片的url,并将其存入图片队列中
2、消费者通过图片队列中的url爬取图片并下载到本地
3、采用多线程方式,爬取与下载同时进行;待所有子线程结束后,输出爬取总耗时
# Multi-threaded image downloader
# Producer-consumer pattern: producers scrape image URLs from listing pages
# into a queue, consumers download the queued images to the local disk.
import os
import queue
import threading
import time
from queue import Queue
from urllib.parse import urlsplit

import requests
from lxml import etree

# Sentinel placed on the image queue to tell one consumer thread to exit.
_STOP = None

# Seconds to wait for the server before giving up on a single HTTP request.
REQUEST_TIMEOUT = 10

# Directory (relative to the working directory) that receives the images.
IMG_DIR = "img"

# Characters that are not allowed in file names on common file systems.
_INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def _build_filename(name, href):
    """Return the local path for an image given its alt text and URL.

    The suffix is taken from the URL path (any query string is ignored).
    Characters that are invalid in file names are dropped from *name*; when
    nothing usable remains, the base name of the URL path is used instead.
    """
    url_path = urlsplit(href).path
    stem, suffix = os.path.splitext(os.path.basename(url_path))
    cleaned = "".join(
        ch for ch in (name or "") if ch not in _INVALID_FILENAME_CHARS
    ).strip()
    return os.path.join(IMG_DIR, (cleaned or stem or "unnamed") + suffix)


# Producer: puts the URL of every image found on a page into the image queue.
class product(threading.Thread):
    """Thread that takes page URLs from ``page_url`` and feeds ``img_url``.

    Each item put on ``img_url`` is a ``(href, filename)`` tuple.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    def __init__(self, page_url, img_url):
        """Store the page queue (input) and the image queue (output)."""
        super().__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Parse pages until the page queue is exhausted, then return."""
        while True:
            # get_nowait() makes "check and take" a single atomic step; a
            # separate empty() check followed by get() can block forever when
            # another producer grabs the last page in between.
            try:
                page = self.page_url.get_nowait()
            except queue.Empty:
                break

            try:
                self.parse_html(page)
            except (requests.RequestException, etree.LxmlError) as exc:
                # One bad page must not kill the worker; report and move on.
                print("failed to parse {}: {}".format(page, exc))

    def parse_html(self, html):
        """Fetch the page at URL *html* and queue every image found on it.

        Raises:
            requests.RequestException: the page could not be fetched or the
                server answered with an error status.
        """
        response = requests.get(
            html, headers=self.headers, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        tree = etree.HTML(response.text)
        if tree is None:
            # Nothing parseable came back, so there is nothing to queue.
            return

        imgs = tree.xpath(
            "//div[@class='page-content text-center']//img[@class!='gif']"
        )
        for img in imgs:
            href = img.get("data-original")
            if not href:
                # Without a source URL there is nothing to download.
                continue
            filename = _build_filename(img.get("alt"), href)
            print(filename)
            # Queue the image URL together with its target file name.
            self.img_url.put((href, filename))


# Consumer: downloads the images whose URLs are in the image queue.
class consume(threading.Thread):
    """Thread that downloads ``(href, filename)`` items from ``img_url``.

    The thread runs until it takes a ``None`` item from ``img_url``; whoever
    starts consumers must put one ``None`` per consumer on the queue once no
    more images will be produced (``main`` does this).
    """

    def __init__(self, page_url, img_url):
        """Store the page queue and the image queue (input)."""
        super().__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Download queued images until the stop sentinel is received."""
        while True:
            item = self.img_url.get()
            if item is _STOP:
                break

            href, filename = item
            try:
                self._download(href, filename)
            except (requests.RequestException, OSError) as exc:
                # A failed download is reported and skipped so that the
                # remaining images are still processed.
                print("failed to download {}: {}".format(href, exc))

    def _download(self, href, filename):
        """Fetch *href* and write the response body to *filename*."""
        res = requests.get(href, timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error page under an image file name.
        res.raise_for_status()
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, 'wb') as fp:
            fp.write(res.content)


def main():
    """Scrape listing pages 1-9 and download their images with 8+8 threads.

    Returns only after every producer and every consumer thread has finished.
    """
    # Base URL of the listing pages.
    base_url = "http://www.doutula.com/photo/list/?page={}"
    # Page queue holds up to 10 items, image queue up to 200.
    page_url = Queue(10)
    img_url = Queue(200)
    # Queue the URLs of the pages to scrape.
    for i in range(1, 10):
        page_url.put(base_url.format(i))

    producers = [product(page_url, img_url) for _ in range(8)]
    consumers = [consume(page_url, img_url) for _ in range(8)]
    # Consumers must be running while producers work: the image queue is
    # bounded, so producers block on put() once it is full.
    for worker in producers + consumers:
        worker.start()

    # Only after all producers are done is it certain that no further image
    # will be queued; then each consumer gets exactly one stop sentinel.
    for worker in producers:
        worker.join()
    for _ in consumers:
        img_url.put(_STOP)
    # Wait for every consumer, not just the last one started.
    for worker in consumers:
        worker.join()


if __name__ == '__main__':
    times = time.time()
    main()
    times1 = time.time()
    # Total elapsed time in seconds.
    print(times1 - times)