from queue import Queue
from lxml import etree
import requests
from urllib import request
from threading import Thread
import re, os


class Producter(Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producter, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue
        # request headers must be passed via headers=, not params=
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }

    def run(self):
        while True:
            url = self.page_queue.get()
            self.parse(url)

    def parse(self, url):
        res = requests.get(url, headers=self.headers)
        text = res.text
        html = etree.HTML(text)
        imgs = html.xpath('//div[@class="col-xs-6 col-sm-3"]//img[@class!="gif"]')
        print(imgs)
        for img in imgs:
            img_path = img.get('data-original')
            alt = img.get('alt')
            alt = re.sub(r'[??.。!!*]', '', alt)       # strip special symbols that are not allowed in filenames
            suffix = os.path.splitext(img_path)[1]     # get the file extension, e.g. '.jpg' or '.jpg!dta'
            suffix = suffix.replace('!dta', '')        # drop the site's '!dta' marker from the extension
            filename = './imgs/' + alt + suffix
            print(img_path, filename)
            self.img_queue.put((img_path, filename))


class Consumer(Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            url, path = self.img_queue.get()
            self.parse(url, path)
            print('consumer: downloaded one image')

    def parse(self, url, path):
        # download the file to the specified location
        request.urlretrieve(url, path)


def main():
    page_queue = Queue(10)
    img_queue = Queue(10000)
    os.makedirs('./imgs', exist_ok=True)   # make sure the download directory exists
    for i in range(1, 11):
        uri = 'https://www.doutula.com/article/list/?page=' + str(i)
        page_queue.put(uri)
    for i in range(5):
        t1 = Producter(page_queue, img_queue)
        t1.start()
    for i in range(5):
        t2 = Consumer(page_queue, img_queue)
        t2.start()


if __name__ == '__main__':
    main()
Note:
threading.Lock() and threading.Condition() are both low-level lock primitives: they share the same basic methods (lock.acquire(), lock.release()), and you are responsible for using them correctly yourself. Condition() adds a few more methods: wait(), notify() and notify_all(). A thread that calls wait() blocks without consuming CPU until another thread calls notify() once the shared resource is available; with a plain Lock() the consumer has to keep polling the shared data in a loop, which keeps the CPU busy the whole time. Queue, by contrast, is thread-safe out of the box, which is why it feels like the more convenient choice here.
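To make the wait()/notify() point concrete, here is a minimal sketch that is independent of the scraper above; the names CondProducer, CondConsumer, items and cond are made up for illustration. The consumer calls cond.wait() and sleeps without spinning the CPU until the producer calls cond.notify() after appending data.

import threading
import random
import time

items = []                          # shared buffer protected by the condition's lock
cond = threading.Condition()        # hypothetical example object


class CondProducer(threading.Thread):
    def run(self):
        for _ in range(5):
            with cond:                              # acquire the underlying lock
                items.append(random.randint(1, 100))
                cond.notify()                       # wake one waiting consumer
            time.sleep(0.1)


class CondConsumer(threading.Thread):
    def run(self):
        for _ in range(5):
            with cond:
                while not items:                    # guard against spurious wakeups
                    cond.wait()                     # releases the lock and sleeps; no busy loop
                print('consumed', items.pop(0))


if __name__ == '__main__':
    p, c = CondProducer(), CondConsumer()
    c.start()
    p.start()
    p.join()
    c.join()

This is essentially what Queue does internally for you: its get() and put() calls already coordinate with a Condition, so the scraper above never has to touch a lock directly.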