Synchronous download
from lxml import etree
import requests
from urllib import request  # used to save the images
import os
import re


def parse_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
    for img in images:
        # We now have the <img> tag, but its src is not the real image;
        # the real image URL lives in the data-original attribute (key point)
        # print(etree.tostring(img))  # inspect the element's HTML to see this
        img_url = img.get("data-original")  # the real image URL
        # .get() reads the value of a single attribute on the tag (key point)
        pic_name = img.get("alt")
        pic_name = re.sub(r"[??.,。 !!]", " ", pic_name)  # strip characters that are awkward in filenames
        # Grab the file extension
        suffix = os.path.splitext(img_url)[1]
        filename = pic_name + suffix
        # print(filename)
        request.urlretrieve(img_url, 'images/' + filename)


def main():
    os.makedirs('images', exist_ok=True)  # urlretrieve fails if the target directory is missing
    for x in range(1, 3):  # fetch pages 1-2
        url = 'http://www.doutula.com/photo/list/?page=%d' % x
        parse_page(url)


if __name__ == '__main__':
    main()
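Why data-original instead of src: the site lazy-loads its images, so src holds only a placeholder while the real URL sits in data-original. Dumping an element's attributes makes this easy to confirm. A minimal sketch, assuming the same page structure as above (the exact placeholder value is whatever the live page happens to return):

from lxml import etree
import requests

headers = {"User-Agent": "Mozilla/5.0"}
html = requests.get('http://www.doutula.com/photo/list/?page=1', headers=headers).text
tree = etree.HTML(html)
img = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')[0]

# .attrib exposes every attribute of the element as a dict,
# so src and data-original can be compared side by side
print(dict(img.attrib))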
Asynchronous download of the meme images
from lxml import etree
import requests
from urllib import request  # used to save the images
import os
import re
from queue import Queue
import threading


class Producer(threading.Thread):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                # No page URLs left in the queue, so this producer exits
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url=url, headers=self.headers).text
        tree = etree.HTML(response)
        images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        for img in images:
            img_url = img.get("data-original")  # the real image URL
            pic_name = img.get("alt")
            pic_name = re.sub(r"[??.,。 !!*]", " ", pic_name)
            # Grab the file extension
            suffix = os.path.splitext(img_url)[1]
            filename = pic_name + suffix
            # With the filename built, push the task onto img_queue
            self.img_queue.put((img_url, filename))


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # Exit once both queues are empty
            if self.img_queue.empty() and self.page_queue.empty():
                break
            img_url, filename = self.img_queue.get()  # unpack the tuple queued above (note)
            request.urlretrieve(img_url, 'images/' + filename)
            print(filename, " downloaded")


def main():
    os.makedirs('images', exist_ok=True)  # urlretrieve fails if the target directory is missing
    # Two queues: one for page URLs, one for image tasks
    page_queue = Queue(100)   # crawl 100 pages
    img_queue = Queue(1000)   # queued images; keep this capacity generous
    for x in range(1, 101):   # fetch pages 1-100
        url = 'http://www.doutula.com/photo/list/?page=%d' % x
        page_queue.put(url)   # put each page URL into the queue
    for x in range(5):
        t = Producer(page_queue, img_queue)
        t.start()
    for x in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()


if __name__ == '__main__':
    main()
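One caveat with the empty-check exit above: a Consumer can observe both queues momentarily empty while a Producer is still parsing a page and quit too early, and conversely a blocking img_queue.get() can hang forever if another thread took the last item. A common alternative is to shut down with sentinel values. A minimal sketch of that pattern, not part of the original code (the None sentinel and NUM_CONSUMERS here are assumptions for illustration):

from queue import Queue
import threading

NUM_CONSUMERS = 5
task_queue = Queue()

def consumer():
    while True:
        item = task_queue.get()
        if item is None:          # sentinel: no more work will ever arrive
            break
        img_url, filename = item  # same (img_url, filename) tuples as above
        # ... download img_url to images/filename here ...

threads = [threading.Thread(target=consumer) for _ in range(NUM_CONSUMERS)]
for t in threads:
    t.start()

# ... producers would put (img_url, filename) tuples onto task_queue here ...

# After all producers have finished, push one sentinel per consumer thread
for _ in range(NUM_CONSUMERS):
    task_queue.put(None)
for t in threads:
    t.join()

queue.Queue also offers task_done()/join() for coordinated shutdown, but sentinels keep the exit condition explicit in the consumer loop.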