• Synchronous and asynchronous downloading of meme images


    Synchronous download

    from lxml import etree
    import requests
    from urllib import request  # used to save images to disk
    import os
    import re
    
    def parse_page(url):
        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        response = requests.get(url=url, headers=headers).text
        tree = etree.HTML(response)
        images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        for img in images:
            # We have the img element, but its src only holds a lazy-load placeholder;
            # the real image URL lives in the data-original attribute (key point)
            # print(etree.tostring(img))  # inspect the element's raw HTML
            img_url = img.get("data-original")  # real image URL
            # .get() returns the value of a single attribute on the element (key point)
    
            pic_name = img.get("alt")
            # replace punctuation that is awkward or illegal in filenames
            pic_name = re.sub(r"[??.,。 !!]", " ", pic_name)
            # extract the file extension
            suffix = os.path.splitext(img_url)[1]
            filename = pic_name + suffix
            # print(filename)
            request.urlretrieve(img_url, 'images/' + filename)
    
    
    def main():
        os.makedirs('images', exist_ok=True)  # urlretrieve needs the output folder to exist
        for x in range(1, 3):  # fetch pages 1 and 2 (range's end is exclusive)
            url = 'http://www.doutula.com/photo/list/?page=%d' % x
            parse_page(url)
    
    
    if __name__ == '__main__':
        main()
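
    urlretrieve sends urllib's default User-Agent and raises on the first HTTP error, so one dead link or a server that rejects the default agent can abort the whole crawl. Below is a minimal alternative sketch that reuses the same headers through requests and skips failures instead of crashing; the helper name download_image is illustrative, not part of the original post.

    import os
    import requests

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    def download_image(img_url, filename, folder='images'):
        # hypothetical helper: fetch one image with the same headers used for
        # the listing pages, then write it to disk; broken links are skipped
        os.makedirs(folder, exist_ok=True)
        try:
            resp = requests.get(img_url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as e:
            print("skipping", img_url, e)
            return
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(resp.content)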

    Asynchronous download

    The asynchronous version splits the work across threads: Producer threads take page URLs from page_queue, parse each page, and push (img_url, filename) tuples onto img_queue, while Consumer threads take those tuples and save the files.

    from lxml import etree
    import requests
    from urllib import request #保存图片
    import os
    import re
    from queue import Queue
    import threading
    
    
    class Producer(threading.Thread):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        def __init__(self,page_queue,img_queue,*args,**kwargs):
            super(Producer, self).__init__(*args,**kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                if self.page_queue.empty():  # exit once there are no page URLs left
                    break
                url = self.page_queue.get()
                self.parse_page(url)
    
        def parse_page(self, url):
            response = requests.get(url=url, headers=self.headers).text
            tree = etree.HTML(response)
            images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
            for img in images:
                img_url = img.get("data-original")  # real (lazy-loaded) image URL
                pic_name = img.get("alt")
                # replace punctuation that is awkward or illegal in filenames
                pic_name = re.sub(r"[??.,。 !!*]", " ", pic_name)
                # extract the file extension
                suffix = os.path.splitext(img_url)[1]
                filename = pic_name + suffix

                # with the filename in hand, push the task onto img_queue
                self.img_queue.put((img_url, filename))
    
    
    class Consumer(threading.Thread):
        def __init__(self,page_queue,img_queue,*args,**kwargs):
            super(Consumer, self).__init__(*args,**kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                # exit once both queues are empty (note: this check is race-prone;
                # see the sentinel-based sketch after the code)
                if self.img_queue.empty() and self.page_queue.empty():
                    break
                img_url, filename = self.img_queue.get()  # unpack the tuple queued by the producer (note)
                request.urlretrieve(img_url, 'images/' + filename)
                print(filename, " downloaded")
    
    def main():
        os.makedirs('images', exist_ok=True)  # the consumers write into images/
        # create the two queues
        page_queue = Queue(100)   # holds up to 100 listing-page URLs
        img_queue = Queue(1000)   # holds image tasks; size this generously

        for x in range(1, 101):  # fetch pages 1-100
            url = 'http://www.doutula.com/photo/list/?page=%d' % x
            # put each page URL onto the queue
            page_queue.put(url)
    
        for x in range(5):
            t = Producer(page_queue,img_queue)
            t.start()
    
        for x in range(5):
            t = Consumer(page_queue,img_queue)
            t.start()
    
    
    if __name__ == '__main__':
        main()
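
    The empty-queue checks above are race-prone: a Consumer may see both queues momentarily empty while a Producer is still parsing a page and exit too early, or pass the check and then block forever on get(). A common fix, sketched below on the assumption that the Producer and Consumer classes above are reused, is to join the producers and then feed the queue one None sentinel per consumer; the consumer stops when it pops None.

    class SentinelConsumer(Consumer):
        # overrides run() to exit on a None sentinel instead of polling empty()
        def run(self):
            while True:
                item = self.img_queue.get()
                if item is None:  # sentinel: all producers have finished
                    break
                img_url, filename = item
                request.urlretrieve(img_url, 'images/' + filename)
                print(filename, " downloaded")

    def main():
        page_queue = Queue(100)
        img_queue = Queue(1000)
        os.makedirs('images', exist_ok=True)

        for x in range(1, 101):
            page_queue.put('http://www.doutula.com/photo/list/?page=%d' % x)

        producers = [Producer(page_queue, img_queue) for _ in range(5)]
        consumers = [SentinelConsumer(page_queue, img_queue) for _ in range(5)]
        for t in producers + consumers:
            t.start()

        for t in producers:      # wait until every page has been parsed
            t.join()
        for _ in consumers:      # one sentinel per consumer
            img_queue.put(None)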
  • Original article: https://www.cnblogs.com/kenD/p/11123555.html