• 第二十二节 多线程爬取表情包


     1 import requests
     2 from lxml import etree
     3 from urllib import request
     4 import re
     5 import os
     6 from queue import Queue
     7 import threading
     8 
     9 
    10 '''
    11 这个程序有bug
    12 '''
    13 class Produce(threading.Thread):
    14     headers = {
    15         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    16     }
    17     def __init__(self, page_queue, image_queue, *args, **kwargs):
    18         super(Produce, self).__init__(*args, **kwargs)
    19         self.page_queue = page_queue
    20         self.image_queue = image_queue
    21 
    22     def run(self):
    23         while 1:
    24             if self.page_queue.empty():
    25                 break
    26             url = self.page_queue.get()
    27             self.parse_page(url)
    28 
    29     def parse_page(self,url):
    30         resp = requests.get(url, headers =self.headers)
    31         html = etree.HTML(resp.text)
    32         all_pic = html.xpath('//div[@class="col-xs-6 col-sm-3"]')
    33         for pic in all_pic:
    34             pic_url = pic.xpath('.//img//@data-original')[0]
    35             pic_name = pic.xpath('.//img//@alt')[0]
    36             pic_name =re.sub(r'[??.。,!!]', '', pic_name)
    37             kuozhanming = os.path.splitext(pic_url)[1]
    38             filmname = r'G:picktrue'+'\'+pic_name+kuozhanming
    39             self.image_queue.put((pic_url, filmname))
    40 
    41 
    42 class Consumer(threading.Thread):
    43     def __init__(self, page_queue, image_queue, *args, **kwargs):
    44         super(Consumer, self).__init__(*args, **kwargs)
    45         self.page_queue = page_queue
    46         self.image_queue = image_queue
    47 
    48     def run(self):
    49         while 1:
    50             if self.image_queue.empty() and self.page_queue.empty():
    51                 break
    52             pic_url, filmname = self.image_queue.get()
    53             request.urlretrieve(pic_url, filmname)
    54 
    55 
    56 def main():
    57     page_queue= Queue(20)
    58     image_queue = Queue(1000)
    59     for x in range(1,3):
    60         url = 'http://www.doutula.com/article/list/?page=%d'%x
    61         page_queue.put(url)
    62     for x in range(3):
    63         t = Produce(page_queue, image_queue)
    64         t.start()
    65     for x in range(3):
    66         t = Consumer(page_queue, image_queue)
    67         t.start()
    68 
    69 if __name__ == '__main__':
    70     main()
  • 相关阅读:
    在线|九月月考选填题
    函数$f(x)=e^x\pm e^{-x}$相关
    偶函数性质的推广
    2020年全国卷Ⅱ卷文科数学选填题解析版
    2020年全国卷Ⅱ卷文科数学解答题解析版
    待定系数法
    特殊方法求函数解析式
    phd文献阅读日志-4.1
    phd文献阅读日志-1.2~3.2(1.2,2.1,2.2,3.1,3.2)
    完美解决linux下vim在终端不能用鼠标复制的问题
  • 原文地址:https://www.cnblogs.com/kogmaw/p/12507064.html
Copyright © 2020-2023  润新知