• 多线程爬虫


    糗事百科

     1 import requests
     2 from lxml import etree
     3 from queue import Queue
     4 import threading
     5 
     6 
     7 class Qiubai:
     8     def __init__(self):
     9         self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
    10         self.headers= {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 
    11         10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
    12         self.url_queue = Queue()
    13         self.html_queue = Queue()
    14         self.content_list_queue = Queue()
    15 
    16     def get_url_list(self):#获取url列表
    17         for i in range(1,14):
    18             self.url_queue.put(self.temp_url.format(i))
    19 
    20     def parse_url(self):
    21         while True: #在这里使用,子线程不会结束,把子线程设置为守护线程
    22             url = self.url_queue.get()
    23             # print(url)
    24             response = requests.get(url,headers=self.headers)
    25             self.html_queue.put(response.content.decode())
    26             self.url_queue.task_done()
    27 
    28 
    29     def get_content_list(self):  #提取数据
    30         while True:
    31             html_str = self.html_queue.get()
    32             html = etree.HTML(html_str)
    33             div_list = html.xpath("//div[@id='content-left']/div")
    34             content_list = []
    35             for div in div_list:
    36                 content = {}
    37                 content["content"] = div.xpath(".//div[@class='content']/span/text()")
    38                 content_list.append(content)
    39             self.content_list_queue.put(content_list)
    40             self.html_queue.task_done()
    41 
    42     def save_content_list(self):
    43         while True:
    44             content_list = self.content_list_queue.get()
    45             for content in content_list:
    46                 print(content) # 此处对数据进行保存操作
    47             self.content_list_queue.task_done()
    48 
    49 
    50     def run(self):
    51         thread_list = []
    52         #1.url_list
    53         t_url = threading.Thread(target=self.get_url_list)
    54         thread_list.append(t_url)
    55         #2.遍历,发送请求,
    56         for i in range(3):  #三个线程发送请求
    57             t_parse = threading.Thread(target=self.parse_url)
    58             thread_list.append(t_parse)
    59         #3.提取数据
    60         t_content = threading.Thread(target=self.get_content_list)
    61         thread_list.append(t_content)
    62         #4.保存
    63         t_save = threading.Thread(target=self.save_content_list)
    64         thread_list.append(t_save)
    65 
    66         for t in thread_list:
    67             t.setDaemon(True)  #把子线程设置为守护线程,当前这个线程不重要,主线程结束,子线程技术
    68             t.start()
    69 
    70         for q in [self.url_queue,self.html_queue,self.content_list_queue]:
    71             q.join()  #让主线程阻塞,等待队列的计数为0,
    72 
    73         print("主线程结束")
    74 
    75 if __name__ == '__main__':
    76 
    77     qiubai = Qiubai()
    78     qiubai.run()
  • 相关阅读:
    信息安全基本概念
    GmSSL开发环境搭建及双证书生成
    Git使用教程
    linux软件管理
    在Windows中查看文件的MD5值
    odoo显示页面格式化日期的一个方法
    Ubuntu 查看文件夹大小
    docker容器中启动postgresql 9.5失败:could not locate a valid checkpoint record
    如何让postgresql日志记录所有的执行语句
    vim快捷键
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9310329.html
Copyright © 2020-2023  润新知