• Multi-threaded crawler: qiushi


    import requests
    from lxml import etree
    import time
    import threading
    from queue import Queue
    import random

    class QiushiSpider(object):
        def __init__(self):
            self.base_url = 'https://www.qiushibaike.com/8hr/page/{}'
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"}
            # running total of parsed items
            self.count = 0
            # queue of page URLs to fetch
            self.url_queue = Queue()
            # queue of fetched response bodies
            self.response_queue = Queue()
            # queue of parsed data waiting to be saved
            self.data_queue = Queue()


        # build the URL for every page and enqueue it
        def get_url_list(self):
            for i in range(1, 14):
                self.url_queue.put(self.base_url.format(i))

        def send_request(self):
            while True:
                # throttle requests slightly
                time.sleep(1)
                # pull the next URL off the queue (blocks until one is available)
                url = self.url_queue.get()
                print(url)
                # free sample proxies from the original post; almost certainly stale by now
                proxy_list = [
                    {"http": '111.155.116.229:8123'},
                    {"http": '61.135.217.7:80'},
                ]
                # pick a proxy at random
                proxy = random.choice(proxy_list)
                try:
                    response = requests.get(url, headers=self.headers, proxies=proxy)
                except requests.RequestException:
                    # request failed outright (e.g. dead proxy): requeue the URL and retry
                    self.url_queue.put(url)
                    self.url_queue.task_done()
                    continue
                if response.status_code == 200:
                    # enqueue the decoded page body for the parser thread
                    self.response_queue.put(response.content.decode())
                else:
                    # non-200 response: requeue the URL so it gets retried
                    self.url_queue.put(url)
                # mark this queue task as done
                self.url_queue.task_done()

        def analysis_data(self):
            # pull page bodies from the response queue and parse them
            while True:
                data = self.response_queue.get()
                html_data = etree.HTML(data)
                # select the div of every post on the page
                div_list = html_data.xpath('//div[@id="content-left"]/div')
                for div in div_list:
                    # extract the author's nickname
                    nick_name = div.xpath('.//h2/text()')[0]
                    self.count += 1
                    self.data_queue.put(nick_name)
                self.response_queue.task_done()


        def save_data(self):
            # pull parsed items off the data queue and "save" them (printed here)
            while True:
                nick_name = self.data_queue.get()
                print(nick_name)
                self.data_queue.task_done()

        # wire up and start the whole crawl
        def work_spider(self):
            th_list = []
            # thread that builds the URL list
            th_url = threading.Thread(target=self.get_url_list)
            th_list.append(th_url)

            # fetcher threads (range(1, 2) starts only one; widen the range for more)
            for i in range(1, 2):
                th_request = threading.Thread(target=self.send_request)
                th_list.append(th_request)

            # parser thread
            th_analysis = threading.Thread(target=self.analysis_data)
            th_list.append(th_analysis)

            # saver thread
            th_save = threading.Thread(target=self.save_data)
            th_list.append(th_save)

            # start every thread as a daemon so they exit with the main thread
            for th in th_list:
                th.daemon = True
                th.start()

            # block the main thread until every queue has been fully processed
            for qu in [self.url_queue, self.response_queue, self.data_queue]:
                qu.join()

        def run(self):
            start_time = time.time()
            self.work_spider()
            end_time = time.time()
            print('Total time: {}'.format(end_time - start_time))
            print('Total items: {}'.format(self.count))

    if __name__ == '__main__':
        QiushiSpider().run()
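
The whole pipeline rests on the producer-consumer contract of queue.Queue: every get() must be paired with a task_done(), so that join() can unblock the main thread once each enqueued item has been fully processed, and the daemon flag lets the process exit as soon as the joins return. Below is a minimal, self-contained sketch of just that contract (the names q and consumer are illustrative, not part of the spider above); filling the queue before calling join() avoids the race where join() returns before a producer thread has put anything:

    import threading
    from queue import Queue

    q = Queue()

    # enqueue the work up front so join() below has tasks to wait on
    for i in range(10):
        q.put(i)

    def consumer():
        while True:
            item = q.get()
            print('processed', item)
            # without task_done(), q.join() would block forever
            q.task_done()

    # daemon thread: it dies automatically once the main thread finishes
    th = threading.Thread(target=consumer, daemon=True)
    th.start()

    # blocks until task_done() has been called once per put()
    q.join()

The spider above applies this same contract three times in a row, one join() per queue, which is why each worker loops forever and calls task_done() exactly once per get().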

  • Original post: https://www.cnblogs.com/hanjian200ok/p/9487523.html