• Lesson 10 - Queues, and Crawler Examples Using Multithreading and Queues


    1. Queue code example

    import threading
    import time
    from queue import Queue
    '''
    Queue is a thread-safe FIFO queue from the Python standard library.
    '''
    def set_data(q):
        # Producer: push an increasing counter, one item every 3 seconds
        index = 0
        while True:
            q.put(index)  # blocks if the queue is already full
            index += 1
            time.sleep(3)
    
    def get_data(q):
        # Consumer: get() blocks until an item is available
        while True:
            print(q.get())
    
    if __name__ == '__main__':
        q = Queue(4)  # bounded queue: put() blocks once 4 items are waiting
        t1 = threading.Thread(target=set_data, args=[q])
        t2 = threading.Thread(target=get_data, args=[q])
        t1.start()
        t2.start()
        # Quick tour of the Queue API on a fresh queue; the threads above
        # still hold a reference to the first queue, so they are unaffected.
        # Note: both threads loop forever, so this demo must be stopped manually.
        q = Queue(1)
        q.put(1)
        q.get(timeout=1)   # raises queue.Empty if nothing arrives within 1 second
        print(q.empty())   # True
        print(q.full())    # False; full() takes no arguments
        print(q.qsize())   # 0
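
    A note on blocking behavior: with a bounded Queue, put() blocks when the queue is full and get() blocks when it is empty. The non-blocking variants raise queue.Full / queue.Empty instead, which is often cleaner than polling qsize(). A minimal sketch of those variants (not part of the lesson code above):

    from queue import Queue, Full, Empty
    
    q = Queue(1)
    q.put_nowait("a")       # succeeds; the queue is now full
    try:
        q.put_nowait("b")   # full, so this raises Full immediately
    except Full:
        print("queue is full")
    print(q.get_nowait())   # prints "a"
    try:
        q.get_nowait()      # empty, so this raises Empty immediately
    except Empty:
        print("queue is empty")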
    

    2. Doutula (doutula.com) meme crawler in practice

    import requests
    import threading
    from queue import Queue, Empty
    from lxml import etree
    from urllib import request  # only used by the commented-out urlretrieve alternative
    
    g_flag = True
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    
    # Producer: scrape image links from the list pages
    def put_picture_link(q):
        global g_flag
        for i in range(1, 11):
            text = requests.get(url="http://www.doutula.com/article/list/?page={}".format(i), headers=HEADERS).text
            html = etree.HTML(text)
            # Skip the lazy-load gif placeholders; the real image URL is in data-original
            imgs_elements = html.xpath(
                '//div[@class="col-sm-9 center-wrap"]/a/div[@class="random_article"]/div/img[@class!="gif"]')
            for img_element in imgs_elements:
                image_link = img_element.xpath("@data-original")[0]
                q.put(image_link)
        g_flag = False  # tell the consumer that no more links are coming
    
    
    # Consumer: download each image from the queue
    def download(q):
        index = 1
        while g_flag or q.qsize() > 0:
            try:
                img_link = q.get(timeout=1)
            except Empty:
                continue  # queue momentarily empty; re-check the loop condition
            result = requests.get(url=img_link)
            if result.status_code == 200:
                my_picture = result.content
                suffix = img_link.split(".")[-1]  # keep the original file extension
                with open("c:/pictures/{}.{}".format(index, suffix), "wb") as fp:
                    fp.write(my_picture)
                # Alternative: download straight to disk with urllib
                # request.urlretrieve(url=img_link, filename="c:/pictures/{}.{}".format(index, suffix))
                index += 1
    
    
    if __name__ == '__main__':
        q = Queue(10)  # initialize a bounded queue of image links
        t1 = threading.Thread(target=put_picture_link, args=[q])
        t1.start()
        t2 = threading.Thread(target=download, args=[q])
        t2.start()
        print("Main thread finished!!!")
    

    3. Budejie (budejie.com) joke crawler in practice

    """百思不得姐爬虫实战"""
    import threading
    from lxml import etree
    import requests
    from queue import Queue
    import csv
    
    g_Lock = threading.Lock()
    g_flag = True
    
    DOMAIN = "http://www.budejie.com/"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    # Producer: fetch each listing page and push parsed jokes onto the content queue
    class Producer(threading.Thread):
        def __init__(self, queue_url, queue_content):
            super(Producer, self).__init__()
            self.__queue_url = queue_url
            self.__queue_content = queue_content
    
        def run(self):
            global g_flag
            while self.__queue_url.qsize() > 0:
                url = self.__queue_url.get()
                text = requests.get(url=url, headers=HEADERS).text
                html = etree.HTML(text)
                contents = html.xpath('//div[@class="g-mn"]//div[@class="j-r-list"]//ul//div[@class="j-r-list-c-desc"]/a')
    
                for c in contents:
                    # Strip zero-width spaces (U+200B) from the joke text
                    content = c.xpath("text()")[0].replace("\u200b", "")
                    link = DOMAIN + c.xpath("@href")[0]
                    content_dict = {"joke": content, "link": link}
                    self.__queue_content.put(content_dict)
    
            g_flag = False  # signal the consumers that no more content is coming
    
            print("----------------------- all requests completed -----------------------")
    
    # Consumer: pop jokes off the content queue and write them to the CSV file
    class Consumer(threading.Thread):
        def __init__(self, queue_content, writer, i):
            super(Consumer, self).__init__()
            self.__queue_content = queue_content
            self.__writer = writer
            self.__i = i
    
        def run(self):
            while True:
                if self.__queue_content.qsize() > 0 or g_flag:
                    try:
                        content_dict = self.__queue_content.get(timeout=1)
                        with g_Lock:  # csv writers are not thread-safe; serialize writes
                            self.__writer.writerow(content_dict)
                    except Exception as e:
                        print("queue is empty: {}".format(e))
                else:
                    break
                print("Thread {}".format(self.__i), g_flag, self.__queue_content.qsize())
    
    
    
    if __name__ == '__main__':
        q_url = Queue(100)
        q_content = Queue(100)
        for i in range(1, 25):
            q_url.put("http://www.budejie.com/text/{}".format(i))
    
        header = ["joke", "link"]
        fp = open("text.csv", "w", encoding="utf-8", newline="")
        writer = csv.DictWriter(fp, header)
        writer.writeheader()
        for i in range(0, 1):  # a single consumer; widen the range to add more
            c = Consumer(q_content, writer, i)
            c.start()
        p = Producer(q_url, q_content)
        p.start()
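
    For coordinated shutdown, queue.Queue also offers join() and task_done(): the main thread can block until every queued item has been processed, with consumers running as daemon threads so they die with the process. A minimal, self-contained sketch (illustrative only, not tied to the crawler above):

    import threading
    from queue import Queue
    
    q = Queue()
    
    def worker():
        while True:
            item = q.get()
            print("processed", item)
            q.task_done()  # mark one queued item as finished
    
    threading.Thread(target=worker, daemon=True).start()
    for i in range(5):
        q.put(i)
    q.join()  # returns once task_done() has been called for every item
    print("all work done")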
    

      
