• 【爬虫】多线程爬取糗事百科写入文件


    '''
    爬取糗事百科的段子,将内容和链接爬取下来,写入csv
    使用技术:多线程,锁,队列,xpath,csv
    '''
    
    import csv
    import threading
    from queue import Empty, Queue

    import requests

    from lxml import etree
    
    
    class Creeper(threading.Thread):
        """Producer thread: pulls listing-page URLs off url_queue, scrapes
        each page, and pushes (content, link) tuples onto content_queue."""

        def __init__(self, url_queue, content_queue, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue          # URLs still to crawl
            self.content_queue = content_queue  # scraped (content, link) results

        def run(self):
            """Consume URLs until the queue is exhausted, then exit."""
            while True:
                try:
                    # Non-blocking get: checking empty() and then get()
                    # separately is racy when several Creeper threads run.
                    url = self.url_queue.get(block=False)
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one listing page and extract every joke's text and link."""
            headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
            response = requests.get(url, headers=headers)
            text = etree.HTML(response.text)
            divEle = text.xpath('//div[contains(@class,"article block")]')
            for div in divEle:
                content = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
                # Strip newlines inside each fragment, then join fragments
                # one-per-line. (Original paste had these literals garbled.)
                new_content = "\n".join(
                    fragment.replace("\n", "") for fragment in content
                )
                a_url = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
                self.content_queue.put((new_content, a_url))
    
    class SaveFile(threading.Thread):
        """Consumer thread: pops (content, link) tuples off content_queue
        and appends them as CSV rows via the shared writer.

        Exits after *timeout* seconds with no new items (the producers are
        assumed to have finished by then).
        """

        def __init__(self, content_queue, writer, lock, *args, timeout=30, **kwargs):
            super().__init__(*args, **kwargs)
            self.content_queue = content_queue  # (content, link) tuples to persist
            self.writer = writer                # shared csv.writer
            self.lock = lock                    # guards the shared writer/file
            self.timeout = timeout              # idle seconds before giving up

        def run(self):
            while True:
                try:
                    content, link = self.content_queue.get(timeout=self.timeout)
                except Empty:
                    # Queue stayed empty for the whole timeout: assume done.
                    break
                # The csv writer is shared between threads, so writes must
                # be serialized; `with` releases the lock even on error.
                with self.lock:
                    self.writer.writerow((content, link))
                print('保存一条')
    
    
    def main():
        """Wire up queues, spawn 2 crawler and 2 writer threads, and wait
        for them all to finish before closing the output file."""
        url_queue = Queue(100)
        content_queue = Queue(300)
        base_url = "https://www.qiushibaike.com/text/page/{}/"
        gLock = threading.Lock()
        # utf-8-sig writes a BOM so Excel detects the encoding (avoids
        # mojibake for Chinese text); newline="" is required by csv.
        with open('糗事百科.csv', 'a', encoding='utf-8-sig', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['content', 'link'])

            # Pages 1..12 of the text section.
            for page in range(1, 13):
                url_queue.put(base_url.format(page))

            threads = []
            for _ in range(2):
                creeper = Creeper(url_queue, content_queue)
                creeper.start()
                threads.append(creeper)
            for _ in range(2):
                saver = SaveFile(content_queue, writer, gLock)
                saver.start()
                threads.append(saver)

            # Join everything so the file is not closed while writer
            # threads are still using it (original never closed f at all).
            for t in threads:
                t.join()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    《分析服务从入门到精通读书笔记》第五章、创建多维数据集之浏览数据(1)
    灵活运用SQL Server2008 SSIS变量
    SSIS几个通用属性
    [原创]Silverlight开发实践系列导航(提供源码)
    【转】Scott_ASP.NET MVC框架(第四部分) 处理表单编辑和提交场景
    C#数据结构(二)栈和队列
    C#数据结构(一)线性表
    django学习(一)
    谈谈电子商务网站的促销规则(定价策略)设计(三)
    GAE使用中的问题
  • 原文地址:https://www.cnblogs.com/st-st/p/10413603.html
Copyright © 2020-2023  润新知