• 利用多线程爬取不得姐网站


    利用到的库

    • time, requests, lxml, queue, threading

    功能

    • 爬取不得姐网站中前二十页的段子数据
    import time
    import requests
    from lxml import etree
    from queue import Queue
    import threading
    
    
    class bsSpider:
        """Multi-threaded scraper that prints joke text from budejie.com pages.

        Work flows through two queues: page URLs are placed on ``urlQueue``,
        fetch workers download each page and push its HTML onto ``resQueue``,
        and parse workers pull that HTML, extract the joke text and print it.
        """

        def __init__(self):
            self.baseUrl = "http://www.budejie.com/"
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}
            # Queue of page URLs waiting to be fetched.
            self.urlQueue = Queue()
            # Queue of response HTML waiting to be parsed.
            self.resQueue = Queue()

        def getUrl(self):
            """Fill ``urlQueue`` with the URLs of pages 1 through 20."""
            for pNumber in range(1, 21):
                url = self.baseUrl + str(pNumber)
                self.urlQueue.put(url)

        def getHtml(self):
            """Fetch worker: download each queued URL and enqueue its HTML.

            Loops forever, so it is meant to run on a daemon thread. A failed
            request is reported on stderr and skipped; the worker keeps running.
            """
            import sys  # local import: only used for error reporting

            while True:
                # 1. Take the next URL off the URL queue.
                url = self.urlQueue.get()
                try:
                    # 2. Request the page; the timeout keeps a stalled
                    #    connection from blocking this worker indefinitely.
                    res = requests.get(url, headers=self.headers, timeout=10)
                    res.encoding = "utf-8"
                    # Enqueue the HTML *before* task_done() below, so that once
                    # urlQueue.join() returns every fetched page is already
                    # counted by resQueue.join().
                    self.resQueue.put(res.text)
                except requests.RequestException as err:
                    print("request failed for %s: %s" % (url, err), file=sys.stderr)
                finally:
                    # Always mark the URL as handled; otherwise a single failure
                    # would make urlQueue.join() wait forever.
                    self.urlQueue.task_done()

        def getText(self):
            """Parse worker: extract and print the joke text of each queued page.

            Loops forever, so it is meant to run on a daemon thread. A page that
            cannot be parsed is reported on stderr and skipped.
            """
            import sys  # local import: only used for error reporting

            while True:
                html = self.resQueue.get()
                try:
                    parseHtml = etree.HTML(html)
                    # NOTE(review): depending on the lxml version, an empty
                    # document may yield None instead of raising - guard both.
                    if parseHtml is not None:
                        r_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
                        for r in r_list:
                            print(r + "\n")
                except (etree.LxmlError, ValueError) as err:
                    print("failed to parse page: %s" % (err,), file=sys.stderr)
                finally:
                    # Always mark the page as handled so resQueue.join() can return.
                    self.resQueue.task_done()

        def run(self):
            """Queue the page URLs, start the workers and wait until all work is done."""
            # Populate the URL queue before any worker starts.
            self.getUrl()

            thList = []
            # 10 fetch workers.
            for _ in range(10):
                thList.append(threading.Thread(target=self.getHtml, daemon=True))
            # 3 parse workers.
            for _ in range(3):
                thList.append(threading.Thread(target=self.getText, daemon=True))

            # Workers loop forever; they are daemon threads so they do not keep
            # the process alive after the queues have been drained.
            for th in thList:
                th.start()

            # Block until every URL has been fetched and every page parsed.
            self.urlQueue.join()
            self.resQueue.join()
    
    
    if __name__ == '__main__':
        # Time the whole crawl. perf_counter() is a monotonic clock intended
        # for measuring intervals, unlike time.time(), which follows the
        # adjustable wall clock.
        begin = time.perf_counter()
        spider = bsSpider()
        spider.run()
        end = time.perf_counter()
        # Elapsed time in seconds.
        print(end - begin)
    
  • 相关阅读:
    LintCode-乱序字符串
    LintCode-字符串查找
    LintCode-比较字符串
    LintCode-两个字符串是变位词
    LintCode-不同的子序列
    View(视图)——AutoCompleteTextView、Spinner
    View(视图)——GridView(点击长按删除图片)
    给图片下加字符
    View(视图)——ListView:BaseAdapter、SimpleCursorAdapter
    View(视图)——ListView:ArrayAdapter、SimpleAdapter
  • 原文地址:https://www.cnblogs.com/zengsf/p/10040162.html
Copyright © 2020-2023  润新知