• Python 之糗事百科多线程爬虫案例


    import requests
    from lxml import etree
    import json
    import threading
    import queue
    
    
    # HTML fetcher thread
    class GetHtml(threading.Thread):
        """Worker thread that downloads listing pages and queues their raw HTML.

        Page numbers are consumed from ``page_queue``; every successfully
        downloaded response body is put on the module-level ``data_queue``.
        """

        # Number of download attempts per page before giving up.
        MAX_ATTEMPTS = 5
        # Seconds to wait for the server so a stalled connection cannot hang the thread.
        REQUEST_TIMEOUT = 10

        def __init__(self, page_queue):
            """Remember the queue of page numbers this worker pulls from."""
            threading.Thread.__init__(self)
            self.page_queue = page_queue

        def run(self):
            """Thread entry point; delegates to :meth:`do_get_html`."""
            self.do_get_html()

        def do_get_html(self):
            """Download pages until ``self.page_queue`` is drained.

            Each page is tried up to ``MAX_ATTEMPTS`` times. Request errors are
            printed and retried; when every attempt fails a "time out" line is
            printed for that URL and the worker moves on to the next page.
            """
            headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0"}
            while True:
                try:
                    # get_nowait() instead of empty()+get(): with many workers
                    # another thread may take the last item between the two
                    # calls, which would leave this thread blocked forever.
                    page = self.page_queue.get_nowait()
                except queue.Empty:
                    break
                url = "https://www.qiushibaike.com/8hr/page/%s/" % str(page)
                for _attempt in range(self.MAX_ATTEMPTS):
                    try:
                        _response = requests.get(
                            url, headers=headers, timeout=self.REQUEST_TIMEOUT
                        )
                    except requests.exceptions.RequestException as e:
                        # The builtin ConnectionError is not a base class of
                        # the exceptions raised by requests, so catch the
                        # library's own base exception.
                        print(e)
                        continue
                    # Hand the raw bytes to the parser threads.
                    data_queue.put(_response.content)
                    break
                else:
                    # Reached only when no attempt succeeded.
                    print("time out, url: " + url)
    
    
    class ParseHtml(threading.Thread):
        """Worker thread that parses queued HTML pages into JSON lines.

        Pages are consumed from the module-level ``data_queue``. Every parsed
        item is written as one UTF-8 encoded JSON line to the module-level
        binary file ``f`` and counted in the module-level ``total``.
        """

        # Shared by all parser threads: serialises updates of ``total`` and
        # writes to ``f`` so counts are not lost and lines do not interleave.
        _write_lock = threading.Lock()

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            """Thread entry point; delegates to :meth:`do_parse_data`."""
            self.do_parse_data()

        @staticmethod
        def _first(values, default):
            """Return the first XPath match, or ``default`` when there is none."""
            return values[0] if values else default

        @classmethod
        def _extract_item(cls, node):
            """Build the result dict for one item node.

            Raises IndexError when the user name, avatar or vote count is missing.
            """
            username = node.xpath(".//a[@class='recmd-user']/img/@alt")[0]
            user_img = node.xpath(".//a[@class='recmd-user']/img/@src")[0]
            zan_num = node.xpath(".//div[@class='recmd-num']/span[position()=1]/text()")[0]
            ping_num = node.xpath(".//div[@class='recmd-num']/span[position()=4]/text()")
            content = node.xpath(".//a[@class='recmd-content']/text()")
            return {
                "username": username,
                "imgUrl": user_img,
                "vote": zan_num,
                "comments": cls._first(ping_num, 0),
                "content": cls._first(content, ""),
            }

        def do_parse_data(self):
            """Parse pages until ``data_queue`` is drained.

            A page that cannot be parsed, or an item lacking a required field,
            is reported with ``print`` and skipped instead of killing the thread.
            """
            global total, f
            while True:
                try:
                    # get_nowait() avoids blocking forever when another thread
                    # takes the last page between an empty() check and get().
                    html = data_queue.get_nowait()
                except queue.Empty:
                    break
                try:
                    text = etree.HTML(html)
                    list_node = text.xpath("//li[contains(@id, 'qiushi_tag_')]")
                except (RuntimeError, etree.LxmlError) as e:
                    print(e)
                    continue
                for node in list_node:
                    try:
                        result = self._extract_item(node)
                    except IndexError as e:
                        print(e)
                        continue
                    line = (json.dumps(result, ensure_ascii=False) + "\n").encode("utf-8")
                    with self._write_lock:
                        total += 1
                        f.write(line)
    
    
    def main(page_count=20, thread_count=100):
        """Run the crawl: fetch every page, then parse everything fetched.

        Uses the module-level ``page_queue``, ``f`` and ``total``.

        Args:
            page_count: Number of listing pages to fetch (pages 1..page_count).
            thread_count: Number of threads started for each of the two phases.
        """
        try:
            # Queue the page numbers to be fetched.
            for page in range(1, page_count + 1):
                page_queue.put(page)

            # Phase 1: start the fetcher threads and wait for all of them.
            get_html_thread = []
            for _ in range(thread_count):
                get_html = GetHtml(page_queue)
                get_html.start()
                get_html_thread.append(get_html)
            for thread in get_html_thread:
                thread.join()

            # Phase 2: start the parser threads only after fetching finished,
            # then wait for all of them.
            parse_html_thread = []
            for _ in range(thread_count):
                parse_html = ParseHtml()
                parse_html.start()
                parse_html_thread.append(parse_html)
            for thread in parse_html_thread:
                thread.join()
        finally:
            # Close the output file even if the run is interrupted.
            f.close()
        print("采集数据完成,总共%s条数据" % total)
    
    
    if __name__ == "__main__":
        # Module-level state shared by main() and the worker threads:
        # the item counter, the two work queues and the binary output file.
        total = 0
        page_queue = queue.Queue()
        data_queue = queue.Queue()
        f = open("./qunaerwang.json", "wb")
        main()

    数据:

    {"username": "夲少姓〖劉〗", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1187/11878716/thumb/20190520091055.jpg?imageView2/1/w/50/h/50", "vote": "873", "comments": "66", "content": "马中赤兔人中啥了?"}
    {"username": "一枕清霜゛", "imgUrl": "//pic.qiushibaike.com/system/avtnew/3371/33712263/thumb/20190511210156.jpg?imageView2/1/w/50/h/50", "vote": "1224", "comments": "7", "content": "一个段子手,一个神回复"}
    {"username": "窝里斗窝里", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1427/14275616/thumb/20181228173532.jpg?imageView2/1/w/50/h/50", "vote": "418", "comments": "7", "content": "鹰科猛禽走路的姿势看上去总是屌屌的!!"}
    {"username": "2丫头还是个宝宝", "imgUrl": "//pic.qiushibaike.com/system/avtnew/2219/22190863/thumb/20190131225946.jpg?imageView2/1/w/50/h/50", "vote": "801", "comments": "26", "content": "都说孩子玩沙子有助于孩子的智力发育,所以家里买了一车沙子放院子给逗逗玩。逗逗拿了一个铲子和一个望远镜玩具,当着我的面把望远镜在埋沙子里。拉着我的手:妈妈,我在沙"}
    {"username": "★像风一样一样★", "imgUrl": "//pic.qiushibaike.com/system/avtnew/2716/27163432/thumb/20180306191622.JPEG?imageView2/1/w/50/h/50", "vote": "274", "comments": "8", "content": "去朋友家看到的,特殊的插排,一个插排才多少钱啊?"}
    {"username": "无语滴滴", "imgUrl": "//pic.qiushibaike.com/system/avtnew/3782/37821797/thumb/20190430173233.jpg?imageView2/1/w/50/h/50", "vote": "603", "comments": "15", "content": "朋友和他女友吵架闹分手,我们都去劝。他女友抹抹眼泪看着窗外说了一句话:“要不是还有几个快递在路上,我真想死了算了。”"}
    {"username": "愚人愚之不如愚己", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1927/19270659/thumb/20160618154530.jpg?imageView2/1/w/50/h/50", "vote": "2055", "comments": "122", "content": "老司机你发了多大的誓?"}
    余下数据省略……

     往后思路:

      1、保存到数据库

      2、保存到 Redis 中,然后再同步到数据库

  • 相关阅读:
    Drcom账户管理Server端解说
    Hadoop常见异常及其解决方式
    PHP 获取网络接口文件流
    【刷题小记67】三角形面积
    Tiny语言编译器简单介绍
    矩阵十题【六】 poj3070 Fibonacci
    函数名称
    设计模式--6大原则--开闭原则
    LeetCode96:Unique Binary Search Trees
    [Swift]LeetCode958. 二叉树的完全性检验 | Check Completeness of a Binary Tree
  • 原文地址:https://www.cnblogs.com/yang-2018/p/10943082.html
Copyright © 2020-2023  润新知