• 喜欢去知乎炸鱼?用python吧


    喜欢去知乎炸鱼?用python吧

     

    知乎高赞贴:

    有一双大长腿是什么体验?

    有一副迷人的身材是什么体验?

    别用手机费劲地翻了,Python帮你一臂之力

    复制代码
    import json
    import os
    import re
    import ssl
    import urllib.request
    from os.path import basename
    from urllib.parse import urlsplit

    import requests
    
    # Globally disable HTTPS certificate verification by replacing the ssl
    # module's default-context factory with the unverified one.
    # NOTE(review): this presumably applies to every urllib HTTPS request made
    # by the process and removes protection against man-in-the-middle
    # attacks - confirm it is really required.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # Request headers; the script entry point passes them to get_image_url().
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        'Accept-Encoding': 'gzip, deflate'
    }
    
    
    def mkdir(path):
        """Make sure ``path`` exists, printing what happened.

        Args:
            path: Directory to create (intermediate directories included).

        Returns:
            True when the directory was missing and has just been created,
            False when something already exists at ``path``.
        """
        already_there = os.path.exists(path)
        if already_there:
            # The location is reported relative to the current working directory.
            print("图片存放于:", os.getcwd() + os.sep + path)
            return False

        print('新建文件夹:', path)
        os.makedirs(path)
        return True
    
    
    def download_pic2(img_lists, dir_name):
        """Fetch every URL in ``img_lists`` into ``dir_name`` via auto_download().

        The local file name is the last path component of the URL. Files that
        already exist in ``dir_name`` are reported and skipped.

        Args:
            img_lists: Sequence of image URLs.
            dir_name: Directory that receives the files.
        """
        total = len(img_lists)
        print("一共有{num}张照片".format(num=total))

        # ``position`` is the 1-based progress counter shown after each download.
        for position, image_url in enumerate(img_lists, start=1):
            target = os.sep.join([dir_name, basename(urlsplit(image_url).path)])

            if not os.path.exists(target):
                auto_download(image_url, target)
                print("下载{pic_name}完成!({index}/{sum})".format(pic_name=target, index=position, sum=total))
            else:
                # Already on disk: nothing to download for this URL.
                print("文件{file_name}已存在。".format(file_name=target))
    
    
    def auto_download(url, file_name, max_retries=5):
        """Download ``url`` to ``file_name``, retrying a bounded number of times.

        The previous implementation retried by calling itself without any
        limit, so a permanently failing URL (an HTTP 404 is a ``URLError`` too)
        recursed until ``RecursionError``. Retries are now a plain loop.

        Args:
            url: Address of the resource to fetch.
            file_name: Local path the resource is written to.
            max_retries: How many times to retry after the first failed
                attempt; negative values are treated as 0.

        Raises:
            urllib.error.ContentTooShortError: the download was still
                truncated on the last attempt.
            urllib.error.URLError: the request still failed on the last attempt.
        """
        retries_left = max(int(max_retries), 0)
        while True:
            try:
                urllib.request.urlretrieve(url, file_name)
                return
            except urllib.request.ContentTooShortError:
                # Must be handled before URLError, which is its base class.
                if retries_left == 0:
                    # Remove the truncated file so a later run does not treat
                    # it as already downloaded.
                    if os.path.exists(file_name):
                        os.remove(file_name)
                    raise
                print("文件下载不完整,重新下载。")
            except urllib.request.URLError:
                if retries_left == 0:
                    raise
                print("网络连接出错,尝试重新下载。")
            retries_left -= 1
    
    
    def download_pic(img_lists, dir_name):
        """Download each image in ``img_lists`` into ``dir_name`` with requests.

        URLs that do not answer with HTTP 200 are skipped silently. The local
        file name is the last path component of the URL.

        Args:
            img_lists: Sequence of image URLs.
            dir_name: Existing directory that receives the files.
        """
        print("一共有{num}张照片".format(num=len(img_lists)))
        for image_url in img_lists:
            # No stream=True: the whole body is read through ``.content``
            # anyway, and streaming would leave the body of a skipped
            # (non-200) response unread.
            response = requests.get(image_url)
            if response.status_code != 200:
                continue

            file_name = dir_name + os.sep + basename(urlsplit(image_url)[2])

            try:
                # ``with`` closes the file; the former ``finally:
                # picture.close()`` raised NameError when open() itself failed.
                with open(file_name, "wb") as picture:
                    picture.write(response.content)
            except IOError:
                print("IO Error\n")
                continue

            print("下载{pic_name}完成!".format(pic_name=file_name))
    
    
    def get_image_url(qid, headers, session=None):
        """Collect image URLs from the answers of one Zhihu question.

        Posts to ``https://www.zhihu.com/node/QuestionAnswerListV2`` ten answers
        at a time until a page with no answers comes back. From every answer it
        takes each ``data-original="..."`` value, strips backslashes, and keeps
        the values that end in ``r.jpg``.

        Args:
            qid: Numeric question id (anything ``int()`` accepts).
            headers: HTTP headers sent with every request.
            session: Optional object with a requests-style
                ``post(url, headers=..., data=...)`` method whose result has a
                ``text`` attribute. A new ``requests.Session`` is created when
                omitted.

        Returns:
            List of image URLs in the order they were found. Duplicates are
            removed within a single answer only.

        Raises:
            ValueError: a response body is not valid JSON, or ``qid`` is not
                numeric.
            KeyError: a response has no ``"msg"`` entry.
        """
        tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
        page_size = 10
        # NOTE(review): paging starts at offset 10, so the first ten answers
        # (offset 0) are never requested. Kept as in the original - confirm
        # whether that is intended.
        offset = page_size
        image_urls = []
        # Compiled once instead of once per page.
        imgreg = re.compile('data-original="(.*?)"', re.S)

        if session is None:
            session = requests.Session()

        while True:
            postdata = {
                'method': 'next',
                'params': json.dumps({
                    "url_token": int(qid),
                    "pagesize": str(page_size),
                    "offset": offset,
                }),
            }
            page = session.post(tmp_url, headers=headers, data=postdata)
            # SECURITY: the body was previously passed to eval(), which runs
            # arbitrary code sent by the server. It is parsed as JSON instead.
            ret = json.loads(page.text)
            answers = ret['msg']
            print(u"答案数:%d" % (len(answers)))

            offset += page_size

            if not answers:
                # Integer division: the count used to print as a float ("3.0").
                print("图片 URL 获取完毕, 页数: ", (offset - page_size) // page_size)
                return image_urls

            for answer in answers:
                # Order-preserving de-duplication; set() made the order of the
                # result vary between runs.
                seen = set()
                for item in imgreg.findall(answer):
                    # Drop the escape character '\' left in the URL.
                    image_url = item.replace("\\", "")
                    if image_url in seen:
                        continue
                    seen.add(image_url)
                    # Only URLs ending in 'r.jpg' are kept; avatars and other
                    # images are dropped.
                    if image_url.endswith('r.jpg'):
                        print(image_url)
                        image_urls.append(image_url)

            print('size: %d, num : %d' % (offset, len(image_urls)))
    
    
    if __name__ == '__main__':
        # Question to scrape: its numeric id and its title. The title is only
        # used to name the output folder.
        question_id = 297715922
        title = '拥有一副令人羡慕的好身材是怎样的体验?'

        # Other questions that can be swapped in above (question_id, title):
        #   26037846   '身材好是一种怎样的体验?'
        #   291678281  '女孩子胸大是什么体验?'
        #   310786985  '女生什么样的腿是美腿?'
        #   275359100  '你的择偶标准是怎样的?'
        #   63727821   '什么样才叫好看的腿?'
        #   307403214  '身材对女生很重要吗?'
        #   273711203  '女生腿长是什么样的体验?'
        #   315236887  '女生腕线过裆是怎样一种体验?'
        #   292901966  '有着一双大长腿是什么感觉?'
        #   285321190  '拥有一双大长腿是怎样的体验?'
        #   26297181   '大胸女生如何穿衣搭配?'
        #   293482116  '胸大到底怎么穿衣服好看?'

        # Page address of the question; not used by the calls below.
        zhihu_url = "https://www.zhihu.com/question/{qid}".format(qid=question_id)

        # Images go into a folder named "<question id>_<title>".
        path = '_'.join([str(question_id), title])
        mkdir(path)

        # Collect the image addresses, then save the images.
        img_list = get_image_url(question_id, headers)
        download_pic2(img_list, path)
     
  • 相关阅读:
    从点滴看管理之新生代员工培养方式的思考
    手机应用PC端演示工具介绍
    Python Day 66 Django框架、Auth认证模块、Auth模块常用方法、扩展默认的auth_user表、补充orm 模型类中releatename属性和 自关联
    Python Day 65 Django框架、Django生命周期、Django中间件、中间件执行流程、Django中MTV模式 和 MVC模式
    Python Day 64 Django框架、cookie和session、Django中Session相关方法、Django中支持Session5种存储介质
    Python Day 63 Django框架、Django模板系统(渲染页面的作用)
    Python Day 62 Django框架、Django框架中分页 、 网页攻击
    Python Day 61 Django框架、Django框架ORM一对一表操作、Django列类型(重点)、自定义列类型、Django-amdin自带管理后台
    Python Day 60 Django框架、ORM高级查询、级联删除、增加多条数据、Django中Xss攻击、事务
    Python Day 59 Django框架、Django中ORM多对多表操作(联合唯一索引)
  • 原文地址:https://www.cnblogs.com/xingkongzhizhu/p/11141774.html
Copyright © 2020-2023  润新知