• 爬取知乎话题async使用协程


    import requests
    import json
    import time
    from pyquery import PyQuery
    import pandas as pd
    from collections import OrderedDict
    import multiprocessing
    import asyncio
    from functools import partial
    # Disabled interactive prompts (would ask the user for a Cookie / start URL at runtime):
    # cookies = input('请输入Cookie:')
    # url = input('请输入url:')
    # Zhihu mobile API endpoint for a topic's "top activity" feed (topic id 19562045),
    # paginated via the offset/limit query parameters; each response carries the next page URL.
    init_url = 'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10'
    # Request headers mimicking iPhone Safari; the Cookie value is redacted ('**') and must
    # be filled in with a real logged-in session cookie for the API to respond.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
        'Cookie': '**',
        'Referer': 'https://www.zhihu.com/topic/19606409/hot',
        'Host': 'www.zhihu.com',
        'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA='
    }
    
    def get_all_url(url):
        """Walk the paginated feed starting at *url*, appending every "next page"
        URL to the module-global ``url_list``.

        Fixes over the original recursive version:
        - iterative loop, so a topic with many pages cannot hit RecursionError;
        - always returns ``url_list`` (the recursive branch used to drop the
          return value, so callers got ``None`` unless page 1 was the last).

        :param url: feed API URL to start from (e.g. ``init_url``).
        :returns: the global ``url_list`` with all collected page URLs.
        """
        while True:
            res = requests.get(url, headers=headers)
            data = json.loads(res.text)
            next_page_url = data['paging']['next']
            url_list.append(next_page_url)
            print(len(url_list))  # progress indicator: pages discovered so far
            if data['paging']['is_end']:  # API flag: no more pages after this one
                return url_list
            url = next_page_url
    
    
    
    async def get_all_data(url):
        """Fetch one feed page and append every 'answer' item to the module-global
        ``data_list`` as an OrderedDict with title/content/comment_count/voteup_count.

        The blocking ``requests.get`` call is pushed onto the default thread-pool
        executor so multiple pages download concurrently.  Fixes over the original:
        obtains the loop locally instead of relying on a ``loop`` global defined
        later in ``__main__``, and no longer shadows the builtin ``type``.

        :param url: one page URL collected by ``get_all_url``.
        """
        loop = asyncio.get_event_loop()  # inside a coroutine this is the running loop
        future = loop.run_in_executor(None, partial(requests.get, url, headers=headers))
        res = await future
        data = json.loads(res.text)
        print(len(data_list))  # progress indicator: records collected so far
        for item in data['data']:
            target = item['target']
            if target['type'] != 'answer':  # skip articles, questions, etc.
                continue
            final_data = OrderedDict()
            final_data['title'] = target['question']['title'] or ''
            try:
                final_data['content'] = PyQuery(target['content']).text()
            except Exception:
                # 'content' may be absent or unparseable; fall back to the excerpt
                final_data['content'] = PyQuery(target['excerpt']).text()
            final_data['comment_count'] = target['comment_count']
            final_data['voteup_count'] = target['voteup_count']
            data_list.append(final_data)
    
    if __name__ == '__main__':
        # Globals mutated by get_all_url / get_all_data.
        data_list = []
        url_list = []
        get_all_url(init_url)

        # Obtain the loop *before* scheduling: asyncio.ensure_future needs an
        # event loop, and relying on implicit creation is deprecated behavior.
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(get_all_data(url)) for url in url_list]
        # asyncio.wait([]) raises ValueError, so guard against an empty page list.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

        # Dump everything scraped into a timestamped Excel workbook.
        df1 = pd.DataFrame(data_list)
        df1.to_excel('保险' + time.strftime("%Y%m%d%H%M%S") + '.xlsx', index=False)
        print('done')
  • 相关阅读:
    PostgreSQL pg_ident.conf 文件简析
    使用 iptables 限制黑客猜密码续—深入 recent 模块
    从零开始安装 Drupal 7
    使用tween.js移动three.js相机创建转场动画
    容器化导致RocketMQ消息囤积的原因和解决方案
    linux序章(第一集)
    使用DockerFile 构建nginx镜像
    git的常用指令
    使用docker起一个mysql服务
    Windows 8自动登录
  • 原文地址:https://www.cnblogs.com/Erick-L/p/9415677.html
Copyright © 2020-2023  润新知