• 爬取知乎Python中文社区信息


     爬取知乎专栏“Python中文社区”的文章信息,专栏地址:https://zhuanlan.zhihu.com/zimei

     1 import requests
     2 from urllib.parse import urlencode
     3 from pyquery import PyQuery as pq
     4 from pymongo import MongoClient
     5 import json
     6 import time
     7 
     # Upper bound of the page numbers requested by the main loop.
     max_page = 100

     # Article-list endpoint of the "zimei" column; the trailing '&' lets
     # get_page append the url-encoded offset parameter directly.
     base_url = 'https://www.zhihu.com/api/v4/columns/zimei/articles?limit=10&'

     # Headers sent with every API request.
     headers = {
         'authority': 'www.zhihu.com',
         'referer': 'https://zhuanlan.zhihu.com/zimei',
         'origin': 'https://zhuanlan.zhihu.com',
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
     }

     # MongoDB handles: default-constructed client, database 'zhihu',
     # collection 'zhihu' (used by save_to_mongo).
     client = MongoClient()
     db = client['zhihu']
     collection = db['zhihu']
    20 
    21 
    def get_page(page):
        """Fetch one page of the column's article list from the Zhihu API.

        Args:
            page: Page index; the request offset is ``page * 10``, matching the
                ``limit=10`` that is part of ``base_url``.

        Returns:
            The decoded JSON body when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status, request failure, or a body
            that is not valid JSON).
        """
        url = base_url + urlencode({'offset': page * 10})
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # RequestException is the base class, so timeouts and other
            # request failures are reported the same way as ConnectionError.
            print('Error', e.args)
            return None

        if response.status_code != 200:
            return None

        try:
            return response.json()
        except ValueError as e:
            # The body was not JSON (e.g. an HTML error or login page).
            print('Error', e.args)
            return None
    34 
    35 
    def parse_page(json_1, page=None):
        """Yield one record dict per article contained in an API response.

        Args:
            json_1: Decoded response as returned by ``get_page``. ``None``, an
                empty payload, or a payload without a ``data`` list yields
                nothing.
            page: Page number the payload belongs to. When omitted, the
                module-level ``page`` variable (set by the script's main loop)
                is used if it exists, so existing ``parse_page(json_1)`` calls
                keep their behaviour without raising ``NameError`` elsewhere.

        Yields:
            dict with the keys ``name``, ``title``, ``text``, ``comments``,
            ``reposts`` and ``data``.
        """
        if page is None:
            page = globals().get('page')

        # Items of page 1 are skipped, exactly as the original per-item check
        # did; the test is loop-invariant, so it is done once up front.
        if not json_1 or page == 1:
            return

        for item in json_1.get('data') or []:
            author = item.get('author') or {}
            excerpt = item.get('excerpt')
            updated = item.get('updated')

            zhihu = {}
            zhihu['name'] = author.get('name')
            zhihu['title'] = item.get('title')
            # A missing/empty excerpt is stored as '' instead of being handed
            # to the HTML parser.
            zhihu['text'] = pq(excerpt).text() if excerpt else ''
            zhihu['comments'] = item.get('comment_count')
            # Stored under 'reposts' although the source field is the upvote count.
            zhihu['reposts'] = item.get('voteup_count')
            # Key name 'data' is kept for compatibility with already stored
            # records; the value is the formatted update time. The format used
            # to be '%H%:%M', which contains an invalid directive.
            if updated is not None:
                zhihu['data'] = time.strftime('%Y-%m-%d %H:%M', time.localtime(updated))
            else:
                # time.localtime(None) would silently substitute "now".
                zhihu['data'] = None
            yield zhihu
    52 
    def write_to_file(content):
        """Append ``content`` to ``zhihu.json`` as one JSON document per line.

        Args:
            content: JSON-serialisable object (the script passes one record dict).
        """
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file;
        # the record terminator is the '\n' escape, not a literal line break.
        line = json.dumps(content, ensure_ascii=False) + '\n'
        with open('zhihu.json', 'a', encoding='utf-8') as f:
            # The with-block closes the file, so no explicit close() is needed.
            f.write(line)
    57 
    def save_to_mongo(result):
        """Insert one record into the module-level MongoDB ``collection``.

        Args:
            result: Document (dict) to store. PyMongo adds an ``_id`` key to
                the dict it is given when the document has none.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        if collection.insert_one(result).inserted_id is not None:
            print('Saved to Mongo')
    61 
    62 
    if __name__ == '__main__':
        # Crawl pages 1..max_page; each parsed record is echoed, appended to
        # the JSON-lines file and then stored in MongoDB, in that order.
        # NOTE: the loop variable must remain a module-level name called
        # ``page`` because parse_page reads it to decide whether to skip items.
        for page in range(1, max_page + 1):
            for result in parse_page(get_page(page)):
                print(result)
                write_to_file(result)
                save_to_mongo(result)

     

  • 相关阅读:
    React使用iconfont图标下载到本地symbol引用
    【汇编】求100以内的素数asm
    jQuery Ajax.BeginForm方法回调函数高版本3.3.1不兼容问题
    python中的深拷贝与浅拷贝
    闲来无事做个C#小项目——2
    C#使用MD5加密
    数据结构部分总结(c语言版)
    vue 上传视频和图片 并且截取视频第一帧作为播放前默认图片
    vue el-cascader取id和lable的值
    C# 枚举的定义,枚举的用法,获取枚举值
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9226691.html
Copyright © 2020-2023  润新知