• Python爬虫(六)


    源码:

     1 import requests
     2 import re
     3 from my_mysql import MysqlConnect
     4 
     5 
     6 # 获取问答信息
     def get_contents(page, headers):
         """Fetch one page of answers and return (question title, excerpt) pairs.

         Args:
             page: Value substituted into the ``offset`` query parameter of the
                 answers API URL.
             headers: HTTP headers passed to ``requests.get``.

         Returns:
             A list of ``(question, excerpt)`` tuples, one per entry of the
             response's ``data`` list, with markup tags removed from the excerpt.

         Raises:
             requests.HTTPError: If the server answers with an error status.
             KeyError: If the JSON payload lacks the expected keys.
         """
         url = 'https://www.zhihu.com/api/v4/members/chen-lu-ya-26/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'.format(page)
         # A timeout keeps a stalled connection from hanging the crawl forever.
         resp = requests.get(url, headers=headers, timeout=10)
         # Fail loudly on an HTTP error instead of a confusing KeyError below.
         resp.raise_for_status()
         data_list = resp.json()['data']

         # Remove every complete <...> tag in one pass. Unlike splitting the
         # excerpt into front/middle/back pieces, this keeps text that sits
         # between two tags and leaves a lone '<' (no closing '>') untouched
         # rather than crashing on a failed match.
         tag_pat = re.compile(r'<[^>]*>')
         return [
             (item['question']['title'], tag_pat.sub('', item['excerpt']))
             for item in data_list
         ]
    28 
    if __name__ == '__main__':
        # Browser-like User-Agent sent with every API request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
        # The same parameterized statement is used for every row.
        sql = 'insert into zhihu values(null,%s,%s)'
        # Eight offsets stepping by 20: 0, 20, ..., 140.
        for offset in range(0, 20 * 8, 20):
            for row in get_contents(offset, headers):
                mc.exec_data(sql, row)
                print(row)
  • 相关阅读:
    Codeforces 1316B String Modification
    Codeforces 1305C Kuroni and Impossible Calculation
    Codeforces 1305B Kuroni and Simple Strings
    Codeforces 1321D Navigation System
    Codeforces 1321C Remove Adjacent
    Codeforces 1321B Journey Planning
    Operating systems Chapter 6
    Operating systems Chapter 5
    Abandoned country HDU
    Computer HDU
  • 原文地址:https://www.cnblogs.com/zhxd-python/p/9501313.html
Copyright © 2020-2023  润新知