写了个爬取知乎热榜的爬虫，将热榜信息保存在当前目录下（update 1 之后存成 json 格式），并根据爬取时间命名保存。
需要把 headers 里的 cookie 换成自己的，应该就能用了。
爬取的内容有Rank:当前热榜排名 Title:问题名称 Hot:当前问题热度 Url:问题链接 Tags:问题的tags(点进问题之后可以看到)
update 1新增了Ans: 两个热门答案,并修改了json格式
代码
# coding:utf-8
# author:graykido
# date:2021.5.25
from bs4 import BeautifulSoup
import re
import requests
import os
import urllib.request
import random
import time
# Request headers sent with every fetch. The author's note says the cookie
# value must be replaced with your own before the script will work.
# NOTE(review): 'Set-Cookie' is normally a response header; it is kept here
# only because the original script sent it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'Set-Cookie': '',
    'cookie': ''}
url = "https://www.zhihu.com/hot"
# Query-string suffix of each hot-list category, paired by position with the
# Chinese display name used as the output folder name.
plus = ["", "?list=science", "?list=digital", "?list=sport", "?list=fashion", "?list=film", "?list=school", "?list=car",
        "?list=depth", "?list=focus"]
plusNameInCn = ["全站", "科学", "数码", "体育", "时尚", "影视", "校园", "汽车", "时事", "国际"]
# Seconds to wait for a response; without it a stalled connection would
# block the whole crawl forever.
REQUEST_TIMEOUT = 10


def format_heat(metrics_text):
    """Return the heat label built from the text of a HotItem metrics node.

    Only the part before the first space is kept, then the unit suffix is
    appended.
    """
    return metrics_text.split(' ')[0] + "万热度"


def snapshot_path(category, when=None):
    """Return the output file path for *category*.

    *when* is a ``time.struct_time`` or 9-tuple; it defaults to the current
    local time.
    """
    if when is None:
        when = time.localtime()
    return './' + category + '/' + time.strftime("%Y-%m-%d %H_%M_%S", when) + ".txt"


def fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser.

    Raises ``requests.RequestException`` on network failure or timeout.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf8'
    return BeautifulSoup(response.text, "lxml")


def extract_tags(question_soup):
    """Return the topic names listed in a question page's header.

    Raises ``AttributeError`` when the expected markup is missing, because
    ``find`` returns ``None`` in that case.
    """
    topic_bar = question_soup.find(class_="QuestionHeader-topics")
    return [popover.find(id='null-toggle').text
            for popover in topic_bar.find_all(class_="Popover")]


def scrape_category(list_url):
    """Return one ``str(dict)`` line per hot-list entry found at *list_url*.

    Entries whose markup or question page cannot be read are reported with
    a console message and skipped, so one bad entry does not abort the run.
    A failure to fetch the list page itself propagates to the caller.
    """
    lines = []
    for section in fetch_soup(list_url).find_all(class_="HotItem"):
        try:
            rank = section.div.div.text
            title = section.a.get('title')
            metrics = section.find(class_="HotItem-metrics HotItem-metrics--bottom")
            if metrics is None:
                # Fall back to the plain metrics node when the "--bottom"
                # variant is absent.
                metrics = section.find(class_="HotItem-metrics")
            heat = format_heat(metrics.text)
            question_url = section.find(class_="HotItem-content").a['href']
            tags = extract_tags(fetch_soup(question_url))
        except (AttributeError, KeyError, TypeError, requests.RequestException):
            print("error发生")
            continue
        record = {
            'Rank': rank,
            'Title': title,
            'Hot': heat,
            'Url': question_url,
            'Tags': tags,
        }
        lines.append(str(record) + "\n")
    return "".join(lines)


def main():
    """Crawl every category and write one timestamped text file per category."""
    for suffix, name in zip(plus, plusNameInCn):
        print(name + ": ")
        try:
            content = scrape_category(url + suffix)
        except requests.RequestException:
            print("error发生")
            continue
        fold_path = './' + name
        # Create the per-category folder on first use.
        if not os.path.exists(fold_path):
            print("正在创建文件夹...")
            os.makedirs(fold_path, exist_ok=True)
        filepath = snapshot_path(name)
        if os.path.exists(filepath):
            print("已存在该文件")
        else:
            # Explicit UTF-8 so Chinese text does not depend on the locale.
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
            print("成功创建文件")
    print("just like another saturday night,mission finshed!")


if __name__ == "__main__":
    main()
update 1
# coding:utf-8
# author:graykido
# date:2021.5.25
# update1:2021.6.3
from bs4 import BeautifulSoup
import re
import requests
import os
import urllib.request
import random
import time
import json
# Request headers sent with every fetch. The author's note says the cookie
# value must be replaced with your own before the script will work.
# NOTE(review): 'Set-Cookie' is normally a response header; it is kept here
# only because the original script sent it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'Set-Cookie': '',
    'cookie': ''}
url = "https://www.zhihu.com/hot"
# Query-string suffix of each hot-list category, paired by position with the
# Chinese display name used as the output file name.
plus = ["", "?list=science", "?list=digital", "?list=sport", "?list=fashion", "?list=film", "?list=school", "?list=car",
        "?list=depth", "?list=focus"]
plusNameInCn = ["全站", "科学", "数码", "体育", "时尚", "影视", "校园", "汽车", "时事", "国际"]
# Seconds to wait for a response; without it a stalled connection would
# block the whole crawl forever.
REQUEST_TIMEOUT = 10


def format_heat(metrics_text):
    """Return the heat label built from the text of a HotItem metrics node.

    Only the part before the first space is kept, then the unit suffix is
    appended.
    """
    return metrics_text.split(' ')[0] + "万热度"


def top_answers(answer_texts, limit=2):
    """Return at most *limit* leading items of *answer_texts* as a list.

    The default of two matches the documented "Ans" field (two answers);
    the previous counter check let a third answer through.
    """
    picked = []
    for text in answer_texts:
        if len(picked) >= limit:
            break
        picked.append(text)
    return picked


def output_dir(when=None):
    """Return the dated output folder, ``./YYYY/MM/DD``.

    *when* is a ``time.struct_time`` or 9-tuple; it defaults to the current
    local time.
    """
    if when is None:
        when = time.localtime()
    return './' + time.strftime("%Y/%m/%d", when)


def fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser.

    Raises ``requests.RequestException`` on network failure or timeout.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf8'
    return BeautifulSoup(response.text, "lxml")


def extract_tags(question_soup):
    """Return the topic names listed in a question page's header.

    Raises ``AttributeError`` when the expected markup is missing, because
    ``find`` returns ``None`` in that case.
    """
    topic_bar = question_soup.find(class_="QuestionHeader-topics")
    return [popover.find(id='null-toggle').text
            for popover in topic_bar.find_all(class_="Popover")]


def scrape_category(list_url):
    """Return a list of record dicts, one per hot-list entry at *list_url*.

    Each record has the keys Rank, Title, Hot, Url, Tags and Ans. Entries
    whose markup or question page cannot be read are reported with a console
    message and skipped. A failure to fetch the list page itself propagates
    to the caller.
    """
    records = []
    for section in fetch_soup(list_url).find_all(class_="HotItem"):
        try:
            rank = section.div.div.text
            title = section.a.get('title')
            metrics = section.find(class_="HotItem-metrics HotItem-metrics--bottom")
            if metrics is None:
                # Fall back to the plain metrics node when the "--bottom"
                # variant is absent.
                metrics = section.find(class_="HotItem-metrics")
            heat = format_heat(metrics.text)
            question_url = section.find(class_="HotItem-content").a['href']
            question_soup = fetch_soup(question_url)
            tags = extract_tags(question_soup)
            answers = top_answers(
                node.text for node in question_soup.find_all(class_="RichContent-inner"))
        except (AttributeError, KeyError, TypeError, requests.RequestException):
            print("error发生")
            continue
        records.append({
            'Rank': rank,
            'Title': title,
            'Hot': heat,
            'Url': question_url,
            'Tags': tags,
            'Ans': answers,
        })
    return records


def main():
    """Crawl every category and write one JSON file per category per day."""
    for suffix, name in zip(plus, plusNameInCn):
        try:
            records = scrape_category(url + suffix)
        except requests.RequestException:
            print("error发生")
            continue
        fold_path = output_dir()
        # Create the dated folder on first use.
        if not os.path.exists(fold_path):
            print("正在创建文件夹...")
            os.makedirs(fold_path, exist_ok=True)
        filepath = fold_path + '/' + name + ".json"
        # An existing file for the same day and category is never overwritten.
        if os.path.exists(filepath):
            print("已存在该文件")
        else:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(records, f, ensure_ascii=False)
            print("成功创建文件")
    print("just like another saturday night,mission finshed!")


if __name__ == "__main__":
    main()