一、获取论坛对应的汽车fid
import asyncio
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import re
# Request headers mimicking a desktop Chrome browser so the site serves
# the normal HTML forum index.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}
# Fetch the forum index; decode with the detected encoding (the site does
# not serve UTF-8, so apparent_encoding is used instead of res.text).
res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers)
html = res.content.decode(res.apparent_encoding)
soup = BeautifulSoup(html, 'lxml')
span_list = soup.find_all("span", id="w959")
# Map of forum fid -> car/forum display name.
car_ids = {}
for span in span_list:
    link = span.find("a")          # hoisted: was looked up twice per span
    if link is None:
        continue                   # span without a link: skip, don't crash
    match = re.search("fid=(.*)", link.attrs.get('href', ''))
    if match is None:
        continue                   # href carries no fid parameter
    car_ids[match.group(1)] = link.text
# Save as a local JSON file for the crawler stage to reuse.
with open("car.json", "w", encoding="utf-8") as f:
    json.dump(car_ids, f, ensure_ascii=False)
二、asyncio爬取论坛信息
# Request headers reused by every crawler request below.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}
# Load the fid -> car-name mapping written by stage 1.  Open in text mode
# with an explicit encoding and use json.load instead of reading raw bytes
# through json.loads.
with open("car.json", "r", encoding="utf-8") as f:
    car_ids = json.load(f)
# 异步爬取函数
# Asynchronous fetch of one page of a forum's thread list.
async def more_details(fid, page):
    """Fetch one page of the thread-list API for forum *fid*.

    The original version called ``requests.get`` directly inside the
    coroutine, which blocks the event loop so the "async" tasks actually
    ran one after another.  The blocking HTTP call is now pushed onto the
    default thread-pool executor so concurrent tasks genuinely overlap.

    Args:
        fid: forum id string, as collected from the forum index.
        page: 1-based page number of the thread list.

    Returns:
        The decoded JSON payload of the API response.
    """
    # Cache-busting timestamp; sites conventionally expect integer
    # milliseconds, not the float string str(time.time()*1000) produced.
    _time = str(int(time.time() * 1000))
    url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
    params = {
        "fid": fid,
        "orderby": "lastpost",
        "filter": "",
        "ondigest": "0",
        "page": page,
        "_": _time
    }
    # Inside a running coroutine this returns the running loop.
    loop = asyncio.get_event_loop()
    # Run the blocking HTTP call in a worker thread; the loop stays free.
    res = await loop.run_in_executor(
        None, lambda: requests.get(url=url, params=params, headers=headers)
    )
    return res.json()
def run(car_name):
    """Crawl thread-list pages 1-49 of the forum matching *car_name*.

    *car_name* is matched as a substring of each forum's display name; if
    several forums match, the last one wins (same as the original loop,
    which had no break).

    Args:
        car_name: substring of the target forum's display name.

    Raises:
        ValueError: when no forum name contains *car_name* (the original
            code raised an obscure NameError from an unbound ``car_id``).
    """
    car_id = None
    for fid, name in car_ids.items():
        if car_name in name:
            car_id = fid
    if car_id is None:
        raise ValueError(f"no forum found matching {car_name!r}")

    async def _crawl_all():
        # Create the tasks while the loop is running, which is the form
        # supported on modern Python (ensure_future outside a running
        # loop is deprecated/removed).
        tasks = [asyncio.ensure_future(more_details(car_id, page))
                 for page in range(1, 50)]
        finished, _ = await asyncio.wait(tasks)
        return finished

    # Build a private event loop; asyncio.get_event_loop() at module level
    # is deprecated.  try/finally guarantees the loop is closed even when
    # a task raises.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        for task in loop.run_until_complete(_crawl_all()):
            print(task.result())
    finally:
        loop.close()
run("奥迪A4L")