• 异步爬取爱卡汽车论坛信息


    一、获取各车型论坛对应的 fid

    import asyncio
    import time
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    import json
    import re
    
    # Step 1: scrape the forum index page and build a {fid: forum name} mapping,
    # then persist it to car.json for the crawler in step 2.

    # Browser-like request headers sent with the index-page request.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }


    # A timeout keeps the script from hanging forever on a stalled connection;
    # raise_for_status() stops an HTTP error page from being parsed and from
    # overwriting car.json with an empty mapping.
    res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers, timeout=10)
    res.raise_for_status()
    # apparent_encoding is only a guess and may be None (e.g. empty body), so
    # fall back to UTF-8 and replace undecodable bytes instead of crashing.
    html = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")

    soup = BeautifulSoup(html, 'lxml')
    span_list = soup.find_all("span", id="w959")
    car_ids = {}

    for span in span_list:
        link = span.find("a")
        if link is None:
            # A span without a link carries no forum id.
            continue
        # Capture only the fid value itself: stop at the next query parameter
        # or fragment instead of greedily swallowing the rest of the URL.
        match = re.search(r"fid=([^&#]*)", link.attrs.get('href') or "")
        if match is None:
            continue
        car_ids[match.group(1)] = link.text

    # Save as a local JSON file so later steps can reuse the mapping.
    with open("car.json", "w", encoding="utf-8") as f:
        json.dump(car_ids, f, ensure_ascii=False)

    二、使用 asyncio 异步爬取论坛信息

    # Browser-like request headers shared by every thread-list request below.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/83.0.4103.61 Safari/537.36"
        ),
    }

    # Load the {fid: forum name} mapping that step 1 saved to car.json.
    with open("car.json", "rb") as f:
        car_ids = json.load(f)
    
    # Asynchronous crawl function
    async def more_details(fid, page):
        """Fetch one page of a forum's thread list and return the decoded JSON.

        Args:
            fid: Forum id, sent as the ``fid`` query parameter.
            page: Page number, sent as the ``page`` query parameter.

        Returns:
            The JSON-decoded response body.

        Raises:
            requests.RequestException: On network failure or timeout.
            ValueError: If the response body is not valid JSON.
        """
        # Integer millisecond timestamp used as the cache-busting "_" value.
        _time = str(int(time.time() * 1000))
        url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
        params = {
            "fid": fid,
            "orderby": "lastpost",
            "filter": "",
            "ondigest": "0",
            "page": page,
            "_": _time
        }

        def _fetch():
            # Blocking HTTP call; the timeout prevents one stalled request
            # from hanging the whole crawl.
            res = requests.get(url=url, params=params, headers=headers, timeout=10)
            return res.json()

        # requests is synchronous: calling it directly inside a coroutine would
        # block the event loop and serialize all tasks. Running it in the
        # default thread pool lets the requests actually overlap.
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, _fetch)
    
    def run(car_name, pages=range(1, 50)):
        """Crawl and print thread-list pages for every forum matching ``car_name``.

        Args:
            car_name: Substring to look for in the forum names stored in
                ``car_ids``; every forum whose name contains it is crawled.
            pages: Iterable of page numbers to request for each matching
                forum. Defaults to pages 1 through 49.

        Each page's decoded JSON is printed, in page order. Any exception
        raised while fetching a page propagates to the caller.
        """
        async def _crawl(car_id):
            # gather() returns results in the order the coroutines were given,
            # so pages come back in page order rather than completion order.
            return await asyncio.gather(*(more_details(car_id, i) for i in pages))

        for car_id, name in car_ids.items():
            if car_name not in name:
                continue
            # Use a fresh event loop per matching forum and always close it.
            # Closing a shared loop inside the iteration would make the next
            # match (or the next call to run) fail with "Event loop is closed".
            loop = asyncio.new_event_loop()
            try:
                results = loop.run_until_complete(_crawl(car_id))
            finally:
                loop.close()
            for res in results:
                print(res)
    run("奥迪A4L")
  • 相关阅读:
    spark集群安装部署
    CentOS7 下安装GUI图形界面GNOME
    Wininet请求包装类简稿
    Wininet下载类初稿
    MAC 编制计划任务
    弹出式窗口管理单元备忘
    rc资源文件的中英文应用备忘
    自备工具库
    界面方面的备忘
    多屏开发的备忘
  • 原文地址:https://www.cnblogs.com/angelyan/p/14216566.html
Copyright © 2020-2023  润新知