• 异步爬取爱卡汽车论坛信息


    一、获取论坛对应的汽车fid

    import asyncio
    import time
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    import json
    import re
    
    # HTTP request headers sent with the forum index request below.
    # The User-Agent string identifies the client as desktop Chrome 83 on
    # Windows; "Host" pins the request to www.xcar.com.cn.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    
    
    # Download the forum index page. The timeout stops the script from
    # hanging forever on a stalled connection, and raise_for_status() avoids
    # overwriting car.json with an empty mapping parsed from an error page.
    res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers, timeout=30)
    res.raise_for_status()
    # apparent_encoding is only a guess and may be None (e.g. empty body);
    # fall back to UTF-8 and replace undecodable bytes instead of crashing.
    html = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")

    soup = BeautifulSoup(html, 'lxml')
    span_list = soup.find_all("span", id="w959")
    # Maps forum id (fid, as a string) -> forum/car name.
    car_ids = {}

    for span in span_list:
        link = span.find("a")
        if link is None:
            # Span without a link: nothing to extract.
            continue
        # Stop at '&' or '#' so trailing query parameters or fragments do not
        # leak into the fid; skip links that carry no fid at all.
        match = re.search(r"fid=([^&#]+)", link.attrs.get('href') or "")
        if match is None:
            continue
        car_ids[match.group(1)] = link.text

    # Save as a local JSON file for later use; ensure_ascii=False keeps the
    # Chinese names human-readable in the file.
    with open("car.json", "w", encoding="utf-8") as f:
        json.dump(car_ids, f, ensure_ascii=False)

    二、asyncio爬取论坛信息

    # HTTP request headers passed to requests.get() by more_details().
    # The User-Agent string identifies the client as desktop Chrome 83 on
    # Windows; "Host" pins the request to www.xcar.com.cn.
    headers = {
        "Host": "www.xcar.com.cn",
        "Pragma": "no-cache",
        "Proxy-Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    
    # Read the JSON object stored in car.json into car_ids; run() iterates
    # its items and uses the keys as forum ids and the values as names.
    with open("car.json", "rb") as car_file:
        car_ids = json.load(car_file)
    
    # Coroutine that fetches one page of a forum's thread list.
    async def more_details(fid, page):
        """Fetch one page of the thread list for forum ``fid``.

        ``requests`` is a blocking library: calling it directly inside a
        coroutine stalls the event loop, so concurrently scheduled tasks would
        still run strictly one after another. The HTTP call is therefore
        handed to the loop's default thread-pool executor and awaited, which
        lets the other page requests proceed in the meantime.

        Args:
            fid: Forum id, sent as the ``fid`` query parameter.
            page: Page number, sent as the ``page`` query parameter.

        Returns:
            The decoded JSON body of the response.
        """
        import functools

        # Millisecond timestamp sent as the "_" query parameter.
        _time = str(time.time() * 1000)
        url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
        params = {
            "fid": fid,
            "orderby": "lastpost",
            "filter": "",
            "ondigest": "0",
            "page": page,
            "_": _time
        }
        # run_in_executor only forwards positional arguments, so bind the
        # keyword arguments up front with functools.partial.
        fetch = functools.partial(requests.get, url=url, params=params, headers=headers)
        loop = asyncio.get_event_loop()
        res = await loop.run_in_executor(None, fetch)
        return res.json()
    
    def run(car_name, max_page=49):
        """Crawl and print the thread-list pages of every matching forum.

        A forum matches when ``car_name`` is a substring of its name in
        ``car_ids``. For each match, pages ``1..max_page`` are requested
        through ``more_details`` and each result is printed in page order.

        Args:
            car_name: Text to look for inside the forum names.
            max_page: Last page number to request (inclusive). Defaults to 49,
                the range the script originally hard-coded.
        """
        matched_ids = [fid for fid, name in car_ids.items() if car_name in name]
        if not matched_ids:
            return

        async def crawl(fid):
            # gather() is called inside the running loop so the tasks are
            # bound to it, and it returns results in page order.
            return await asyncio.gather(
                *(more_details(fid, page) for page in range(1, max_page + 1))
            )

        # Use a private loop and close it exactly once after all matches are
        # processed. Closing the shared default loop inside the per-forum
        # iteration made the second match (and any later call to run) fail
        # with "Event loop is closed".
        loop = asyncio.new_event_loop()
        try:
            for fid in matched_ids:
                for res in loop.run_until_complete(crawl(fid)):
                    print(res)
        finally:
            loop.close()
    # Script entry point: crawl every forum whose name contains this text.
    run("奥迪A4L")
  • 相关阅读:
    Java 中日常使用的 IO 流总结
    NIO 实现非阻塞 Socket 通讯
    Java NIO 的简单介绍和使用
    常用设计模式 -- 一分钟就能学会的门面模式(外观模式)
    Java日志框架介绍和 Slf4j 使用
    Linux学习一
    JavaScript-数组
    javascript
    idea 快捷键汇总
    正则表达式
  • 原文地址:https://www.cnblogs.com/angelyan/p/14216566.html
Copyright © 2020-2023  润新知