• python3 crawler: fetching Lagou job listings with requests

The script below first visits Lagou's job-list page through a requests.Session, so that the follow-up Ajax requests carry the cookies and Referer the site checks, then pages through the positionAjax.json endpoint and saves each posting either as JSON lines in a text file or as an Excel sheet built with tablib.


    import requests, json, time, tablib
    
    
    def send_ajax_request(data: dict):
        # POST the form data to the positionAjax.json endpoint, reusing the
        # session cookies picked up from the job-list page. Returns the parsed
        # JSON body, or an empty dict on any HTTP or network failure.
        try:
            ajax_response = session.post(url=ajax_url,
                                         params={"needAddtionalResult": "false", "city": city},
                                         data=data,
                                         headers=ajax_headers,
                                         timeout=timeout)
            if ajax_response.status_code == 200:
                return ajax_response.json()
            return {}
        except Exception:
            return {}
    
    
    def get_job_info(info_dic: dict):
        # Walk down to the result list defensively so that an empty response
        # from send_ajax_request yields nothing instead of raising.
        job_list = (info_dic.get("content", {})
                            .get("positionResult", {})
                            .get("result", []))

        fields = ["companyId", "companyFullName", "positionName", "workYear",
                  "education", "salary", "jobNature", "companySize", "city",
                  "district", "createTime"]
        for jobInfoDict in job_list:
            dic = {field: jobInfoDict.get(field) for field in fields}
            if is_save_txtfile:
                yield json.dumps(dic, ensure_ascii=False)
            else:
                yield list(dic.values())
    
    
    def save_to_file(json_data):
        # One JSON object per line (JSON Lines format).
        for data in json_data:
            f.write(data + "\n")
    
    
    def save_to_excel(list_data):
        # Append each row to the in-memory tablib dataset; it is written to
        # disk once at the end of the run.
        for line in list_data:
            dataset.append(line)
    
    
    def run():
        # Lagou paginates the Ajax results; pull pages 1-30 for the keyword.
        for i in range(1, 31):
            form_data = {
                "first": "false",
                "pn": i,          # page number
                "kd": job_name    # search keyword
            }
            info_dic = send_ajax_request(form_data)
            rows = get_job_info(info_dic)
            if is_save_txtfile:
                save_to_file(rows)
            else:
                save_to_excel(rows)
            print("Saving data...")
            time.sleep(sleeptime)  # throttle so the crawler is less likely to be blocked
    
    
    if __name__ == '__main__':
        session = requests.Session()
        job_name = "python"
        city = "成都"  # Lagou expects the city name in Chinese
        timeout = 5
        sleeptime = 10
        # Visit the job-list page first so the session picks up the cookies
        # that the Ajax endpoint checks before answering.
        doc_url = "https://www.lagou.com/jobs/list_{job_name}".format(job_name=job_name)
        session.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                                         "Chrome/72.0.3626.121 Safari/537.36")
        session.headers["Host"] = "www.lagou.com"

        doc_response = session.get(url=doc_url, params={"city": city})

        # The Ajax endpoint also checks Origin and Referer, so mirror the
        # page we just visited.
        ajax_headers = {
            "Origin": "https://www.lagou.com",
            "Referer": doc_response.url
        }

        # Query-string parameters are supplied per request in send_ajax_request.
        ajax_url = "https://www.lagou.com/jobs/positionAjax.json"
    
        is_save_txtfile = False
    
        if not is_save_txtfile:
            dataset = tablib.Dataset()
            dataset.headers = ["companyId", "companyFullName", "positionName", "workYear",
                               "education", "salary", "jobNature", "companySize", "city",
                               "district", "createTime"]
    
        # The text file is opened up front; in Excel mode it simply stays empty.
        f = open("jobinfo.txt", "a", encoding="utf-8")
        try:
            run()
        except Exception:
            print("Something went wrong")
        finally:
            f.close()
            if not is_save_txtfile:
                with open("jobInfo.xls", "wb") as excel_file:
                    excel_file.write(dataset.xls)
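
If you would rather avoid the tablib dependency, the same rows can be written with the standard library's csv module instead. This is a minimal sketch, not part of the original script; save_to_csv is a hypothetical drop-in for save_to_excel and reuses the same column order as dataset.headers:

    import csv

    def save_to_csv(rows, path="jobinfo.csv"):
        # Hypothetical replacement for save_to_excel; same column order
        # as dataset.headers in the script above.
        fields = ["companyId", "companyFullName", "positionName", "workYear",
                  "education", "salary", "jobNature", "companySize", "city",
                  "district", "createTime"]
        # utf-8-sig lets Excel detect the encoding, so Chinese text displays correctly.
        with open(path, "w", newline="", encoding="utf-8-sig") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(fields)
            writer.writerows(rows)

Using writerows keeps the pipeline streaming: rows from the get_job_info generator are written as they arrive rather than accumulated in memory first.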
• Original post: https://www.cnblogs.com/zhuchunyu/p/10765945.html