• python爬取拉勾网职位信息-python相关职位


    import requests
    import math
    import pandas as pd
    import time
    from lxml import etree

    # Scrape Lagou job postings for the keyword "python" and export them to Excel.
    #
    # Flow:
    #   1. Query the search endpoint once to learn the total number of postings.
    #   2. Walk every result page; for each posting, collect the summary fields
    #      from the JSON payload and the description/address from its detail page.
    #   3. Dump all rows into a DataFrame and write it to an .xlsx file.

    # JSON search endpoint (queried with POST).
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'
    # HTML listing page; it is requested first so the session picks up cookies
    # before the search endpoint is called.
    list_url = 'https://www.lagou.com/jobs/list_python/p-city_0?px=default#filterBox'
    headers = {
        'Accept': "application/json, text/javascript, */*; q=0.01",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?px=default',
    }
    form_data = {
        'first': 'true',
        'pn': 1,
        'kd': 'python',
    }
    # Divisor used to turn the total count into a page count
    # (presumably the endpoint returns 15 postings per page -- confirm).
    page_size = 15
    request_timeout = 3
    # Keys copied verbatim from each posting of the search payload, in column order.
    job_fields = (
        'companyFullName',
        'companySize',
        'financeStage',
        'district',
        'positionName',
        'workYear',
        'education',
        'salary',
        'positionAdvantage',
    )
    columns = ['公司', '规模', '融资', '位置', '职位', '经验', '学历', '工资', '福利', '职位描述', '工作地点']
    # Destination workbook; the directory must already exist. Adjust to your machine.
    output_path = r'C:\Users\xxx\Desktop\out/lagou_1.xlsx'


    def _search_page(session, page):
        """Return the decoded search payload for result page *page*.

        The listing page is requested first so that *session* holds cookies
        before the search endpoint is posted to.

        Raises:
            requests.HTTPError: if the search endpoint answers with an error status.
            RuntimeError: if the payload does not contain search results.
        """
        session.get(list_url, headers=headers, timeout=request_timeout)
        # Build a per-request copy instead of mutating the shared form_data.
        payload_form = dict(form_data, pn=page)
        response = session.post(url, data=payload_form, headers=headers,
                                timeout=request_timeout)
        response.raise_for_status()
        payload = response.json()
        if 'content' not in payload:
            # Fail with the server's reply instead of an opaque KeyError later on.
            raise RuntimeError('unexpected search response for page %s: %r' % (page, payload))
        return payload


    def _joined_text(nodes):
        """Collapse a list of XPath text nodes into one newline-separated string."""
        stripped = (node.strip() for node in nodes)
        return '\n'.join(part for part in stripped if part)


    def _job_row(session, job):
        """Build one output row (list of cell values) for a posting dict *job*."""
        row = [job[field] for field in job_fields]
        job_detail_url = 'https://www.lagou.com/jobs/' + str(job['positionId']) + '.html?show=10faf2bed17a459bbf40e09529f61edd'
        # NOTE(review): the detail page is requested with POST and the search
        # form payload, exactly as before; a plain GET may be more appropriate -- confirm.
        detail_response = session.post(job_detail_url, data=form_data, headers=headers,
                                       timeout=request_timeout)
        root = etree.HTML(detail_response.text)
        # xpath() returns lists of text nodes; join them so each spreadsheet
        # cell holds readable text rather than a list object.
        row.append(_joined_text(root.xpath('//div[@class="job-detail"]/text()')))
        row.append(_joined_text(root.xpath('//div[@class="work_addr"]/text()')))
        return row


    # First request: only used to learn how many postings (and thus pages) exist.
    with requests.session() as s:
        job_json = _search_page(s, 1)
    job_totalCount = job_json['content']['positionResult']['totalCount']
    job_pageCount = math.ceil(job_totalCount / page_size)

    company_info = []
    for i in range(1, job_pageCount + 1):
        # A fresh session (and fresh cookies) is used for every result page, as in
        # the original script; the context manager closes it when the page is done.
        with requests.session() as s:
            job_json = _search_page(s, i)
            print(i, job_json)
            for job in job_json['content']['positionResult']['result']:
                job_info = _job_row(s, job)
                print(job_info)
                company_info.append(job_info)
                # Pause between detail requests to avoid hammering the site.
                time.sleep(1)
    print(company_info)

    datas = pd.DataFrame(columns=columns, data=company_info)
    datas.to_excel(output_path)
  • 相关阅读:
    mysql MHA报错 Can't exec "mysqlbinlog": No such file or directory at /usr/local/share/perl5/MHA/BinlogManager.pm line 99.
    树莓派搭建私人服务器
    动手写简单的嵌入式操作系统一
    java 返回json数据
    C语言中内存分配
    IntelliJ IDEA14.0.3+Maven+SpringMVC+Spring+Hibernate光速构建Java权限管理系统(三)
    linux设备驱动归纳总结
    阿里云centos6.5下搭建javaWeb运行环境
    JAVAWEB项目如何实现验证码
    Linux驱动开发:USB驱动之usb_skel分析
  • 原文地址:https://www.cnblogs.com/Merge-1126/p/13637558.html
Copyright © 2020-2023  润新知