• 拉勾网爬虫


    # -*- coding: utf-8 -*-
    # TODO https://www.lagou.com/wn/jobs?kd=Java&city=%E5%85%A8%E5%9B%BD
    # @Date    : 2022/4/25 9:53
    # @Author  : layman
    import requests
    import json
    from lxml import etree
    
    
    def getNextUrl(kd, pn):
        """Fetch one Lagou search-result page and return its job-detail URLs.

        Args:
            kd: Search keyword, sent as the ``kd`` query parameter.
            pn: Page number, sent as the ``pn`` query parameter.

        Returns:
            A list of ``https://www.lagou.com/wn/jobs/<key>.html`` URLs, one
            for each key of the ``hrInfoMap`` object embedded in the page.

        Raises:
            IndexError: If the page has no ``__NEXT_DATA__`` script tag.
            KeyError: If the embedded JSON lacks the expected nesting.
            requests.exceptions.RequestException: On network failure/timeout.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import quote

        params = {
            'pn': pn,
            'cl': 'false',
            'fromSearch': 'true',
            'kd': kd,
        }
        search_url = 'https://www.lagou.com/wn/jobs'
        # NOTE(review): hard-coded logged-in session cookie; it should be moved
        # to configuration or an environment variable instead of living in source.
        cookie = (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc268fdsfsfsfd2-e379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        )
        headers = {
            'origin': 'https://www.lagou.com',
            # Percent-encode the keyword: header values must be Latin-1
            # encodable, so a raw non-ASCII keyword would fail to send.
            'referer': f'https://www.lagou.com/wn/jobs?kd={quote(str(kd))}&city=%E5%85%A8%E5%9B%BD',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'cookie': cookie,
        }
        # A timeout prevents the call from blocking forever on a stalled server.
        response = requests.get(
            url=search_url, headers=headers, params=params, timeout=10
        )
        html = etree.HTML(response.text)
        # The page embeds its data as JSON inside this script tag.
        json_str = html.xpath('//script[@id="__NEXT_DATA__"]/text()')
        if not json_str:
            # Same exception type as the bare json_str[0] lookup, but explained.
            raise IndexError(
                'no __NEXT_DATA__ script found in the response from ' + search_url
            )
        json_data = json.loads(json_str[0])
        content = json_data["props"]["pageProps"]["initData"]["content"]["hrInfoMap"]
        # Each key of hrInfoMap becomes the file name of a job-detail page
        # (presumably the position id -- confirm against the site).
        return [f'https://www.lagou.com/wn/jobs/{key}.html' for key in content]
    
    
    # url_list = getNextUrl(kd='Java', pn=4)
    
    
    def getDetail(url):
        """Download a Lagou job-detail page and print its description text.

        Args:
            url: Absolute URL of the job-detail page to fetch.

        Returns:
            The list of text nodes found under the element with
            ``id="job_detail"`` (empty if the element is absent). The same
            list is also printed.

        Raises:
            requests.exceptions.RequestException: On network failure/timeout.
        """
        # NOTE(review): hard-coded logged-in session cookie; it should be moved
        # to configuration or an environment variable instead of living in source.
        cookie = (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc26860-d88rerere379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        )
        headers = {
            'origin': 'https://www.lagou.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'cookie': cookie,
        }
        # A timeout prevents the call from blocking forever on a stalled server.
        resp = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(resp.text)
        # Collect every text node beneath the job-detail container.
        description = html.xpath('//*[@id="job_detail"]//text()')
        print(description)
        # Returned as well as printed so callers can use the scraped text.
        return description
    
    
    # Script entry: runs at import/execution time and fetches one hard-coded
    # job-detail URL, printing the text extracted by getDetail.
    getDetail('https://www.lagou.com/wn/jobs/7999778.html')
    
    
  • 相关阅读:
    Linux- 恢复.swp文件
    codeforces contest 1111
    bzoj2589【 Spoj 10707】 Count on a tree II
    20190129模拟题
    loj6070【山东集训第一轮Day4】基因
    bzoj4784【zjoi2017】仙人掌
    bzoj4520【cqoi2016】K远点对
    【学习笔记】BEST定理
    bzoj2441【中山市选】小W的问题
    bzoj3203【sdoi2013】保护出题人
  • 原文地址:https://www.cnblogs.com/shun998/p/16189626.html
Copyright © 2020-2023  润新知