# -*- coding: utf-8 -*-
# TODO https://www.lagou.com/wn/jobs?kd=Java&city=%E5%85%A8%E5%9B%BD
# @Date : 2022/4/25 9:53
# @Author : layman
import requests
import json
from lxml import etree
def getNextUrl(kd, pn, timeout=10):
    """Collect the job-detail URLs listed on one Lagou search-result page.

    Requests ``https://www.lagou.com/wn/jobs`` for keyword ``kd`` and page
    ``pn``, reads the JSON embedded in the page's ``__NEXT_DATA__`` script
    tag, and builds one detail-page URL per key of its ``hrInfoMap`` object.

    Args:
        kd: Search keyword, sent as the ``kd`` query parameter.
        pn: Page number, sent as the ``pn`` query parameter.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list of URLs of the form
        ``https://www.lagou.com/wn/jobs/<key>.html``.

    Raises:
        IndexError: If the response contains no ``__NEXT_DATA__`` script tag.
        KeyError: If the embedded JSON lacks one of the expected keys.
    """
    from urllib.parse import quote

    search_url = 'https://www.lagou.com/wn/jobs'
    parms = {
        'pn': pn,
        'cl': 'false',
        'fromSearch': 'true',
        'kd': kd,
    }
    headers = {
        'origin': 'https://www.lagou.com',
        # Percent-encode the keyword: header values must be Latin-1
        # encodable, so a raw non-ASCII keyword cannot be sent as-is.
        'referer': f'https://www.lagou.com/wn/jobs?kd={quote(str(kd))}&city=%E5%85%A8%E5%9B%BD',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        # NOTE(review): hard-coded session cookie containing login tokens;
        # it should be loaded from configuration instead of living in source.
        'cookie': (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc268fdsfsfsfd2-e379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        ),
    }

    # A timeout keeps an unresponsive server from blocking the call forever.
    response = requests.get(
        url=search_url, headers=headers, params=parms, timeout=timeout
    )
    html = etree.HTML(response.text)

    # The search page embeds its data as JSON inside this script tag.
    next_data = html.xpath('//script[@id="__NEXT_DATA__"]/text()')
    if not next_data:
        raise IndexError(
            'no __NEXT_DATA__ script tag found in the response for '
            f'kd={kd!r}, pn={pn!r}'
        )

    json_data = json.loads(next_data[0])
    hr_info_map = json_data["props"]["pageProps"]["initData"]["content"]["hrInfoMap"]

    # Each key of hrInfoMap is used as the file name of a job-detail page.
    return [
        'https://www.lagou.com/wn/jobs/' + key + '.html'
        for key in hr_info_map
    ]
# url_list = getNextUrl(kd='Java', pn=4)
def getDetail(url, timeout=10):
    """Fetch a job-detail page and print the text of its description.

    Downloads ``url``, selects every text node under the element whose id is
    ``job_detail``, prints the resulting list and returns it.

    Args:
        url: Address of the job-detail page to download.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The list of text nodes found under ``#job_detail``; empty when the
        page has no such element.
    """
    headers = {
        'origin': 'https://www.lagou.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        # NOTE(review): hard-coded session cookie containing login tokens;
        # it should be loaded from configuration instead of living in source.
        'cookie': (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc26860-d88rerere379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        ),
    }

    # A timeout keeps an unresponsive server from blocking the call forever.
    resp = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(resp.text)
    description = html.xpath('//*[@id="job_detail"]//text()')
    print(description)
    # Also hand the text back so callers can use it, not just read stdout.
    return description
if __name__ == "__main__":
    # Run the sample request only when this file is executed as a script,
    # so importing the module does not trigger a network call.
    getDetail('https://www.lagou.com/wn/jobs/7999778.html')