爬虫项目:爬取并筛选拉勾网职位信息,自动提交简历
一 目标站点分析
#一:实验前准备:
    浏览器用Chrome
    用Ctrl+Shift+Delete清除浏览器缓存的Cookie
    打开network准备抓包,点击Preserve log保留所有日志

#二:拉勾网验证流程:
    1、请求登录页面:
        请求url为:https://passport.lagou.com/login/login.html
        请求头并没有什么内容,带上简单的Host,User-Agent把自己伪装成浏览器即可
        响应头里包含有效的cookie信息
            Set-Cookie:JSESSIONID=ABAAABAAADGAACFC0077EDC55EEC248392A667B221CE7AB; Path=/; HttpOnly
            Set-Cookie:user_trace_token=20171104165207-d69fee97-d5d1-4a06-a406-e41989257b25;
        页面内容里包含有用的:
            X-Anit-Forge-Code
            X-Anit-Forge-Token
        ps:可以从login.html的head标签里发现拉勾程序员的注释:为了防止重复提交请求与表单,正是这条注释为老娘提供了干它的灵感,可见有时候爱加注释并不是什么好事

    2、提交用户名密码
        请求url为:https://passport.lagou.com/login/login.json
        请求头里需要携带:
            JSESSIONID
            'X-Anit-Forge-Code': X_Anti_Forge_Code, #从login.html页面内容中找
            'X-Anit-Forge-Token': X_Anti_Forge_Token, #从login.html页面内容中找
            'X-Requested-With': 'XMLHttpRequest',
        请求体内data:
            用户名密码
            ps:用户名为明文,密码为密文,可以输错用户名,输对密码,然后在form data内获取正确的密文密码
        Cookies:
            JSESSIONID
            user_trace_token

    3、请求授权(上一步登录成功后,并没有被授权),拿到重定向的url
        请求url为:https://passport.lagou.com/grantServiceTicket/grant.html
        请求头:
            host
            user-agent
        注意:授权成功后会重定向,如果重定向成功就完成登录了

    4、请求重定向的url,拿到最终的登录session

老娘实现了两个版本,第一个版本完全用requests模拟浏览器的行为,但一些请求头与cookie的处理太繁琐了
于是老娘采用了第二个版本,直接用requests.session()去做
二 分析验证策略完成登录
import re

import requests

# Browser User-Agent reused by every request so the client looks like Chrome.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
PASSPORT_HEADERS = {'Host': 'passport.lagou.com', 'User-Agent': USER_AGENT}

# A Session keeps the cookies returned by each response and sends them with
# the following requests, so no cookie handling is needed below.
session = requests.Session()

# Step 1: load the login page to receive the initial cookies and read the two
# anti-forgery values embedded in the page source.
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers=PASSPORT_HEADERS)
# Dots are escaped and the capture is non-greedy so that each match stops at
# the first closing quote, even if several assignments share one line.
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]

# Step 2: submit the credentials together with the anti-forgery headers.
# NOTE: the user name is sent in clear text and the password as the digest the
# login form produces; replace both values with those of your own account.
r3 = session.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        'username': '18611453110',
        'password': 'bff642652c0c9e766b40e1a6f3305274',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    },
)
print(r3.text)

# Step 3: request authorisation. Redirects are not followed so that the
# target URL can be read from the response headers.
r4 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 allow_redirects=False,
                 headers=PASSPORT_HEADERS)
# A KeyError here means the response carried no redirect target.
location = r4.headers['Location']

# Step 4: follow the redirect to obtain the final logged-in session.
r5 = session.get(location,
                 allow_redirects=True,
                 headers={
                     'Host': 'www.lagou.com',
                     'Referer': 'https://passport.lagou.com/login/login.html?',
                     'User-Agent': USER_AGENT,
                 })

# Step 5: verify the login by looking for the account's display name in the
# page reached through the redirect.
print('林海峰' in r5.text)

# With the session established, later visits need no credentials.
r5 = session.get('https://www.lagou.com')
print('林海峰' in r5.text)
# Variant that uses plain requests.get()/requests.post() and carries the
# cookies by hand. The flow is correct and the login succeeds, but the wanted
# cookie information was not obtained. The site proved strict about request
# headers while crawling, so the suspected cause is a missing or superfluous
# header field in the hand-built requests; requests.Session() was therefore
# adopted instead.
import re
import time

import requests

# Browser User-Agent reused by every request so the client looks like Chrome.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
PASSPORT_HEADERS = {'Host': 'passport.lagou.com', 'User-Agent': USER_AGENT}

# 1. Visit the login page to obtain the cookies plus X_Anti_Forge_Token and
#    X_Anti_Forge_Code.
r1 = requests.get('https://passport.lagou.com/login/login.html',
                  headers=PASSPORT_HEADERS)
r1_cookie = r1.cookies.get_dict()
# Dots are escaped and the capture is non-greedy so that each match stops at
# the first closing quote, even if several assignments share one line.
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]

r2 = requests.get('https://a.lagou.com/collect',
                  cookies=r1_cookie,
                  headers={'Host': 'a.lagou.com', 'User-Agent': USER_AGENT})
r2_cookie = r2.cookies.get_dict()

# Cookies collected so far; sent explicitly with every later request.
cookies = {}
cookies.update(r1_cookie)
cookies.update(r2_cookie)
print(cookies)

# 2. Submit user name and password to log in.
# NOTE: the user name is sent in clear text and the password as the digest the
# login form produces; replace both values with those of your own account.
r3 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        'username': '18611453110',
        'password': 'bff642652c0c9e766b40e1a6f3305274',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    },
    cookies=cookies,
)
# Plain requests calls keep no cookie jar, so cookies set by the login
# response must be merged by hand before the next request.
# NOTE(review): not verified against the live site.
cookies.update(r3.cookies.get_dict())

# 3. Request authorisation; redirects are not followed so that the target URL
#    can be read from the response headers.
r4 = requests.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                  cookies=cookies,
                  allow_redirects=False,
                  headers=PASSPORT_HEADERS)
cookies.update(r4.cookies.get_dict())
# A KeyError here means the response carried no redirect target.
location = r4.headers['Location']

# 4. Request the redirect target and show the response headers it returns.
r5 = requests.get(location,
                  cookies=cookies,
                  allow_redirects=False,
                  headers={
                      'X_HTTP_TOKEN': 'e6efc0e95eb87147209fbb4f22558fd1',
                      'Host': 'www.lagou.com',
                      'Referer': 'https://passport.lagou.com/login/login.html?',
                      'User-Agent': USER_AGENT,
                  })
print(r5.headers)
三 基于登录爬取个人主页
import re

import requests

# Browser User-Agent reused by every request so the client looks like Chrome.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
PASSPORT_HEADERS = {'Host': 'passport.lagou.com', 'User-Agent': USER_AGENT}

# A Session keeps the cookies returned by each response and sends them with
# the following requests.
session = requests.Session()

# Step 1: load the login page to receive the initial cookies and read the two
# anti-forgery values embedded in the page source.
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers=PASSPORT_HEADERS)
# Dots are escaped and the capture is non-greedy so that each match stops at
# the first closing quote, even if several assignments share one line.
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]

# Step 2: submit the credentials together with the anti-forgery headers.
# NOTE: the user name is sent in clear text and the password as the digest the
# login form produces; replace both values with those of your own account.
r3 = session.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        'username': '18611453110',
        'password': 'bff642652c0c9e766b40e1a6f3305274',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    },
)
print(r3.text)

# Step 3: request authorisation. Redirects are not followed so that the
# target URL can be read from the response headers.
r4 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 allow_redirects=False,
                 headers=PASSPORT_HEADERS)
# A KeyError here means the response carried no redirect target.
location = r4.headers['Location']

# Step 4: follow the redirect to obtain the final logged-in session.
r5 = session.get(location,
                 allow_redirects=True,
                 headers={
                     'Host': 'www.lagou.com',
                     'Referer': 'https://passport.lagou.com/login/login.html?',
                     'User-Agent': USER_AGENT,
                 })

# =============== end of the login stage ===============

# Fetch the personal resume page with the logged-in session. r6.text holds
# the page content, from which any field can then be extracted with `re`.
r6 = session.get('https://www.lagou.com/resume/myresume.html')
print('林海峰' in r6.text)
print(r6.text)
四 爬取并筛选职位信息
import re
from urllib.parse import quote

import requests

# Browser User-Agent reused by every request so the client looks like Chrome.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
PASSPORT_HEADERS = {'Host': 'passport.lagou.com', 'User-Agent': USER_AGENT}

# A Session keeps the cookies returned by each response and sends them with
# the following requests.
session = requests.Session()

# Step 1: load the login page to receive the initial cookies and read the two
# anti-forgery values embedded in the page source.
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers=PASSPORT_HEADERS)
# Dots are escaped and the capture is non-greedy so that each match stops at
# the first closing quote, even if several assignments share one line.
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]

# Step 2: submit the credentials together with the anti-forgery headers.
# NOTE: the user name is sent in clear text and the password as the digest the
# login form produces; replace both values with those of your own account.
r3 = session.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        'username': '18611453110',
        'password': 'bff642652c0c9e766b40e1a6f3305274',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    },
)
print(r3.text)

# Step 3: request authorisation. Redirects are not followed so that the
# target URL can be read from the response headers.
r4 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 allow_redirects=False,
                 headers=PASSPORT_HEADERS)
# A KeyError here means the response carried no redirect target.
location = r4.headers['Location']

# Step 4: follow the redirect to obtain the final logged-in session.
r5 = session.get(location,
                 allow_redirects=True,
                 headers={
                     'Host': 'www.lagou.com',
                     'Referer': 'https://passport.lagou.com/login/login.html?',
                     'User-Agent': USER_AGENT,
                 })

# =============== end of the login stage ===============

# Crawling the job listings
# Step 1: analysis
# Sample job-search URL:
# https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91?labelWords=&fromSearch=true&suginput=


def build_search_url(keyword):
    """Return the job-list page URL for *keyword*.

    The keyword is percent-encoded as UTF-8 and placed in the URL path, e.g.
    'python开发' becomes 'list_python%E5%BC%80%E5%8F%91'.
    """
    return ('https://www.lagou.com/jobs/list_%s?labelWords=&fromSearch=true&suginput='
            % quote(keyword, safe=''))


keyword = 'python开发'
url = build_search_url(keyword)
print(url)

# Fetch the main job-list page. It contains only static content: the job data
# is rendered later by JavaScript, which requests JSON from
# https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0
r7 = session.get(url,
                 headers={
                     'Host': 'www.lagou.com',
                     'Referer': 'https://passport.lagou.com/login/login.html?',
                     'User-Agent': USER_AGENT,
                 })

# Step 2: verify the analysis by POSTing to the JSON endpoint directly.
r8 = session.post('https://www.lagou.com/jobs/positionAjax.json',
                  params={
                      'needAddtionalResult': False,
                      'isSchoolJob': '0',
                  },
                  headers={
                      'Host': 'www.lagou.com',
                      'Origin': 'https://www.lagou.com',
                      'Referer': url,
                      'User-Agent': USER_AGENT,
                      'X-Anit-Forge-Code': '0',
                      'X-Anit-Forge-Token': '',
                      'X-Requested-With': 'XMLHttpRequest',
                      'Accept': 'application/json, text/javascript, */*; q=0.01',
                  },
                  data={
                      'first': True,
                      'pn': '1',
                      'kd': keyword,
                  })
# In the JSON, pageNo 1 is the first page and pageSize 15 means this page
# holds 15 job records; what remains is to find out the total page count.
print(r8.json())


# Step 3 (final implementation): filter job listings by the given arguments.
def search_position(
        keyword,
        pn=1,
        city='北京',
        district=None,
        bizArea=None,
        isSchoolJob=None,
        xl=None,
        jd=None,
        hy=None,
        yx=None,
        needAddtionalResult=False,
        px='default'):
    """Query the job-search JSON endpoint and return the decoded response.

    Uses the module-level logged-in ``session``. Prints the HTTP status code
    and the decoded JSON before returning it.

    Args:
        keyword: search keyword, sent as the ``kd`` form field.
        pn: page number, sent as the ``pn`` form field.
        city: work location, e.g. 北京.
        district: administrative district, e.g. 朝阳区.
        bizArea: business area, e.g. 望京.
        isSchoolJob: job nature, e.g. fresh-graduate positions.
        xl: education requirement, e.g. 大专.
        jd: financing stage, e.g. angel round, round A.
        hy: industry, e.g. mobile internet.
        yx: salary range, e.g. 10k-15k.
        needAddtionalResult: forwarded as the query parameter of that name.
        px: forwarded as the ``px`` query parameter (presumably the sort
            order; confirm against the site).

    Returns:
        The response body decoded from JSON.
    """
    response = session.post(
        'https://www.lagou.com/jobs/positionAjax.json',
        # Filters left as None are omitted from the query string by requests.
        params={
            'city': city,
            'district': district,
            'bizArea': bizArea,
            'isSchoolJob': isSchoolJob,
            'xl': xl,
            'jd': jd,
            'hy': hy,
            'yx': yx,
            'needAddtionalResult': needAddtionalResult,
            'px': px,
        },
        headers={
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            # The Referer is the list page of the keyword being searched.
            'Referer': build_search_url(keyword),
            'User-Agent': USER_AGENT,
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': '',
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
        },
        data={
            'first': True,
            'pn': pn,
            'kd': keyword,
        },
    )
    print(response.status_code)
    payload = response.json()
    print(payload)
    return payload


def get_company_info(results):
    """Print a formatted summary of every job record in *results*.

    Each record is a dict taken from the search response; the detail link is
    https://www.lagou.com/jobs/<positionId>.html, where the number is the
    position id.
    """
    for res in results:
        info = '''
        公司全称 : %s
        地址 : %s,%s
        发布时间 : %s
        职位名 : %s
        职位类型 : %s,%s
        工作模式 : %s
        薪资 : %s
        福利 : %s
        要求工作经验 : %s
        公司规模 : %s
        详细链接 : https://www.lagou.com/jobs/%s.html
        ''' % (
            res['companyFullName'],
            res['city'],
            res['district'],
            res['createTime'],
            res['positionName'],
            res['firstType'],
            res['secondType'],
            res['jobNature'],
            res['salary'],
            res['positionAdvantage'],
            res['workYear'],
            res['companySize'],
            res['positionId'],
        )
        print(info)


# Look for a 10k-15k python开发 job in 朝阳区, 北京.
keyword = 'python开发'
yx = '10k-15k'
city = '北京'
district = '朝阳区'
isSchoolJob = '0'  # school-job flag (fresh-graduate or internship positions)
response = search_position(keyword=keyword, yx=yx, city=city,
                           district=district, isSchoolJob=isSchoolJob)
results = response['content']['positionResult']['result']

# Print the details of each returned job.
get_company_info(results)
五 自动提交简历
import re

import requests

# Browser User-Agent reused by every request so the client looks like Chrome.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
PASSPORT_HEADERS = {'Host': 'passport.lagou.com', 'User-Agent': USER_AGENT}

# A Session keeps the cookies returned by each response and sends them with
# the following requests.
session = requests.Session()

# Step 1: load the login page to receive the initial cookies and read the two
# anti-forgery values embedded in the page source.
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers=PASSPORT_HEADERS)
# Dots are escaped and the capture is non-greedy so that each match stops at
# the first closing quote, even if several assignments share one line.
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r1.text)[0]

# Step 2: submit the credentials together with the anti-forgery headers.
# NOTE: the user name is sent in clear text and the password as the digest the
# login form produces; replace both values with those of your own account.
r3 = session.post(
    url='https://passport.lagou.com/login/login.json',
    data={
        'isValidate': True,
        'username': '18611453110',
        'password': 'bff642652c0c9e766b40e1a6f3305274',
        'request_form_verifyCode': '',
        'submit': '',
    },
    headers={
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    },
)
print(r3.text)

# Step 3: request authorisation. Redirects are not followed so that the
# target URL can be read from the response headers.
r4 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 allow_redirects=False,
                 headers=PASSPORT_HEADERS)
# A KeyError here means the response carried no redirect target.
location = r4.headers['Location']

# Step 4: follow the redirect to obtain the final logged-in session.
r5 = session.get(location,
                 allow_redirects=True,
                 headers={
                     'Host': 'www.lagou.com',
                     'Referer': 'https://passport.lagou.com/login/login.html?',
                     'User-Agent': USER_AGENT,
                 })

# =============== end of the login stage ===============

# Automatic resume delivery. The positionId sent in the form data is the
# number in the job page's file name (3476321.html).
POSITION_ID = '3476321'
JOB_URL = 'https://www.lagou.com/jobs/%s.html' % POSITION_ID

# First visit the job page to obtain X_Anti_Forge_Token, X_Anti_Forge_Code
# and the user id.
r9 = session.get(JOB_URL,
                 headers={
                     'Host': 'www.lagou.com',
                     'User-Agent': USER_AGENT,
                 })
X_Anti_Forge_Token = re.findall(r"window\.X_Anti_Forge_Token = '(.*?)';", r9.text)[0]
X_Anti_Forge_Code = re.findall(r"window\.X_Anti_Forge_Code = '(.*?)';", r9.text)[0]
# \d+ captures the numeric value of the form field named "userid".
userid = re.findall(r'value="(\d+)" name="userid"', r9.text)[0]
print(userid, type(userid))

# Keep a copy of the extracted user id on disk for inspection.
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(userid)

# Then POST the user id together with the position id to deliver the resume.
r10 = session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                   headers={
                       'Host': 'www.lagou.com',
                       'Origin': 'https://www.lagou.com',
                       # Referer is the page of the job being applied to.
                       'Referer': JOB_URL,
                       'User-Agent': USER_AGENT,
                       'X-Anit-Forge-Code': X_Anti_Forge_Code,
                       'X-Anit-Forge-Token': X_Anti_Forge_Token,
                       'X-Requested-With': 'XMLHttpRequest',
                   },
                   data={
                       'userId': userid,
                       'positionId': POSITION_ID,
                       'force': False,
                       'type': '',
                       'resubmitToken': '',
                   })
print(r10.status_code)
print(r10.text)

# The delivery result can be checked in the delivery box:
# https://www.lagou.com/mycenter/delivery.html