1. 前程无忧招聘信息爬虫
爬取这个网站: http://www.51job.com
设置搜索选项后,分析请求链接可知搜索结果页的地址为: https://search.51job.com/jobsearch/search_result.php
# -*- coding: utf-8 -*-
"""Crawl 51job search results for one keyword and print every job posting.

The script queries the search-result endpoint, reads the total number of
matching positions from the first result page, walks every result page,
follows each job link found on it and prints the title, company, salary
and requirement text of the posting.
"""
import re

import requests
from parsel import Selector

SEARCH_URL = "https://search.51job.com/jobsearch/search_result.php"
KEYWORD = "java"
JOB_AREA = "020000"
PAGE_SIZE = 50  # the result list shows 50 postings per page
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the crawl
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# Compiled once at import time instead of once per job posting.
TOTAL_RE = re.compile("共(.*?)条职位", re.S)
TITLE_RE = re.compile('<h1 title="(.*?)"', re.S)
COMPANY_RE = re.compile('<p class="cname">.*?title="(.*?)"', re.S)
MONEY_RE = re.compile('<div class="tHeader tHjob">.*?<strong>(.*?)</strong>', re.S)
MSG_RE = re.compile('<div class="bmsg job_msg inbox">(.*?)<div class="share">', re.S)


def _search_params(page):
    """Return the query parameters that select result page *page* (1-based)."""
    return {
        "fromJs": "1",
        "jobarea": JOB_AREA,
        "keyword": KEYWORD,
        "keywordtype": "2",
        "curr_page": str(page),
    }


def _get(target, params=None):
    """Send a GET request with the browser User-Agent and a timeout.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    return requests.get(
        target, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )


def _decode_gbk(response):
    """Return the response body decoded as GBK, dropping undecodable bytes.

    The raw bytes are decoded directly. Re-encoding ``response.text`` with
    ``response.encoding`` fails when that encoding is ``None`` and loses
    data whenever the guessed charset cannot round-trip the payload.
    """
    return response.content.decode("gbk", "ignore")


def _first_match(pattern, text, default=None):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    found = pattern.search(text)
    return found.group(1) if found else default


def _page_count(total, page_size=PAGE_SIZE):
    """Return how many pages are needed to show *total* results.

    Uses ceiling division so an exact multiple of *page_size* does not
    produce an extra, empty page.
    """
    return (total + page_size - 1) // page_size


def _print_job(job_url):
    """Download one job posting and print its fields.

    A field whose markup is not found is printed as an empty string so a
    posting with a different page template does not abort the crawl.
    """
    html = _decode_gbk(_get(job_url))
    title = _first_match(TITLE_RE, html, "")
    company = _first_match(COMPANY_RE, html, "")
    money = _first_match(MONEY_RE, html, "")
    msg = _first_match(MSG_RE, html, "")
    print('-----------------------------------')
    print("岗位: ", title)
    print("公司: ", company)
    print("薪资: ", money)
    print("岗位要求: ", msg)


def main():
    """Crawl every result page for ``KEYWORD`` and print each posting.

    Raises:
        ValueError: if the total result count cannot be found on the first
            page or is not an integer.
        requests.RequestException: if a request fails or times out.
    """
    first_response = _get(SEARCH_URL, _search_params(1))
    print(first_response.request.url)
    first_html = _decode_gbk(first_response)

    total_text = _first_match(TOTAL_RE, first_html)
    if total_text is None:
        raise ValueError("total result count not found on the first result page")
    print(total_text)
    page_total = _page_count(int(total_text))

    for page in range(1, page_total + 1):
        print("----正在爬" + str(page) + "页-------")
        # Page 1 was already downloaded to read the total; reuse it.
        if page == 1:
            page_html = first_html
        else:
            page_html = _decode_gbk(_get(SEARCH_URL, _search_params(page)))

        job_links = Selector(page_html).xpath(
            "//p[@class='t1 ']/span/a/@href"
        ).getall()
        # Visit every job link listed on the current result page.
        for job_url in job_links:
            _print_job(job_url)


if __name__ == "__main__":
    main()