思路:
【声明,少量爬取公开数据仅供分析以及爬虫学习使用】
1.确定起始URL:https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html
2.观察网页内容:
3.寻找字段
4.完整代码
注释代码里面,很好理解
# -*- coding: utf-8 -*- # @Time : 2019/6/5 18:45 # @Author : baby # @File : get_51.py import requests from lxml import etree import pandas as pd import logging class Job: def __init__(self): self.headers = {'User-Agent':'换成你自己的'} def get_URL(self): logging.captureWarnings(True) file_List = [] for i in range(1,4): #215 start_urls = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,{}.html'.format(i) response = requests.get(start_urls, headers=self.headers, timeout=10,verify=False) response.encoding = 'gbk' if response.status_code == 200: print("主页start_urls爬取成功,等待解析:",start_urls) HTML = etree.HTML(response.text) job_URL = HTML.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@href') for url in job_URL: html = requests.get(url, headers=self.headers,verify=False,timeout=5) if html.status_code == 200: print("爬取当前岗位成功:", url) html.encoding = 'gbk' job_html = etree.HTML(html.text) #--- if job_html.xpath('//div[3]//div[@class="cn"]/h1/@title'): jobName = str(job_html.xpath('//div[3]//div[@class="cn"]/h1/@title')[0]) # 工作岗位 else: jobName = 'nan' #--- if job_html.xpath('//div[@class="cn"]/strong/text()'): jobSalary = str(job_html.xpath('//div[@class="cn"]/strong/text()')[0]) # 薪水 else: jobSalary = 'nan' #--- if job_html.xpath('//div[@class="cn"]/p[@class="msg ltype"]/@title'): job_item = job_html.xpath('//div[@class="cn"]/p[@class="msg ltype"]/@title')[ 0] # pattern = re.compile(r'(S+)s*|') conten_List = str(job_item).split("xa0xa0|xa0xa0") jobPlace = str(conten_List[0]) # 工作地点 jobExperience = str(conten_List[1]) # 工作经验 jobEducation = str(conten_List[2]) # 教育要求 jobNumber = str(conten_List[3]) # 招收人数 else: jobPlace = 'nan' # 工作地点 jobExperience = 'nan' # 工作经验 jobEducation = 'nan' # 教育要求 jobNumber = 'nan' # 招收人数 #--- if job_html.xpath('//div[@class="bmsg job_msg inbox"]/p/text()'): job_Imformation_List = job_html.xpath('//div[@class="bmsg job_msg inbox"]/p/text()') jobSkills = '' for i in range(0, len(job_Imformation_List)): jobSkills = jobSkills + str(job_Imformation_List[i]) + ' ' # 工作技能要求 else: jobSkills = 'nan' #--- file_List.append([jobName,jobSalary,jobPlace,jobExperience,jobEducation,jobNumber,jobSkills]) # yield file_List else: # print("当前页爬取失败进入下一页") pass return file_List def save_File(self): self.itemName = ['职位名','薪资','工作地点','工作经验','学历','招牌人数','招牌条件'] file_List = Job.get_URL(self) df = pd.DataFrame(file_List) df.to_excel('data1.xlsx',header=self.itemName) print("文件保存完成!") if __name__ == '__main__': j = Job() j.save_File()