• 爬取前程无忧(51job)岗位


    思路:
    【声明,少量爬取公开数据仅供分析以及爬虫学习使用】

    1.确定起始URL:https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html
    2.观察网页内容:
    （原文此处为搜索结果页的网页截图,转载时图片未能保留）
    3.寻找字段（原文此处为字段定位的网页截图,转载时图片未能保留）
    4.完整代码
    注释代码里面,很好理解

    # -*- coding: utf-8 -*-
    # @Time    : 2019/6/5 18:45
    # @Author  : baby
    # @File    : get_51.py
    import requests
    from lxml import etree
    import pandas as pd
    import logging
    
    
    
    class Job:
        """Scrape job postings from 51job.com search results.

        Disclaimer kept from the original author: small-scale scraping of
        public data, for analysis and crawler-learning purposes only.
        """

        def __init__(self):
            # NOTE: replace with a real browser User-Agent string before running.
            self.headers = {'User-Agent': '换成你自己的'}

        def get_URL(self):
            """Crawl the search-result pages, follow every posting link and
            extract its fields.

            Returns:
                list[list[str]]: one row per posting —
                [name, salary, place, experience, education, headcount, skills].
                Any field missing on the page is recorded as the string 'nan'.
            """
            # requests is called with verify=False below, which emits an
            # InsecureRequestWarning per call; route those warnings to logging.
            logging.captureWarnings(True)
            file_List = []
            for page in range(1, 4):  # original author crawled up to page 215
                start_urls = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
                              '%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598'
                              ',2,{}.html').format(page)
                response = requests.get(start_urls, headers=self.headers,
                                        timeout=10, verify=False)
                response.encoding = 'gbk'  # 51job serves GBK-encoded pages
                if response.status_code != 200:
                    # Failed listing page: skip it and try the next one.
                    continue
                print("主页start_urls爬取成功,等待解析:", start_urls)
                HTML = etree.HTML(response.text)
                job_URL = HTML.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@href')
                for url in job_URL:
                    html = requests.get(url, headers=self.headers,
                                        verify=False, timeout=5)
                    if html.status_code == 200:
                        print("爬取当前岗位成功:", url)
                    html.encoding = 'gbk'
                    job_html = etree.HTML(html.text)
                    file_List.append(self._parse_job(job_html))
            return file_List

        def _parse_job(self, job_html):
            """Extract one posting's fields from its parsed detail page."""
            def first_or_nan(xpath):
                # Return the first XPath hit as a string, or 'nan' if absent.
                hits = job_html.xpath(xpath)
                return str(hits[0]) if hits else 'nan'

            jobName = first_or_nan('//div[3]//div[@class="cn"]/h1/@title')  # job title
            jobSalary = first_or_nan('//div[@class="cn"]/strong/text()')    # salary

            # Place / experience / education / headcount are packed into a
            # single title attribute, separated by NBSP-pipe-NBSP sequences.
            info = job_html.xpath('//div[@class="cn"]/p[@class="msg ltype"]/@title')
            if info:
                # BUG FIX: the original split on the literal text
                # "xa0xa0|xa0xa0" (backslashes lost in transcription); the real
                # separator uses non-breaking spaces: '\xa0\xa0|\xa0\xa0'.
                parts = str(info[0]).split('\xa0\xa0|\xa0\xa0')
                # Guard against postings that omit some of the four segments.
                parts += ['nan'] * (4 - len(parts))
                jobPlace, jobExperience, jobEducation, jobNumber = parts[:4]
            else:
                jobPlace = jobExperience = jobEducation = jobNumber = 'nan'

            skill_lines = job_html.xpath('//div[@class="bmsg job_msg inbox"]/p/text()')
            if skill_lines:
                # BUG FIX: the original concatenated with a string literal that
                # contained a raw line break (a syntax error after the blog
                # transcription ate the backslash); join with '\n' instead.
                # Each paragraph gets a trailing newline, as the original loop did.
                jobSkills = ''.join(str(line) + '\n' for line in skill_lines)
            else:
                jobSkills = 'nan'

            return [jobName, jobSalary, jobPlace, jobExperience,
                    jobEducation, jobNumber, jobSkills]

        def save_File(self):
            """Crawl all postings and save them to data1.xlsx via pandas."""
            # BUG FIX: headers read 招牌 ("signboard") — typo for 招聘 ("recruit").
            self.itemName = ['职位名', '薪资', '工作地点', '工作经验',
                             '学历', '招聘人数', '招聘条件']
            # BUG FIX: call the bound method rather than Job.get_URL(self).
            file_List = self.get_URL()
            df = pd.DataFrame(file_List)
            df.to_excel('data1.xlsx', header=self.itemName)
            print("文件保存完成!")
    
    # Entry point: crawl the listings and write them straight to Excel.
    if __name__ == '__main__':
        Job().save_File()
  • 相关阅读:
    jQuery 基础一 样式篇
    javaJavaScript DOM
    linux 实用命令
    Linux下修改.bash_profile 文件改变PATH变量的值
    java 字符串截取的方法
    Telnet命令参考手册
    linux下dubbo调试 ---telnet命令
    【Spring Task】定时任务详解实例-@Scheduled
    Spring定时任务的几种实现
    SQL之case when then用法
  • 原文地址:https://www.cnblogs.com/chenruhai/p/12464220.html
Copyright © 2020-2023  润新知