• Scraping BOSS Zhipin static pages with Python and bs4


    Approach:

      1. Convert the list of cities to query into their corresponding city codes via the city API.

      2. Iterate over the cities and job queries to build the list-page URLs.

      3. Fetch the list page for each URL and iterate over the job entries on it.

      4. Follow each entry's job_link to fetch its detail page, and store the fields we need as a dict data appended to the list datas.

      5. If the list page has a next page, repeat steps 3 and 4, passing the accumulated datas along (see the recursion sketch after this list).

      6. Once one city/query URL has been fully crawled, extend datas_list with datas, then repeat steps 3-5 for the next combination.

      7. Finally, iterate over datas_list and write the data into an Excel file.
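
    To make step 5 concrete, here is a minimal, self-contained sketch of the "collect this page's rows, then follow the next-page link" recursion used in get_job_list() below. fetch_page and parse_rows are hypothetical callables supplied by the caller, not functions of the site or of this script.

        # Sketch only: fetch_page(url) is assumed to return a BeautifulSoup object,
        # and parse_rows(soup) is assumed to return a list of dicts for one page.
        def crawl_all_pages(url, datas, fetch_page, parse_rows):
            soup = fetch_page(url)
            datas.extend(parse_rows(soup))                        # collect this page's rows
            next_link = soup.find("a", attrs={"class": "next"})   # pagination link, as on zhipin.com
            if next_link is None or next_link.get("href") == "javascript:;":
                return datas                                      # last page: stop recursing
            return crawl_all_pages("https://www.zhipin.com" + next_link.get("href"),
                                   datas, fetch_page, parse_rows)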

    Key points:

      1. Parsing a response as JSON and extracting values from it

      2. Using soup's select(), find_all(), and find() methods

      3. Using exception handling (Exception)

      4. Using xlwt to create and edit Excel files

      (Short standalone examples of points 1 and 2 follow; the full script comes after them.)
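
    As a quick illustration of points 1 and 2, the standalone snippet below parses the city API response (assuming it still returns the zpData/cityList structure that get_city() relies on) and contrasts select(), find_all(), and find() on a tiny hand-written piece of HTML. It is a sketch for experimentation, separate from the crawler that follows.

        import requests
        from bs4 import BeautifulSoup

        # Point 1: read a response as JSON and walk into it.
        # The zpData/cityList/subLevelModelList layout mirrors what get_city() expects.
        resp = requests.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
        provinces = resp.json()["zpData"]["cityList"]
        first_city = provinces[0]["subLevelModelList"][0]
        print(first_city["name"], first_city["code"])   # a city name and its numeric code

        # Point 2: select() takes a CSS selector and returns a list of tags;
        # find_all() returns all matching tags; find() returns the first match (or None).
        html = '<div class="job-primary"><div class="info-primary"><a href="/job/1">tester</a></div></div>'
        soup = BeautifulSoup(html, "html.parser")
        print(soup.select(".job-primary"))                            # CSS selector -> list
        print(soup.find_all("div", attrs={"class": "info-primary"}))  # every matching div
        print(soup.find("a").get("href"))                             # first <a> -> "/job/1"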

    import requests, time, xlwt
    from bs4 import BeautifulSoup
    
    class MyJob():
        def __init__(self, mycity, myquery):
            self.city = mycity
            self.query = myquery
            self.list_url = "https://www.zhipin.com/job_detail/?query=%s&city=%s&industry=&position="%(self.query, self.city)
            self.datas = []
            self.header = {
                'authority': 'www.zhipin.com',
                'method': 'GET',
                'scheme': 'https',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'max-age=0',
                'cookie': 'lastCity=101210100;uab_collina=154408714637849548916323;toUrl=/;c=1558272251;g=-;l=l=%2Fwww.zhipin.com%2Fuser%2Flogin.html&r=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1555852331,1556985726,1558169427,1558272251; __a=40505844.1544087205.1558169426.1558272251.41.14.4.31; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1558272385',
                'referer': 'https://www.zhipin.com/?ka=header-logo',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
            }
    
        # Convert city names into the site's city codes
        def get_city(self, city_list):
            city_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json"  # city-list endpoint
            json = requests.get(city_url).json()
            zpData = json["zpData"]["cityList"]
            code_list = []
            for city in city_list:
                for data_sf in zpData:
                    for data_dq in data_sf["subLevelModelList"]:
                        if city == data_dq["name"]:
                            code_list.append(data_dq["code"])
            return code_list
    
        # Fetch every list page for one city/query URL
        def get_job_list(self, url, datas):
            print(url)
            html = requests.get(url, headers=self.header).text
            soup = BeautifulSoup(html, 'html.parser')
            jobs = soup.select(".job-primary")
            for job in jobs:
                data = {}
                # job id
                data["job_id"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("data-jobid")
                # job link
                data["job_link"] = "https://www.zhipin.com" + job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("href")
                # job title
                data["job_name"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-title"}).get_text()
                # salary
                data["job_red"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "red"}).get_text()
                # location / years of experience / education
                data["job_address"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text().split(" ")
                # company link
                data["job_company_link"] = job.find_all("div", attrs={"class": "info-company"})[0].find("a").get("href")
                # company info
                data["job_company"] = job.find_all("div", attrs={"class": "info-company"})[0].find("p").get_text().split(" ")
                # recruiter (boss) avatar link
                data["job_publis_link"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("img").get("src")
                # recruiter (boss) info
                data["job_publis"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("h3").get_text().split(" ")
                time.sleep(5)
                self.get_job_detail(data)  # fetch the detail-page fields for this job
                print(data)
                datas.append(data)  # append this job to datas, until the whole page has been added
    
            try:
                next_url = soup.find("div", attrs={"class": "page"}).find("a", attrs={"class": "next"}).get("href")
                #if next_url[-1] == "3":  # raise automatically on page 2 (handy for debugging)
                if next_url == "javascript:;":  # the last page's "next" link is javascript:;, so raise to stop
                    raise Exception()
            except Exception as e:
                print("最后一页了;%s" % e)
                return datas  # return the contents of all pages
            else:
                time.sleep(5)
                next_url = "https://www.zhipin.com" + next_url
                self.get_job_list(next_url, datas)
                return datas  # return the contents of all pages
    
        # Fetch the detail-page content
        def get_job_detail(self, data):
            print(data["job_link"])
            html = requests.get(data["job_link"], headers=self.header).text
            soup = BeautifulSoup(html, 'html.parser')
            # hiring company name
            data["detail_content_name"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "name"}).get_text()
            # benefits / job tags
            data["detail_primary_tags"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-tags"}).get_text().strip()
            # job title
            data["detail_primary_name"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("h1").get_text()
            # job status
            data["detail_primary_status"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-status"}).get_text()
            # salary
            data["detail_primary_salary"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "salary"}).get_text()
            # location / years of experience / education
            data["detail_primary_address"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text()
            # work address
            data["detail_content_address"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "location-address"}).get_text()
            # job description; the character being replaced was garbled in the original
            # post, so "\u2028" (an invisible line separator) is an assumption here
            data["detail_content_text"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "text"}).get_text().strip().replace("\u2028", "\n")
            # recruiter (boss) name
            data["detail_op_name"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("h2", attrs={"class": "name"}).get_text()
            # recruiter (boss) title
            data["detail_op_job"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[0]
            # recruiter (boss) activity status
            data["detail_op_status"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[1]
    
        # Write the collected data to an Excel file
        def setExcel(self, datas_list):
            book = xlwt.Workbook(encoding='utf-8')
            table = book.add_sheet("boss软件测试")
            table.write(0, 0, "编号")
            table.write(0, 1, "招聘链接")
            table.write(0, 2, "招聘岗位")
            table.write(0, 3, "薪资")
            table.write(0, 4, "地址")
            table.write(0, 5, "企业链接")
            table.write(0, 6, "企业信息")
            table.write(0, 7, "boss链接")
            table.write(0, 8, "boss信息")
            table.write(0, 9, "detail详情")
            i = 1
            for data in datas_list:
                table.write(i, 0, data["job_id"])
                table.write(i, 1, data["job_link"])
                table.write(i, 2, data["job_name"])
                table.write(i, 3, data["job_red"])
                table.write(i, 4, " ".join(data["job_address"]))   # join: xlwt cannot write a list
                table.write(i, 5, data["job_company_link"])
                table.write(i, 6, " ".join(data["job_company"]))   # join: xlwt cannot write a list
                table.write(i, 7, data["job_publis_link"])
                table.write(i, 8, " ".join(data["job_publis"]))    # join: xlwt cannot write a list
    
                table.write(i, 10, data["detail_content_name"])
                table.write(i, 11, data["detail_primary_name"])
                table.write(i, 12, data["detail_primary_status"])
                table.write(i, 13, data["detail_primary_salary"])
                table.write(i, 14, data["detail_primary_address"])
                table.write(i, 15, data["detail_content_text"])
                table.write(i, 16, data["detail_op_name"])
                table.write(i, 17, data["detail_op_job"])
                table.write(i, 18, data["detail_op_status"])
                table.write(i, 19, data["detail_primary_tags"])
                table.write(i, 20, data["detail_content_address"])
                i += 1
            book.save(r'C:\%s_boss软件测试.xls' % time.strftime('%Y%m%d%H%M%S'))
            print("Excel保存成功")
    
    if __name__ == '__main__':
        city_list = MyJob("","").get_city(["杭州"])
        query_list = ["软件测试", "测试工程师"]
        datas_list = []
        for city in city_list:
            for query in query_list:
                myjob = MyJob(city, query)
                datas = myjob.get_job_list(myjob.list_url, myjob.datas)
                datas_list.extend(datas)
        myjob.setExcel(datas_list)
  • Original post: https://www.cnblogs.com/shuzf/p/10934198.html