• python 抓取拉勾网 攻略


    废话不多说,直接上代码,将数据存入 MongoDB

    import requests
    import pymongo
    import time
    import random
    
    # Open a client connection to the MongoDB server on localhost.
    mycon = pymongo.MongoClient(host='127.0.0.1', port=27017)
    # Database that receives the scraped postings.
    mydb = mycon.get_database('lagou_data')
    
    
    class LaGouSpider():
        """Crawl Lagou job postings for one city/keyword and store them in MongoDB.

        For every result page a search page is visited first so the session
        picks up cookies, then the ``positionAjax.json`` endpoint is POSTed
        with that same session and each returned posting is inserted into the
        collection named after the keyword.
        """

        # Keys copied from each posting in the response into the stored document.
        _FIELDS = (
            'positionName',       # job title
            'companyFullName',    # full company name
            'workYear',           # required work experience
            'education',          # required education
            'jobNature',          # nature of the job
            'salary',             # salary
            'city',               # city
            'financeStage',       # financing stage
            'industryField',      # industry field
            'companyShortName',   # short company name
            'positionAdvantage',  # position advantages
            'companySize',        # company size
            'companyLabelList',   # benefit labels
            'district',           # district
            'positionLables',     # skill labels (key spelled as in the response)
            'firstType',          # job category
            'createTime',         # publication time
        )

        def __init__(self, city, kd):
            """Store the search parameters and the headers sent with every request.

            Args:
                city: City name placed in the ``city`` query parameter.
                kd: Search keyword; also used as the MongoDB collection name.
            """
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
            }
            self.city = city
            self.max_pn = 1
            self.kd = kd

        def get_start(self):
            """Fetch result pages 1 through 9 and insert every posting into MongoDB.

            Pages whose JSON response has no truthy ``content`` entry are
            skipped. The method sleeps a random 3-7 seconds after each page.
            """
            mycol = mydb[self.kd]  # collection named after the keyword
            url = "https://www.lagou.com/jobs/positionAjax.json?city=" + self.city + "&needAddtionalResult=false"
            for page in range(1, 10):
                data = {
                    'first': 'true',
                    'pn': page,
                    'kd': self.kd
                }
                # A fresh session per page; the context manager releases its
                # connections once the page has been fetched.
                with requests.Session() as s:
                    # Visiting the search page first lets the session collect cookies.
                    s.get(url="https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=", headers=self.headers)
                    # The session re-sends those cookies itself, so they are
                    # not passed explicitly.
                    response = s.post(url=url, data=data, headers=self.headers).json()
                content = response.get('content')
                if content:
                    result = content['positionResult']['result']
                    print('岗位名称:{},所在城市:{},开始抓取第:{}页\n'.format(self.kd, self.city, page))
                    for i in result:
                        lagou_data = {field: i[field] for field in self._FIELDS}
                        print(lagou_data)
                        # insert() was removed in PyMongo 4; insert_one() is the replacement.
                        mycol.insert_one(lagou_data)
                time.sleep(random.uniform(3, 7))  # random pause between pages
    
    
    
    if __name__ == '__main__':
        # Script entry point: crawl the keyword "python" for the city passed below.
        spider = LaGouSpider(city='北京', kd='python')
        spider.get_start()

    简述:拉勾网的反爬措施一般,只需先访问搜索页面获取其中的 cookies 信息,然后带着这些 cookies 去请求返回 JSON 数据的接口即可。

  • 相关阅读:
    多线程与Socket编程
    正则表达式
    委托事件泛型
    C#基础加强
    随笔
    不设置JAVA_HOME运行eclipse
    CentOS7.x系统中使用Docker时,在存储方面需要注意的问题
    【转】关于高可用负载均衡的探索-基于Rancher和Traefic
    Rancher 容器管理平台-免费视频培训-链接及内容-第三季
    使用Rancher的RKE快速部署Kubernetes集群
  • 原文地址:https://www.cnblogs.com/lvye001/p/11307740.html
Copyright © 2020-2023  润新知