• Web scraping: fetching job listings from Lagou


    The point here is to demonstrate how to write the individual parameters of a urllib.request.Request() call, e.g. url, data, headers...

    Demo (POST request):

    import urllib.request
    import urllib.parse
    import json, jsonpath, csv

    url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.单线程",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Length": "38",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"}
    # params = {"city": "上海", "needAddtionalResult": "false"}
    list_position = []
    for pn in range(1, 5):  # crawl result pages 1 through 4
        data = {
            "first": "false",
            "pn": pn,
            "kd": "爬虫"
        }
        # params = urllib.parse.urlencode(params)
        # url = url + params
        # urlencode the form dict and encode it to bytes; passing data=
        # makes urlopen() issue a POST request
        data = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(url, data=data, headers=headers)
        print('Requesting page %d' % pn)
        str_data = urllib.request.urlopen(req).read()
        # Dump the raw response for debugging (overwritten on every page)
        with open('03.html', 'wb') as f:
            f.write(str_data)
        # Convert the JSON bytes into Python objects
        data_list = json.loads(str_data)
        # "$..result" recursively matches every "result" key; jsonpath()
        # returns a list of matches, so [0] is the job list itself
        job_list = jsonpath.jsonpath(data_list, "$..result")[0]

        for item in job_list:
            position_dict = {}
            position_dict['positionName'] = item.get('positionName')
            position_dict['createTime'] = item.get('createTime')
            position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html'

            position_dict['salary'] = item.get('salary')
            position_dict['workYear'] = item.get('workYear')
            position_dict['companySize'] = item.get('companySize')
            list_position.append(position_dict)

    # Save to a JSON file (ensure_ascii=False keeps the Chinese text readable)
    with open('03.json', 'w', encoding='utf-8') as f:
        json.dump(list_position, f, ensure_ascii=False)

    # Save to a CSV file. The platform default encoding (gbk on Chinese
    # Windows) raises "'gbk' codec can't encode character '\u200b'", so open
    # with an explicit utf-8 encoding; newline='' prevents blank rows.
    csv_writer = csv.writer(open('04.csv', 'w', encoding='utf-8', newline=''))
    sheets = list_position[0].keys()  # header row
    row_content = []
    for item in list_position:
        row_content.append(item.values())  # data rows
    try:
        csv_writer.writerow(sheets)
        csv_writer.writerows(row_content)
    except Exception as e:
        print(e)
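
    A note on the commented-out params lines: they sketch the GET variant of
    the same request, where the query dict is urlencoded and appended to the
    URL instead of being sent as the body. A minimal sketch -- httpbin.org is
    used here as a stand-in echo endpoint, not part of the original demo:

    import urllib.parse
    import urllib.request

    params = {"city": "上海", "needAddtionalResult": "false"}
    query = urllib.parse.urlencode(params)  # 'city=%E4%B8%8A%E6%B5%B7&...'
    get_url = "https://httpbin.org/get?" + query

    # No data= argument, so urlopen() issues a GET request
    req = urllib.request.Request(get_url, headers={"User-Agent": "Mozilla/5.0"})
    print(urllib.request.urlopen(req).read().decode('utf-8'))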

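    Why "$..result": the job list sits several levels deep in the response
    (at the time, roughly content -> positionResult -> result), and the
    recursive-descent operator ".." finds the "result" key wherever it sits.
    jsonpath.jsonpath() returns a list of every match, or False if nothing
    matched, hence the [0]. A minimal sketch with a made-up payload shaped
    like the real response:

    import jsonpath

    doc = {"content": {"positionResult": {"result": [
        {"positionName": "爬虫工程师", "salary": "15k-25k"}]}}}

    # ".." descends recursively; the call returns a list of matches
    matches = jsonpath.jsonpath(doc, "$..result")
    print(matches[0][0]["positionName"])  # -> 爬虫工程师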

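    One caveat with the plain utf-8 CSV: Excel on Windows may still misread
    it as the local codepage. If that matters, 'utf-8-sig' writes a BOM that
    Excel recognizes. A minimal sketch with one sample row ('04_excel.csv' is
    a hypothetical filename, not from the original post):

    import csv

    rows = [{"positionName": "爬虫工程师", "salary": "15k-25k"}]
    with open('04_excel.csv', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(rows[0].keys())                 # header row
        writer.writerows(item.values() for item in rows)  # data rows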