The focus here is on demonstrating how to write each parameter of a urllib.request.Request() call, e.g. url, data, headers...
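Before the full demo, here is a minimal sketch of the two parameter styles (the endpoints below are hypothetical placeholders, not real APIs). For a GET request the query parameters are urlencoded and appended to the url; for a POST request the form data is urlencoded, encoded to bytes, and passed as the data argument, and it is the presence of data that makes urlopen() send a POST.

import urllib.request
import urllib.parse

# GET: urlencode the query parameters and append them to the URL
params = urllib.parse.urlencode({"city": "上海"})
get_req = urllib.request.Request(
    "https://example.com/search?" + params,  # hypothetical endpoint
    headers={"User-Agent": "Mozilla/5.0"})

# POST: urlencode the form data, encode it to bytes, and pass it as data=
body = urllib.parse.urlencode({"kd": "爬虫"}).encode("utf-8")
post_req = urllib.request.Request(
    "https://example.com/api",  # hypothetical endpoint
    data=body,
    headers={"User-Agent": "Mozilla/5.0"})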
Demo (POST request):
import urllib.request
import urllib.parse
import json, jsonpath, csv
url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
headers = {
"Accept": "application/json, text/javascript, */*; q=0.单线程",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Content-Length": "38",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
"Host": "www.lagou.com",
"Origin": "https://www.lagou.com",
"Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"X-Anit-Forge-Code": "0",
"X-Anit-Forge-Token": "None",
"X-Requested-With": "XMLHttpRequest"}
# params = {"city": "上海", "needAddtionalResult": "false"}
list_position = []
for pn in range(1, 5):
    data = {
        "first": "false",
        "pn": pn,
        "kd": "爬虫"
    }
    # params = urllib.parse.urlencode(params)
    # url = url + params
    # The POST body must be urlencoded and then encoded to bytes
    data = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data=data, headers=headers)
    print('Requesting page %d' % pn)
    str_data = urllib.request.urlopen(req).read()
    # Dump the raw response for inspection (overwritten on every page)
    with open('03.html', 'wb') as f:
        f.write(str_data)
    # Convert the JSON response into a Python object
    data_list = json.loads(str_data)
    # jsonpath returns False when nothing matches, so guard before indexing
    matches = jsonpath.jsonpath(data_list, "$..result")
    if not matches:
        continue
    job_list = matches[0]
    for item in job_list:
        position_dict = {}
        position_dict['positionName'] = item.get('positionName')
        position_dict['createTime'] = item.get('createTime')
        position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html'
        position_dict['salary'] = item.get('salary')
        position_dict['workYear'] = item.get('workYear')
        position_dict['companySize'] = item.get('companySize')
        list_position.append(position_dict)
# Save the results to a JSON file; ensure_ascii=False keeps the Chinese text readable
with open('03.json', 'w', encoding='utf-8') as f:
    json.dump(list_position, f, ensure_ascii=False)
# Save the results to a CSV file. Opening with encoding='utf-8' avoids the error:
# 'gbk' codec can't encode character '\u200b' in position 0: illegal multibyte sequence
# newline='' prevents the blank rows that the csv module otherwise produces on Windows
with open('04.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    sheets = list_position[0].keys()  # header row
    row_content = []
    for item in list_position:
        row_content.append(item.values())  # data rows
    try:
        csv_writer.writerow(sheets)
        csv_writer.writerows(row_content)
    except Exception as e:
        print(e)
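A note on the Accept-Encoding header commented out in the demo: urllib.request does not transparently decompress response bodies, so if you do ask the server for gzip you must gunzip the payload yourself. A minimal sketch using the standard-library gzip module (the request shown is illustrative):

import gzip
import urllib.request

req = urllib.request.Request("https://www.lagou.com/",
                             headers={"Accept-Encoding": "gzip",
                                      "User-Agent": "Mozilla/5.0"})
resp = urllib.request.urlopen(req)
raw = resp.read()
# Decompress only if the server actually gzipped the body
if resp.headers.get('Content-Encoding') == 'gzip':
    raw = gzip.decompress(raw)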