一、需求
获取指定品牌下所有车型的配置信息,并按车型分别保存为 Excel 文件(.xls)。
流程大致思路:
1.获取品牌id:brand_id
2.通过品牌id获取车型id:series_id
3.获取车型配置页面
4.解析配置页面内容(这步最复杂,使用了之前一些大神的代码)
二、代码
测试完美运行
import json
import os
import re
from urllib import parse

import requests
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver


class Car_home_config(object):
    """Scrape vehicle configuration tables from car.autohome.com.cn.

    Workflow (driven by :meth:`check`):

    1. Collect brand ids from the left-menu endpoint.
    2. Collect the series ids listed under the selected brand(s).
    3. Download each series' configuration page.
    4. Replace the CSS-obfuscated ``<span>`` placeholders with their real
       text (the page's own JavaScript is executed in PhantomJS to obtain
       the CSS rules).
    5. Write one ``.xls`` workbook per series.
    """

    LEFT_MENU_URL = r"https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"
    CONFIG_URL = r"https://car.autohome.com.cn/config/series/{}.html"
    USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36")
    # Seconds to wait for the server before giving up; without a timeout
    # ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()
        self.params = None
        self.headers = None
        self.brand_dict = {}   # brand id -> brand link text
        self.series_dict = {}  # series id -> series link text
        self.brand_name = None

    def get_header(self):
        """Build ``self.headers`` for the left-menu endpoint from ``self.params``."""
        self.headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/AsLeftMenu/As_LeftListNew.ashx?%s" % parse.urlencode(self.params or {}),
            "scheme": "https",
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": self.USER_AGENT,
        }

    @staticmethod
    def _first_number(href):
        """Return the first run of digits in *href*, or ``None`` if there is none."""
        if not href:
            return None
        match = re.search(r"\d+", href)
        return match.group(0) if match else None

    def _fetch_left_menu(self):
        """Request the left-menu fragment with the current params/headers and parse it."""
        res = self.session.get(url=self.LEFT_MENU_URL, headers=self.headers,
                               params=self.params, timeout=self.REQUEST_TIMEOUT)
        res.encoding = res.apparent_encoding
        return BeautifulSoup(res.text, 'lxml')

    def get_brand_id(self):
        """Fetch every brand and store it in ``self.brand_dict``.

        Returns:
            dict: ``self.brand_dict`` mapping brand id (str) to the link text.
        """
        self.params = {
            "typeId": "1",
            "brandId": "0",
            "fctId": "0",
            "seriesId": "0"
        }
        self.get_header()
        soup = self._fetch_left_menu()
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                anchor = li.find("a")
                if anchor is None:
                    # Not every <li> is guaranteed to wrap a link.
                    continue
                brand_id = self._first_number(anchor.attrs.get('href'))
                if brand_id is not None:
                    self.brand_dict[brand_id] = anchor.text
        return self.brand_dict

    def get_AsLeftMenu(self):
        """Fetch the menu for the brand in ``self.params`` and record its series.

        Every ``<a>`` found inside a ``<dd>`` element is stored in
        ``self.series_dict`` as series id (str) -> link text.
        """
        soup = self._fetch_left_menu()
        for dd in soup.find_all("dd"):
            for a in dd.find_all("a"):
                a_href = a.attrs.get('href')
                a_text = a.text
                print(a_href)
                print(a_text)
                series_id = self._first_number(a_href)
                if series_id is not None:
                    self.series_dict[series_id] = a_text

    def get_series_id(self):
        """Collect the series ids of the selected brand(s).

        When ``self.brand_name`` is set, only brands whose link text contains
        it are visited; otherwise every brand is visited.

        Returns:
            dict: ``self.series_dict`` mapping series id (str) to link text.
        """
        # Start clean so a reused instance does not carry series from an
        # earlier brand into this run.
        self.series_dict = {}
        self.get_brand_id()
        for brand_id, brand_text in self.brand_dict.items():
            if self.brand_name and self.brand_name not in brand_text:
                continue
            self.params = {
                "typeId": "1",
                "brandId": brand_id,
                "fctId": "0",
                "seriesId": "0"
            }
            self.get_header()
            self.get_AsLeftMenu()
        return self.series_dict

    def get_config_content(self, series_id):
        """Download the configuration page of *series_id* and return it as text."""
        headers = {
            "authority": "car.autohome.com.cn",
            "method": "GET",
            "path": "/config/series/{}.html".format(series_id),
            "scheme": "https",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "no-cache",
            "referer": "https://www.autohome.com.cn/",
            "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-site",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": self.USER_AGENT,
        }
        # TLS verification stays enabled, matching the other requests this
        # class sends to the same host.
        res = self.session.get(self.CONFIG_URL.format(series_id), headers=headers,
                               timeout=self.REQUEST_TIMEOUT)
        return res.content.decode("utf-8")

    def car_info(self, html):
        """Extract the ``config``, ``option`` and ``bag`` JS assignments from *html*.

        Returns:
            str: the three ``var ... = {...};`` statements concatenated, or an
            empty string unless all three are present.
        """
        config = re.search("var config = (.*?)};", html)  # vehicle parameters
        option = re.search("var option = (.*?)};", html)  # equipment / options
        bag = re.search("var bag = (.*?)};", html)        # optional packages
        if config and option and bag:
            return config.group(0) + option.group(0) + bag.group(0)
        return ""

    def write_html(self, js_list, car_info):
        """Resolve the obfuscated ``<span>`` placeholders in *car_info*.

        The page hides some words behind ``<span class='...'></span>`` elements
        whose text is injected through CSS ``::before`` rules created by
        JavaScript. The collected scripts are run against a minimal DOM shim in
        PhantomJS, the generated rules are read back, and every placeholder is
        replaced by the quoted ``content`` of its rule.

        Args:
            js_list: JavaScript snippets extracted from the configuration page.
            car_info: string produced by :meth:`car_info`.

        Returns:
            The de-obfuscated string, or ``None`` when the browser could not
            be driven or produced no output.
        """
        # Minimal DOM shim: records every inserted CSS rule in ``rules``,
        # separated by '#'.
        DOM = ("var rules = '2';"
               "var document = {};"
               "function getRules(){return rules}"
               "document.createElement = function() {"
               " return {"
               " sheet: {"
               " insertRule: function(rule, i) {"
               " if (rules.length == 0) {"
               " rules = rule;"
               " } else {"
               " rules = rules + '#' + rule;"
               " }"
               " }"
               " }"
               " }"
               "};"
               "document.querySelectorAll = function() {"
               " return {};"
               "};"
               "document.head = {};"
               "document.head.appendChild = function() {};"
               "var window = {};"
               "window.decodeURIComponent = decodeURIComponent;")
        DOM = DOM + "".join(js_list)
        html_type = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body> <script type='text/javascript'>"
        # Assemble a page that runs the scripts and prints the collected rules.
        js = html_type + DOM + " document.write(rules)</script></body></html>"
        with open("original.html", "w", encoding="utf-8") as f:
            f.write(js)

        driver = None
        try:
            driver = webdriver.PhantomJS(executable_path=r"phantomjs.exe")
            # Absolute path so the lookup does not depend on the browser
            # process' working directory.
            driver.get(os.path.abspath("original.html"))
            text = driver.find_element_by_tag_name('body').text
        except Exception as e:
            # Best effort: any browser failure skips this series instead of
            # aborting the whole crawl.
            print(e)
            return None
        finally:
            # quit() also terminates the PhantomJS process; it is only called
            # when the driver was actually created.
            if driver is not None:
                driver.quit()
        if not text:
            return None

        # Attribute strings of all placeholders, e.g. " class='hs_kw7_optionZl'".
        span_list = dict.fromkeys(re.findall("<span(.*?)></span>", car_info))
        for span in span_list:
            info = re.search("'(.*?)'", span)
            if not info:
                continue
            class_name = info.group(1)
            # Matches e.g. ``hs_kw7_optionZl::before { content:"..." }``.
            rule = re.search(re.escape(class_name) + r"::before \{ content:(.*?)\}", text)
            if not rule:
                continue
            quoted = re.search('"(.*?)"', rule.group(1))
            if not quoted:
                continue
            car_info = car_info.replace("<span class='" + class_name + "'></span>",
                                        quoted.group(1))
        return car_info

    def save(self, car_info, car_name, save_path):
        """Write the parameters and options in *car_info* to ``<car_name>.xls``.

        Row 0 holds the item names; each following row holds one trim level.

        Args:
            car_info: de-obfuscated string returned by :meth:`write_html`.
            car_name: series name, used as the workbook file name.
            save_path: directory that receives the workbook.

        Raises:
            ValueError: if the ``config`` or ``option`` object cannot be found.
        """
        # Terminate on '};' exactly as car_info() does, so a ';' inside a JSON
        # string does not cut the object short.
        config = re.search(r"var config = (.*?\});", car_info)
        option = re.search(r"var option = (.*?\});", car_info)
        if not config or not option:
            raise ValueError("car_info does not contain both 'var config' and 'var option'")
        config_re = json.loads(config.group(1))
        option_re = json.loads(option.group(1))

        config_item = []
        option_item = []
        for group in config_re['result']['paramtypeitems']:
            config_item += group['paramitems']
        for group in option_re['result']['configtypeitems']:
            option_item += group['configitems']

        # Item name -> list of values, one per trim level.
        car_item = {}
        for item in config_item + option_item:
            car_item[item['name']] = [value['value'] for value in item['valueitems']]

        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('汽车之家')
        for col_num, name in enumerate(car_item):
            worksheet.write(0, col_num, name)

        # The number of trim levels normally equals the length of the
        # '车型名称' column; fall back to the longest column when it is absent.
        if '车型名称' in car_item:
            row_count = len(car_item['车型名称'])
        else:
            row_count = max((len(values) for values in car_item.values()), default=0)
        for row in range(row_count):
            for col_num, values in enumerate(car_item.values()):
                # Shorter columns are padded with empty cells.
                con = str(values[row]) if row < len(values) else ""
                worksheet.write(row + 1, col_num, con)

        # Replace characters that are not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", car_name)
        if save_path:
            os.makedirs(save_path, exist_ok=True)
        workbook.save(os.path.join(save_path, '{}.xls'.format(safe_name)))

    def check(self, brand_name, save_path="./"):
        """Scrape and save the configuration of every series of *brand_name*.

        Args:
            brand_name: text that must occur in the brand's link text; pass an
                empty value to process every brand.
            save_path: directory that receives the ``.xls`` files.
        """
        self.brand_name = brand_name
        self.get_series_id()
        for series_id, car_name in self.series_dict.items():
            print(series_id, car_name)
            html = self.get_config_content(series_id)
            car_info = self.car_info(html)
            if not car_info:
                # No configuration data on the page; skip the browser launch.
                continue
            js_list = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', html)
            car_info = self.write_html(js_list, car_info)
            if car_info:
                self.save(car_info, car_name, save_path)


if __name__ == "__main__":
    car = Car_home_config()
    car.check("奥迪")
phantomjs.exe 下载地址:https://phantomjs.org/download.html(下载后请将 phantomjs.exe 放在脚本运行目录下,或相应修改代码中的 executable_path)
感谢以下作者:
https://www.cnblogs.com/kangz/p/10011348.html
https://www.cnblogs.com/pontoon/p/10459471.html