>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
声明:仅学习参考
版本:version_0
说明:主要是通过selenium拿到网页源码,然后通过lxml进行解析,大部分时间也花在解析网页源码提取数据上面和写逻辑上面了
技术:selenium,lxml,json
在xpath中如果要提取子节点的所有文本信息,可以用 "li.xpath('string(xpath_path)')"
效果图:
源码:
import json
import re
import time
from pprint import pprint
from urllib import parse

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class QuNaErSpider():
    """Collect today's Beijing hotel listings from qunar.com.

    A Chrome WebDriver session loads each result page, the page source is
    parsed with lxml, and the extracted records are appended to
    ``qunaer_hotel_today_info.json``.
    """

    # XPath of the "next page" control while it is still clickable.
    NEXT_PAGE_XPATH = '//p[@class="next fl_right cur able"]'

    def __init__(self):
        # Starts a real Chrome session; it is closed at the end of run().
        self.driver = webdriver.Chrome()

    def save_info(self, content):
        """Append ``content`` to the output file as pretty-printed JSON.

        NOTE(review): every call appends one JSON value followed by a comma,
        so the resulting file is a comma-separated sequence of values and is
        not a single valid JSON document. The format is kept unchanged for
        compatibility with existing output files.

        Args:
            content: JSON-serialisable object (run() passes a list of dicts).
        """
        with open("qunaer_hotel_today_info.json", 'a+', encoding='utf-8') as f:
            f.write(json.dumps(obj=content, ensure_ascii=False, indent=4))
            f.write(",")
            print("写入完成")

    @staticmethod
    def _first_or_none(values):
        """Return the first element of ``values``, or None when it is empty."""
        return values[0] if values else None

    def parse_html(self, html_str, source_url):
        """Extract one record per hotel ``<li>`` from a result page.

        Args:
            html_str: HTML source of the result page.
            source_url: URL of that page; used to resolve relative hotel links.

        Returns:
            A list of dicts, one per hotel, with the keys ``hotel_name``,
            ``totel_href``, ``total_type``, ``hotel_price``, ``hotel_address``,
            ``hotel_comment`` and ``hotel_subject``. A field that is missing
            from the page is stored as None.
        """
        html_etree = etree.HTML(text=html_str)
        li_list = html_etree.xpath('//ul[contains(@id,"hotel_lst_body")]/li')
        current_page_info_list = []
        for li in li_list:
            item = {}

            item["hotel_name"] = self._first_or_none(
                li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@title'))

            hotel_href = self._first_or_none(
                li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@href'))
            # NOTE(review): the keys 'totel_href' and 'total_type' look like
            # typos for 'hotel_*'; they are kept so the output schema does
            # not change.
            item['totel_href'] = (
                parse.urljoin(base=source_url, url=hotel_href)
                if hotel_href else None
            )

            # The leading "." keeps the query relative to this <li>. Without
            # it the XPath searches the whole document and every record gets
            # the type of the first hotel on the page.
            item['total_type'] = self._first_or_none(
                li.xpath('.//div[@class="cont"]/p[@class="name"]/span[last()]/text()'))

            # string(...) concatenates all descendant text of the node; an
            # empty result is normalised to None.
            item['hotel_price'] = li.xpath('string(.//p[@class="price_new"])') or None

            item["hotel_address"] = self._first_or_none(
                li.xpath('.//div[@class="cont"]/p[@class="adress"]/text()'))

            item['hotel_comment'] = (
                li.xpath('string(.//div[@class="cont"]/p[@class="comm"])') or None)
            item['hotel_subject'] = (
                li.xpath('string(.//div[@class="cont"]/div[@class="subj rmb"])') or None)

            current_page_info_list.append(item)
        return current_page_info_list

    def into_first_page(self, driver, url=None):
        """Open ``url`` and click through to the first hotel result page.

        Args:
            driver: WebDriver instance to drive.
            url: Address handed to ``driver.get``.

        Returns:
            The same ``driver``, after the search button has been clicked.
        """
        driver.get(url)
        # Third entry of the header navigation (presumably the hotels tab).
        # find_element("xpath", ...) is used because the
        # find_element_by_xpath shortcut no longer exists in Selenium >= 4.3.
        hotel_element = driver.find_element(
            "xpath", '//div[contains(@class,"q_header_mnav")]/ul/li[3]')
        hotel_element.click()
        search_button = driver.find_element(
            "xpath",
            '//div[@class="G_searchIndex fl_left"]//div[@class="btn clearfix"]')
        search_button.click()
        # Fixed pause to give the result page time to render.
        time.sleep(1)
        return driver

    def _find_next_button(self, driver):
        """Return the clickable next-page element, or None if there is none."""
        try:
            return driver.find_element("xpath", self.NEXT_PAGE_XPATH)
        except NoSuchElementException:
            return None

    def run(self):
        """Scrape every result page, saving each page's records in turn.

        The browser is always closed, even when a step raises.
        """
        root_url = "https://www.qunar.com/"
        try:
            driver = self.into_first_page(driver=self.driver, url=root_url)
            while True:
                current_page_info_list = self.parse_html(
                    html_str=driver.page_source, source_url=driver.current_url)
                self.save_info(current_page_info_list)

                # A missing button (including on a single-page result) ends
                # the crawl instead of raising.
                nextpage_button = self._find_next_button(driver)
                if nextpage_button is None:
                    break
                nextpage_button.click()
                time.sleep(1)
        finally:
            self.driver.quit()


if __name__ == "__main__":
    obj = QuNaErSpider()
    obj.run()
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<