• selenium知识点


    1. 导包

     from selenium import webdriver
     from selenium.webdriver.support.ui import WebDriverWait
     from selenium.webdriver.support import expected_conditions as ec
     from selenium.webdriver.common.by import By

    2. 创建 webdriver

    driver = webdriver.Chrome(executable_path=r'C:\python35\chromedriver.exe')
    # executable_path:chromedriver.exe的存放路径

    3. 操作

    1. 常用操作

    driver.page_source   # 响应页面,element标签
    driver.title         # 请求页面的title
    driver.current_url   # 请求的url
    driver.close()       # 关闭浏览器窗口
    driver.quit()        # 退出浏览器
    driver.current_window_handle  # 获取当前窗口的句柄
    driver.window_handles         # 获取所有窗口的句柄
    driver.switch_to.window()   # 切换浏览器窗口(旧写法 switch_to_window() 已弃用)
    driver.execute_script()     # 执行js脚本(在打开新窗口时用)
    driver.get_cookie()         # 获取cookie
    driver.find_element_by_class_name()   # 根据class查找标签
    driver.find_element_by_xpath()        # 根据xpath查找标签

    4. 例子 -- 爬取拉勾网招聘信息

    过程: 用 selenium 访问首页,然后用lxml解析首页上的每一个职位,获取职位的链接地址,然后再用selenium 访问该地址,用lxml提取职位详情的信息。

    • get_attribute('class')  # 获取标签的class属性值
    • self.driver.execute_script('window.open("https://www.baidu.com/")')  # 打开一个新窗口 (请求详情页)  通过执行js脚本打开新的窗口
    • self.driver.switch_to.window(self.driver.window_handles[1])   # 将driver切换到新窗口(详情页窗口)
    • WebDriverWait(driver=self.driver, timeout=10).until(
          ec.presence_of_element_located((By.XPATH, '//span[@class="name"]')))  # 显式等待(以职位详情页出现职位标题为条件):在等待过程中如果出现要寻找的标签,则结束等待;如果一直没有出现,则最多等待10s,然后抛出超时异常。注意:这里只能定位某个标签,不能取标签里面的值,如这里的span标签中的值。
    • 注意:在切换到新窗口后,如果想继续在原始窗口执行操作,需要再重新切回原始窗口,如这里的由详情页窗口切换到列表页窗口。
    import re
    import time
    import csv
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as ec
    from selenium.webdriver.support.ui import WebDriverWait
    
    
    class LaGouSpider:
        """Scrape Python job postings from lagou.com and append them to ``position.csv``.

        The list page is loaded in the main browser window. Every job link found
        on it is opened in a second window, parsed with lxml, written to the CSV
        file, and the second window is closed again before the next link is
        visited.
        """

        # Noise removed from scraped text fragments: newlines, slashes and
        # non-breaking spaces.
        _NOISE_RE = re.compile(r'\n|/|\xa0')
        # A publish time is recognised either as "<digits>天<one char>" or as
        # "<digits>:<digits>".
        _PUBLISH_TIME_RE = re.compile(r'\d+天\w|\d+:\d+')

        def __init__(self):
            """Start Chrome and initialise the crawl state."""
            # Location of the chromedriver binary; adjust for the local machine.
            self.driver = webdriver.Chrome(executable_path=r'C:\python35\chromedriver.exe')
            self.init_url = 'https://www.lagou.com/zhaopin/Python/'
            # Cleared by request_list_page() once the "next" button is disabled.
            self.next_page = True
            self.position = None
            self.csv_header = ['职位名称', '职位要求', '薪水', '职位标签', '职位诱惑', '职位详情', '发布时间']
            # Set once this instance has dealt with the CSV header row.
            self.is_writer_header = False

        def request_list_page(self, url=None):
            """Scrape every job linked from the list page, then move to the next page.

            Args:
                url: List-page URL to load first. When omitted, the page currently
                    shown in the browser is scraped instead.
            """
            if url:
                self.driver.get(url)
            html = etree.HTML(self.driver.page_source)
            # Collect the detail-page link of every job on this list page.
            links = html.xpath('//a[@class="position_link"]/@href')
            for link in links:
                self.request_detail_page(link)
                time.sleep(1)
            # The last anchor of the pager is the "next page" button.
            next_btn = self.driver.find_element(By.XPATH, '//div[@class="pager_container"]/a[last()]')
            if 'page_no pager_next_disabled' not in next_btn.get_attribute('class'):
                next_btn.click()
            else:
                self.next_page = False

        def request_detail_page(self, url):
            """Open ``url`` in a new window, parse it, then return to the list window.

            Args:
                url: Address of a job detail page.

            Raises:
                selenium.common.exceptions.TimeoutException: if the job title
                    does not appear within 10 seconds.
            """
            list_window = self.driver.current_window_handle
            # Hand the URL over as a script argument rather than splicing it into
            # the JavaScript source, so quotes in a scraped URL cannot break it.
            self.driver.execute_script('window.open(arguments[0])', url)
            # Point the driver at the newly opened (detail) window.
            self.driver.switch_to.window(self.driver.window_handles[1])
            try:
                # Explicit wait: continue as soon as the job title is present.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    ec.presence_of_element_located((By.XPATH, '//span[@class="name"]')))
                self.parse_detail_page(self.driver.page_source)
            finally:
                # Close the detail window and go back to the list window even
                # when waiting or parsing failed, so no stray window is left.
                self.driver.close()
                self.driver.switch_to.window(list_window)

        @classmethod
        def _clean_texts(cls, fragments):
            """Strip noise from text fragments and drop the ones left empty."""
            stripped = (cls._NOISE_RE.sub('', text).strip() for text in fragments or ())
            return [text for text in stripped if text]

        @classmethod
        def _parse_publish_time(cls, fragments):
            """Return the publish time found at the start of the first fragment, or None."""
            if not fragments:
                return None
            found = cls._PUBLISH_TIME_RE.match(fragments[0].strip())
            return found.group() if found else None

        def parse_detail_page(self, source):
            """Extract one job record from detail-page HTML and write it to the CSV.

            Args:
                source: HTML source of a job detail page.
            """
            html = etree.HTML(source)
            position_name = html.xpath('//span[@class="name"]/text()')
            job_request = html.xpath('//dd[@class="job_request"]/p//text()')
            position_label = html.xpath('//ul[@class="position-label clearfix"]//text()')
            publish_time = html.xpath('//p[@class="publish_time"]//text()')
            job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')
            job_detail = html.xpath('//div[@class="job-detail"]/p/text()')

            # Clean the data; a missing node yields an empty list, not an error.
            position_name = position_name[0] if position_name else None
            job_request = self._clean_texts(job_request)
            # The first cleaned fragment is taken as the salary.
            salary = job_request[0] if len(job_request) > 2 else None
            # Fragments 2-4 are kept as the job requirements.
            job_request = ','.join(job_request[2:5])
            position_label = ','.join(self._clean_texts(position_label))
            job_advantage = job_advantage[0] if job_advantage else None
            print(publish_time)

            print(salary)
            publish_time = self._parse_publish_time(publish_time)
            job_detail = ','.join(a.strip() for a in job_detail) if job_detail else None
            position = {
                '职位名称': position_name,
                '职位要求': job_request,
                '薪水': salary,
                '职位标签': position_label,
                '职位诱惑': job_advantage,
                '职位详情': job_detail,
                '发布时间': publish_time,
            }
            self.write_csv(position)

        def write_csv(self, position):
            """Append one job record to ``position.csv``.

            Args:
                position: Mapping from the column names in ``self.csv_header`` to
                    the values of one job.
            """
            with open('position.csv', 'a+', encoding='utf-8', newline='') as f:
                writer = csv.DictWriter(f, self.csv_header)
                if not self.is_writer_header:
                    # The file is opened for appending, so position 0 means it
                    # is empty; an existing file already has its header row.
                    if f.tell() == 0:
                        writer.writeheader()
                    self.is_writer_header = True
                writer.writerow(position)

        def run(self):
            """Crawl from the first list page until there is no next page."""
            self.request_list_page(self.init_url)
            while self.next_page:
                self.request_list_page()
    
    if __name__ == '__main__':
        # Script entry point: run one full crawl.
        lagou = LaGouSpider()
        try:
            lagou.run()
        finally:
            # Always shut the browser down, even when the crawl fails part-way,
            # so no Chrome/chromedriver process is left behind.
            lagou.driver.quit()

     5. Selenium 的 WebDriverWait

    https://blog.csdn.net/duzilonglove/article/details/78455051
  • 相关阅读:
    数据库设计
    java 的继承,深入理解
    ant 使用笔记
    Effective C++ 精要(第七部分:模板与泛型编程)
    Effective C++ 精要(第八部分:定制new和delete)
    求数组的子数组之和的最大值
    Effective C++ 精要(第四部分:设计与声明)
    STL的容器中存储对象和指针的利和弊
    (zz)Why Memory Barrier
    理解smart pointer之二:如何实现一个smart pointer
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/10334746.html
Copyright © 2020-2023  润新知