• selenium模块


    浏览器驱动

    from selenium import webdriver  # 用来驱动浏览器的
    from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
    from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
    from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
    import time
    # Create the Chrome driver instance used by the snippets below
    chrome = webdriver.Chrome()

    显隐等待

    隐式等待:在chrome.get('xxx')之前设置,对所有元素的查找都有效
    显式等待:在chrome.get('xxx')之后设置,只针对某个指定元素有效
    
    chrome.implicitly_wait(10)  # 隐式等待:在 get 之前设置
    wait = WebDriverWait(chrome, 10)  # 显式等待:在 get 之后设置

    网页前进后退

    try:
        # Load three pages in sequence so the browser history has entries
        # to step through.
        for url in ('https://www.baidu.com/',
                    'https://www.tmall.com/',
                    'https://www.jd.com/'):
            chrome.get(url)

        chrome.back()     # step back one history entry
        chrome.forward()  # step forward again
        time.sleep(5)     # keep the window open long enough to see the result
    finally:
        # Close the browser window whether or not navigation succeeded
        chrome.close()

    JS操作

    try:
        chrome.get('https://www.baidu.com/')

        # Run a JavaScript snippet inside the loaded page. A longer script
        # can be passed the same way as a triple-quoted string.
        alert_js = "alert('傻眼了吧!')"
        chrome.execute_script(alert_js)

        time.sleep(5)  # keep the window open long enough to see the alert
    finally:
        # Close the browser window whether or not the script ran
        chrome.close()

    find_elements_by_xpath

    操作示例:
    <html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> </div> </body> </html>

      使用路径表达式来选取 XML 文档中的节点或节点集

    方法:tag_name ,text ,get_attribute() ,img.location

    1.从根节点查找: /
    2.从全局查找: //
    3.查找某一层的下一层: //a/img
    4.查找多个: find_elements_by_xpath('//a') ,得到一个列表
    5.查找第3个a标签的img: find_elements_by_xpath('//div/a[3]/img') ,XPath 下标从 1 开始,不是从 0 开始的列表索引;
    6.查找id属性: find_elements_by_xpath('//*[@id = "images"]/a[3]/img')
    
    

    交互操作

    from selenium import webdriver  # 用来驱动浏览器的
    from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
    from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
    from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
    import time
    
    chrome = webdriver.Chrome()
    # Implicit wait: every element lookup below retries for up to 10 seconds
    chrome.implicitly_wait(10)
    try:
        chrome.get('https://www.tmall.com/')

        # First search: type a keyword and submit it with the Enter key.
        # find_element(By.ID, ...) is used instead of find_element_by_id,
        # which newer Selenium releases no longer provide.
        input_tag = chrome.find_element(By.ID, 'mq')
        input_tag.send_keys('时间革命')
        input_tag.send_keys(Keys.ENTER)

        # Second search: locate the search box again (the earlier reference
        # may be stale once the page has changed), replace its text, and
        # submit by clicking the search button this time.
        input_tag = chrome.find_element(By.ID, 'mq')
        input_tag.clear()
        input_tag.send_keys('唐诗三百首')

        button = chrome.find_element(
            By.XPATH, '//*[@class="mallSearch-input clearfix"]/button')
        button.click()

        time.sleep(5)  # keep the window open long enough to see the results

    finally:
        # Close the browser window whether or not the search succeeded
        chrome.close()
    自动查找商品
    try:
        chrome.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

        # Switch into the 'iframeResult' frame before locating the elements.
        # switch_to.frame() replaces the deprecated switch_to_frame();
        # chrome.switch_to.parent_frame() would switch back to the parent page.
        chrome.switch_to.frame('iframeResult')
        # find_element(By.ID, ...) is used instead of find_element_by_id,
        # which newer Selenium releases no longer provide.
        source = chrome.find_element(By.ID, 'draggable')
        target = chrome.find_element(By.ID, 'droppable')
        print(source, target)

        # Approach 1: let Selenium perform the whole drag in one action
        # ActionChains(chrome).drag_and_drop(source, target).perform()

        # Approach 2: press and hold the source, then move one pixel at a
        # time until the horizontal gap to the target has been covered
        ActionChains(chrome).click_and_hold(source).perform()
        distance = target.location.get('x') - source.location.get('x')
        s = 0
        while s < distance:
            ActionChains(chrome).move_by_offset(xoffset=1, yoffset=0).perform()
            s += 1

        # Let go of the mouse button to drop the element
        ActionChains(chrome).release().perform()

        time.sleep(5)  # keep the window open long enough to see the result

    finally:
        # Close the browser window whether or not the drag succeeded
        chrome.close()
    自动校验移动验证码

    爬取京东商品信息

    from selenium import webdriver  # 用来驱动浏览器的
    from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
    from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
    from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
    import time
    
    def drver_star(driver, key):
        """Scrape every result page of the current search and save the goods.

        For each page, the link, name, price and comment count of every
        'gl-item' entry inside the 'J_goodsList' container is printed and
        appended to '<key>.txt'. The function then follows the '下一页'
        (next page) link until no such link is found.

        Pages are walked in a loop rather than by recursion, so the driver
        is closed exactly once and a long result list cannot exhaust the
        recursion limit.

        Args:
            driver: a Selenium WebDriver already showing the first result page.
            key: the search keyword; also used as the output file name.
        """
        try:
            while True:
                div_obj = driver.find_element(By.ID, 'J_goodsList')
                li_list = div_obj.find_elements(By.CLASS_NAME, 'gl-item')

                # Open the output file once per page instead of once per item
                with open('%s.txt' % key, 'a', encoding='utf-8') as f:
                    for li in li_list:
                        # Link to the product detail page
                        detail_link = li.find_element(
                            By.CSS_SELECTOR, '.p-img a').get_attribute('href')

                        # Product name
                        g_name = li.find_element(By.CSS_SELECTOR, '.p-name em').text

                        # Product price
                        g_price = li.find_element(By.CSS_SELECTOR, '.p-price i').text

                        # Number of comments
                        g_commit = li.find_element(By.CSS_SELECTOR, '.p-commit a').text

                        # The template text is kept exactly as before so the
                        # printed and saved output is unchanged.
                        goods = '''
    
                ==============tank 商品信息 ================
                    商品链接: %s
                    商品名称: %s
                    商品价格: %s
                    评论人数: %s
                
    
                ''' % (detail_link, g_name, g_price, g_commit)
                        print(goods)
                        f.write(goods)

                # find_elements returns an empty list instead of raising when
                # nothing matches, which gives a clean stop condition.
                # NOTE(review): assumes the '下一页' link is absent on the last
                # page; if the site keeps a disabled link there, an extra
                # stop condition is needed - confirm against the live page.
                next_tags = driver.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
                if not next_tags:
                    break
                next_tags[0].click()
                time.sleep(2)  # give the next page time to start loading

            time.sleep(5)  # keep the window open briefly before closing
        finally:
            # Runs once, whether scraping finished or raised
            driver.close()
    
    if __name__ == '__main__':
        # Ask for the search keyword, open JD, submit the search, then hand
        # the driver to drver_star, which scrapes the result pages.
        key = input('请输入爬取的商品内容: ').strip()
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)
            driver.get('https://www.jd.com/')
            # find_element(By.ID, ...) is used instead of find_element_by_id,
            # which newer Selenium releases no longer provide.
            input_tag = driver.find_element(By.ID, 'key')
            input_tag.send_keys(key)
            input_tag.send_keys(Keys.ENTER)
        except Exception:
            # drver_star closes the driver itself, so close it here only when
            # setup fails before the hand-over; then re-raise the error.
            driver.close()
            raise
        drver_star(driver, key)
    商品信息
  • 相关阅读:
    材料用词积累
    SqlServer 数据库/数据表 拆分(分布式)【转】
    SqlServer 数据库读写分离【转】
    (整理)在REHL6.5上部署ASP.NET MVC
    (整理)MySQL_REHL6.5 安装MySQL5.5
    (转)查看SQLServer最耗资源时间的SQL语句
    (转)SQLServer查询数据库各种历史记录
    (转)SqlServer2008 数据库同步:发布、订阅
    (整理)SQL Server 2008 CDC 功能使用
    (整理)EF分页的实现
  • 原文地址:https://www.cnblogs.com/xuechengeng/p/10502623.html
Copyright © 2020-2023  润新知