• 爬虫请求库之selenium


    一:简介

    1:介绍

    (1)selenium最初是测试工具

    (2)爬虫使用该模块的原因是requests无法执行js代码 而selenium可以执行js代码

    (3)selenium本质是操作浏览器内核 完全模拟浏览器行为 例如 输入内容 点击等

    (4)因为直接操作浏览器 我们无需考虑请求头等

    2:支持的浏览器

    # Any of the following drivers can be used; each requires the matching
    # browser driver binary to be installed on the machine.
    from selenium import webdriver
    browser=webdriver.Chrome()
    browser=webdriver.Firefox()
    browser=webdriver.PhantomJS()
    browser=webdriver.Safari()
    browser=webdriver.Edge() 

    二:安装使用

    1:安装

    pip3 install selenium
    下载chromedriver.exe放到python安装路径的scripts目录中即可
    国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/
    最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads

    #注意:
    selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
    下载链接:https://github.com/mozilla/geckodriver/releases

    2:基本使用

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys #键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
    1、find_element_by_id   根据id找
    2、find_element_by_link_text     根据链接名字找到控件(a标签的文字)
    3、find_element_by_partial_link_text   根据链接名字找到控件(a标签的文字)模糊查询
    4、find_element_by_tag_name       根据标签名
    5、find_element_by_class_name     根据类名
    6、find_element_by_name           根据属性名 例如 name = 'xxx'
    7、find_element_by_css_selector   根据css选择器
    8、find_element_by_xpath          根据xpath选择

    3:显式等待与隐式等待的作用

    (1)大部分网页可能都是由ajax + js开发的 加载需要一定的时间 当我们通过代码进行操作的时候 可能有的标签还没渲染出来

     (2)通过设置等待时间让标签能够被加载出来

    4:显式等待

    (1)设置最大的等待时间 

    (2)如果指定查询的元素在规定时间内查找出来 便会执行下一行代码

    (3)如果在规定时间内没有查询出指定的元素便会抛出异常TimeoutException

    # Explicit wait: poll for up to 10 seconds until the element with
    # id 'content_left' is present; raises TimeoutException on timeout.
    wait=WebDriverWait(browser,10)
    wait.until(EC.presence_of_element_located((By.ID,'content_left')))

    5:隐式等待

    (1)设置最大的等待时间

    (2)如果在规定时间内完成页面加载 会执行下一步

    (3)否则会一直等到时间结束才会执行下一步

    from selenium import webdriver
    
    bro=webdriver.Chrome()
    bro.get("http://www.baidu.com")
    bro.implicitly_wait(10)   # implicit wait: every element lookup retries for up to 10s

    6:模拟百度进行登录案例

    from selenium import webdriver
    import time
    
    
    def login(url,browser):
        """Walk through Baidu's username/password login dialog, grab the
        cookies, then close the browser window.

        NOTE(review): the `url` parameter is never used inside this
        function - the caller is expected to have navigated already.
        """
        login_button = browser.find_element_by_link_text('登录')  # locate the login link by its text

        login_button.click()  # open the login dialog

        time.sleep(1)

        login_type = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')  # switch to username/password mode
        login_type.click()


        input_username = browser.find_element_by_id('TANGRAM__PSP_10__userName')  # username input box
        input_username.send_keys("123")  # type the username
        input_password = browser.find_element_by_id('TANGRAM__PSP_10__password')  # password input box
        input_password.send_keys("123")  # type the password
        login_submit = browser.find_element_by_id('TANGRAM__PSP_10__submit')  # submit button

        login_submit.click()  # submit the form

        cookie = browser.get_cookies()  # all cookies; use get_cookie(name) for a single one

        browser.close()   # close the current browser window
    
    
    
    if __name__ == '__main__':
        browser = webdriver.Chrome()

        # Implicit wait: every element lookup retries for up to 10s.
        browser.implicitly_wait(10)

        # BUG FIX: webdriver's get() returns None, so the original
        # `url = browser.get(...)` passed None into login(). Keep the URL
        # string in its own variable and navigate explicitly.
        url = 'https://www.baidu.com'
        browser.get(url)

        login(url, browser)
    模拟百度登录

     三:xpath

    doc='''
    <html>
     <head>
      <base href='http://example.com/' />
      <title>Example website</title>
     </head>
     <body>
      <div id='images'>
       <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
       <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
       <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
       <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
       <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
       <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
      </div>
     </body>
    </html>
    '''
    from lxml import etree
    
    html=etree.HTML(doc)
    # html=etree.parse('search.html',etree.HTMLParser())
    # 1 all nodes
    a=html.xpath('//*')    # matches every tag in the document
    # 2 a specific tag (the result is always a list)
    # a=html.xpath('//head')
    # 3 direct children vs all descendants
    a=html.xpath('//div/a')
    a=html.xpath('//body/a') # empty: the a tags are not direct children of body
    a=html.xpath('//body//a')
    # 4 parent node
    # a=html.xpath('//body//a[@href="image1.html"]/..')
    a=html.xpath('//body//a[1]/..')  # XPath positions are 1-based
    # equivalently:
    a=html.xpath('//body//a[1]/parent::*')
    # 5 attribute match
    a=html.xpath('//body//a[@href="image1.html"]')
    
    # 6 text extraction
    a=html.xpath('//body//a[@href="image1.html"]/text()')
    a=html.xpath('//body//a/text()')
    
    # 7 attribute extraction
    # a=html.xpath('//body//a/@href')
    # positions start at 1, not 0
    a=html.xpath('//body//a[2]/@href')
    # 8 multi-valued attribute match
    # an element with several classes will not match an exact @class test; use contains()
    # a=html.xpath('//body//a[@class="li"]')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 9 multiple attribute conditions
    a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
    a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 10 selecting by position
    a=html.xpath('//a[2]/text()')
    a=html.xpath('//a[2]/@href')
    # the last one
    a=html.xpath('//a[last()]/@href')
    # the ones at position < 3
    a=html.xpath('//a[position()<3]/@href')
    # the second-to-last one
    # FIX: last()-2 selects the third-to-last node; second-to-last is last()-1
    a=html.xpath('//a[last()-1]/@href')
    # 11 node axes
    # ancestor: all ancestor nodes (with *)
    a=html.xpath('//a/ancestor::*')
    # only the div among the ancestors
    a=html.xpath('//a/ancestor::div')
    # attribute: attribute values
    a=html.xpath('//a[1]/attribute::*')
    # child: direct children
    a=html.xpath('//a[1]/child::*')
    # descendant: all descendants
    a=html.xpath('//a[6]/descendant::*')
    # following: every node after the current one, in document order
    a=html.xpath('//a[1]/following::*')
    a=html.xpath('//a[1]/following::*[1]/@href')
    # following-sibling: later siblings only
    a=html.xpath('//a[1]/following-sibling::*')
    a=html.xpath('//a[1]/following-sibling::a')
    a=html.xpath('//a[1]/following-sibling::*[2]/text()')
    a=html.xpath('//a[1]/following-sibling::*[2]/@href')
    
    print(a)
    xpath

    四:获取元素属性

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By # lookup strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait # explicit waits for page elements
    
    browser=webdriver.Chrome()
    
    browser.get('https://www.amazon.cn/')
    
    # Explicit wait: block up to 10s until the image container is present.
    wait=WebDriverWait(browser,10)
    wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer')))
    
    tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img')
    
    # read an attribute off the element
    print(tag.get_attribute('src'))
    
    # element id, position, tag name and size (informational)
    print(tag.id)
    print(tag.location)
    print(tag.tag_name)
    print(tag.size)
    
    browser.close()
    获取元素属性

    五:元素交互操作

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By # lookup strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait # explicit waits for page elements
    
    browser=webdriver.Chrome()
    browser.get('https://www.amazon.cn/')
    wait=WebDriverWait(browser,10)
    
    
    # First search: wait for the search box, type a query, click search.
    input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox')))
    input_tag.send_keys('iphone 8')
    button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
    button.click()
    
    import time
    time.sleep(3)
    
    # Second search: re-find the box, clear it, and search again.
    input_tag=browser.find_element_by_id('twotabsearchtextbox')
    input_tag.clear() # empty the input box
    input_tag.send_keys('iphone7plus')
    button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
    button.click()
    元素交互信息
    # FIX: the scraped snippet was broken by stray markdown ``` fences and
    # lost indentation inside the try block; reconstructed as a valid script.
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # lookup strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # explicit waits for page elements
    import time

    driver = webdriver.Chrome()
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    wait = WebDriverWait(driver, 3)
    # driver.implicitly_wait(3)  # alternative: implicit wait

    try:
        driver.switch_to.frame('iframeResult')  # the demo lives inside an iframe
        sourse = driver.find_element_by_id('draggable')
        target = driver.find_element_by_id('droppable')

        # Option 1: one action chain, queued then executed serially
        # actions = ActionChains(driver)  # get an action-chain object
        # actions.drag_and_drop(sourse, target)  # queue the drag-and-drop
        # actions.perform()

        # Option 2: a fresh chain per small move (simulates human dragging)
        ActionChains(driver).click_and_hold(sourse).perform()
        distance = target.location['x'] - sourse.location['x']

        track = 0
        while track < distance:
            ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
            track += 2

        ActionChains(driver).release().perform()

        time.sleep(10)

    finally:
        driver.close()
    
    Action Chains
    滑动验证码
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By # lookup strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait # explicit waits for page elements
    
    
    # Run arbitrary JavaScript in the page; useful when an interaction is
    # hard to express through the WebDriver API.
    try:
        browser=webdriver.Chrome()
        browser.get('https://www.baidu.com')
        browser.execute_script('alert("hello world")') # pop up an alert box
    finally:
        browser.close()
    
    在交互动作比较难实现的时候可以自己写JS(万能方法)
    操作js

    六:其他操作

    # cookies
    from selenium import webdriver
    
    browser=webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())  # every cookie for the current domain
    browser.add_cookie({'k1':'xxx','k2':'yyy'})  # inject a custom cookie
    print(browser.get_cookies())
    
    # browser.delete_all_cookies()
    模拟浏览器前进后退
    # NOTE(review): this snippet is captioned "browser forward/back" in the
    # surrounding text, but it duplicates the cookies example above; it
    # presumably should demonstrate browser.back() / browser.forward() -
    # confirm against the original article.
    from selenium import webdriver
    
    browser=webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())
    browser.add_cookie({'k1':'xxx','k2':'yyy'})
    print(browser.get_cookies())
    cookie
    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')  # open a second tab via JS
    
    print(browser.window_handles)  # handles of every open tab
    # FIX: switch_to_window() was deprecated and removed in Selenium 4;
    # switch_to.window() is the supported API.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
    browser.close()
    选项卡管理
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
    
    # Demonstrates catching selenium's specific exception types; the frame
    # name below is deliberately misspelled to trigger NoSuchFrameException.
    try:
        browser=webdriver.Chrome()
        browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        browser.switch_to.frame('iframssseResult')
    
    except TimeoutException as e:
        print(e)
    except NoSuchFrameException as e:
        print(e)
    finally:
        browser.close()
    异常处理

     七:爬取示例

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    import time
    import os, requests, hashlib
    
    path = str(time.time())
    
    
    def get_good(bro):
        """Scrape the goods on the current JD result page, download each
        product image into ./photo, then click "next page" and recurse.

        The recursion ends when the next-page element can no longer be
        found, which raises and is handled by the caller.
        """
        goods_list = bro.find_elements_by_class_name('gl-item')

        for good in goods_list:

            good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail link

            good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
            if not good_img_url:  # src is lazy-loaded until hovered; fall back to data-lazy-img
                good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
            good_price = good.find_element_by_css_selector('.p-price i').text
            good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
            good_comment = good.find_element_by_css_selector('.p-commit strong  a').text

            response = requests.get(good_img_url)
            good_name = good_brief.split(' ')[0][:5]

            # hash the image URL so goods sharing a name prefix don't collide
            md5 = hashlib.md5()
            md5.update(good_img_url.encode('utf-8'))
            file_name = '%s%s.jpg' % (good_name, md5.hexdigest())
            photo_path = 'photo'
            if not os.path.exists(photo_path):
                os.mkdir(photo_path)

            file_path = os.path.join(photo_path, file_name)

            with open(file_path, 'wb') as f:
                print("%s下载之中:" % good_brief)
                # FIX: iter_content() defaults to chunk_size=1 (one byte per
                # iteration); stream in reasonable chunks instead.
                for line in response.iter_content(chunk_size=8192):
                    f.write(line)

        next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # "next page" control

        time.sleep(1)

        next_page.click()

        time.sleep(1)
        get_good(bro)  # recurse onto the next result page
    
    
    if __name__ == '__main__':
    
        # Ask for a product name, search JD for it, then crawl the results.
        name = input('商品名>>:')
        bro = webdriver.Chrome()
        bro.get("https://www.jd.com")
        bro.implicitly_wait(10)  # every element lookup retries for up to 10s
        search_input = bro.find_element_by_id('key')
    
        search_input.send_keys(name)
        search_input.send_keys(Keys.ENTER)  # submit the search with Enter
    
        try:
            print('商品获取中')
            get_good(bro)
        except Exception as e:
            # get_good() recurses until the next-page element is missing,
            # so the crawl terminates here.
            print("结束")
        finally:
            bro.close()
    爬取京东商品
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    import time
    import pymysql
    
    
    def get_goods(bro):
        """Return the list of goods elements ('gl-item') on the current page."""
        return bro.find_elements_by_class_name('gl-item')
    
    
    def get_goods_info(goods_lists):
        """Extract detail URL, image URL, price, comment count and title
        from a goods element.

        NOTE(review): the `return` sits inside the for loop, so only the
        FIRST element of `goods_lists` is ever processed - probably a bug,
        but callers unpack a single 5-tuple, so fixing it would change the
        return shape.
        """
        for good in goods_lists:
    
    
            good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail link
    
            good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
            if not good_img_url:  # src is lazy-loaded until hovered; fall back to data-lazy-img
                good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
            good_price = good.find_element_by_css_selector('.p-price i').text
            good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
            good_comment = good.find_element_by_css_selector('.p-commit strong  a').text
    
            return good_detail_url, good_img_url, good_price, good_comment, good_brief
    
    
    def write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief):
        """Insert one product row into the `goods` table of database `syl`.

        Commits on success, rolls back and prints the error on failure,
        and always closes the connection.
        """
        db = pymysql.connect(host="localhost", user="root",
                             password="123", db="syl", port=3306)

        cur = db.cursor()

        # FIX: the original built the SQL with str.format()/repr(), leaving
        # good_price unquoted and the statement open to SQL injection since
        # the values come from scraped page content. Use a parameterized
        # query and let the driver do the escaping.
        sql_insert = ("insert into goods(good_detail,good_image,good_price,"
                      "good_comment,good_brief) values(%s,%s,%s,%s,%s)")

        try:
            cur.execute(sql_insert, (good_detail_url, good_img_url, good_price,
                                     good_comment, good_brief))

            # commit the transaction
            db.commit()

        except Exception as e:
            # roll back on any failure so the connection is left clean
            print(e)
            db.rollback()
        finally:

            db.close()
    
    
    def next_get_good():
        """Click JD's "next page" control, then hand back to main().

        NOTE(review): relies on the module-level `bro` driver and forms
        unbounded mutual recursion with main(); it terminates only when
        the next-page element is missing and an exception propagates.
        """
        next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")  # "next page" control
    
        time.sleep(1)
    
        next_page.click()
    
        time.sleep(1)
    
        bro.implicitly_wait(10)  # re-arm the implicit wait for the new page
    
        main(bro)
    
    
    def main(bro):
        """Scrape one result page: collect goods, extract the first good's
        info, persist it, then advance to the next page (which recurses
        back into this function via next_get_good()).
        """
        goods_lists = get_goods(bro)
    
        response_good_info = get_goods_info(goods_lists)
    
        # get_goods_info returns a single 5-tuple (detail, image, price, comment, brief)
        good_detail_url, good_img_url, good_price, good_comment, good_brief = response_good_info
    
    
        write_database( good_detail_url, good_img_url, good_price, good_comment, good_brief)
    
        next_get_good()
    
    if __name__ == '__main__':
    
    
        # Ask for a product name, search JD, then crawl results into MySQL.
        name = input('商品名>>:')
    
    
        bro = webdriver.Chrome()
    
        bro.implicitly_wait(10)  # every element lookup retries for up to 10s
        bro.get("https://www.jd.com")
    
        search_input = bro.find_element_by_id('key')
    
        search_input.send_keys(name)
        search_input.send_keys(Keys.ENTER)  # submit the search with Enter
        try:
            main(bro)
        except Exception as e:
            # the page-advance recursion ends by raising when there is no
            # next page; treat that as normal termination
            print(e)
            print("结束")
        finally:
            bro.close()
    爬取京东商品存入数据库
    from selenium import webdriver
    import requests
    
    import os
    import hashlib
    
    path = 'photo'
    
    
    def get_url(base_url):
        """Navigate the module-level `browser` to base_url and (re)arm a
        10s implicit wait for subsequent lookups.
        """
        browser.get(base_url)
    
        browser.implicitly_wait(10)
    
    
    def get_image_url():
        """Yield (once) the list of <img> elements in the gallery.

        NOTE(review): this is a generator that yields the whole list as a
        single item; the caller unwraps it with list(...)[0].
        """
        images_list = browser.find_elements_by_css_selector('.goods-item .figure-img img ')
    
        yield images_list
    
    
    def get_image(images):
        """Yield (once) the (src, alt) pair of one <img> element.

        NOTE(review): same one-shot-generator pattern as get_image_url();
        the caller unwraps it with list(...)[0].
        """
        image = images.get_attribute('src')
    
        image_title = images.get_attribute('alt')
    
        yield image, image_title
    
    
    def download_image(image, image_title):
        """Download `image` (a URL) into the module-level `path` directory
        as '<title><md5-of-title>.jpg'.
        """
        if not os.path.exists(path):  # create the output directory on first use
            os.mkdir(path)

        # hash the title so repeated titles don't overwrite each other
        md5 = hashlib.md5()
        md5.update(image_title.encode('utf-8'))

        file_name = '%s%s.jpg' % (image_title, md5.hexdigest())  # avoid duplicate file names

        file_path = os.path.join(path, file_name)  # build the target path

        response = requests.get(image)  # fetch the image bytes
        with open(file_path, 'wb') as f:
            print("%s下载之中:" % image_title)
            # FIX: iter_content() defaults to chunk_size=1 (one byte per
            # iteration); stream in reasonable chunks instead.
            for line in response.iter_content(chunk_size=8192):
                f.write(line)
    
    
    def main():
        """Visit every gallery page (count taken from the module-level
        `page_num`) and download each image on it.
        """
        for i in range(page_num):
            base_url = 'https://www.plmm.com.cn/tags-199-%s.html' % i
            get_url(base_url)
            # get_image_url() yields the element list once; unwrap it
            images_list = list(get_image_url())[0]
            for images in images_list:
                images_detail = list(get_image(images))[0]
    
                image_detail, image_title = images_detail
    
                download_image(image_detail, image_title)
    
    
    if __name__ == '__main__':
    
        request_url = 'https://www.plmm.com.cn/tags-199-0.html'
        browser = webdriver.Chrome()
        browser.implicitly_wait(10)  # every element lookup retries for up to 10s
        browser.get(request_url)
    
        page = browser.find_elements_by_class_name('page-num')   # pagination links
    
        # len() gives the number of linked pages; the first page has no
        # 'page-num' element of its own, so +1 accounts for it
        page_num = len(page) + 1
    
        try:
            main()
        except Exception as e:
            print(e)
        finally:
            print('爬取结束')
            browser.close()
    爬取漂亮美美网图片
  • 相关阅读:
    Maven学习
    Oracle_SQL函数-单行函数
    Java 8新特性-5 内建函数式接口
    量子优势
    配置Emeditor编译运行JAVA,附私家珍藏版
    配置Emeditor编译运行JAVA,附私家珍藏版
    Notepad2-mod,轻量级文本编辑器、代替记事本的最佳选择
    三星S7短信不能提示的处理方法
    三星S7短信不能提示的处理方法
    说说宾得机身的十大人性化设定和功能[转]
  • 原文地址:https://www.cnblogs.com/SR-Program/p/11944669.html
Copyright © 2020-2023  润新知