• 爬虫四 selenium + phantomjs & Headless Chrome


    一、selenium操作谷歌浏览器

     from selenium import webdriver
     from selenium.webdriver.common.by import By
     import time

     # Section 1: drive a visible Chrome window through a Baidu search.
     #
     # webdriver.Chrome() with no arguments expects chromedriver to be found
     # on PATH.  If it is not, pass its location explicitly instead, e.g.:
     #   path = r'D:\path\to\chromedriver.exe'
     #   browser = webdriver.Chrome(executable_path=path)
     browser = webdriver.Chrome()
     try:
         print(browser)

         url = 'https://www.baidu.com/'
         browser.get(url)
         time.sleep(3)  # fixed pause so the page can finish rendering

         # Locate the search box and type the query.
         # (Named search_box rather than `input` to avoid shadowing the builtin.)
         search_box = browser.find_element(By.ID, 'kw')
         search_box.send_keys('菊花')

         # Locate the search button and click it.
         button = browser.find_element(By.ID, 'su')
         button.click()
         time.sleep(3)

         # Click the image result identified by its CSS class.
         img = browser.find_element(By.CLASS_NAME, 'op-img-address-link-imgs')
         img.click()
         time.sleep(5)  # leave the result on screen for a moment
     finally:
         # Always shut the browser down, even if a lookup above fails, so no
         # browser/driver process is left running.
         browser.quit()

    二、selenium操作phantomjs

     from selenium import webdriver
     import time

     # Section 2: the same Baidu walkthrough using PhantomJS.  Because there
     # is no window, the optional steps below (left commented out, enable the
     # ones you want to try) use screenshots to show what the browser did.
     browser = webdriver.PhantomJS()
     try:
         time.sleep(3)

         # Open Baidu.
         # url = 'https://www.baidu.com/'
         # browser.get(url)
         # time.sleep(3)

         # Take a screenshot.
         # browser.save_screenshot(r'image/baidu.png')
         # time.sleep(2)

         # Locate the search box and type the query.
         # search_box = browser.find_element_by_id('kw')
         # search_box.send_keys('菊花')
         # browser.save_screenshot(r'image/ju.png')
         # time.sleep(2)

         # Locate the search button, click it, and capture the result page.
         # button = browser.find_element_by_id('su')
         # button.click()
         # time.sleep(3)
         # browser.save_screenshot(r'image/hua.png')
     finally:
         # Always terminate the browser so the PhantomJS process is not leaked.
         browser.quit()

    三、phantomjs下拉滚动条

    from selenium import  webdriver
    import time
    
    # Section 3: open a page in PhantomJS, scroll it down, and save
    # screenshots plus the final HTML.
    browser = webdriver.PhantomJS()
    try:
        time.sleep(3)

        url = 'https://dig.chouti.com/all/man/'
        browser.get(url)     # open the Chouti page
        time.sleep(3)
        browser.save_screenshot(r'image/chouti.png')    # before scrolling

        # Scroll the viewport down.  window.scrollTo is used instead of
        # assigning document.body.scrollTop, which has no effect on
        # standards-mode pages in current browsers.
        js = 'window.scrollTo(0, 10000)'
        browser.execute_script(js)
        time.sleep(3)

        browser.save_screenshot(r'image/chouti2.png')    # after scrolling

        # Grab the page source and save it to a file.
        html = browser.page_source

        with open(r'image/chouti.html', 'w', encoding='utf8') as fp:
            fp.write(html)
    finally:
        # Always terminate the browser so the PhantomJS process is not leaked.
        browser.quit()

    四、例子-下拉式动态加载

    from selenium import  webdriver
    import time
    
    # Section 4: save a page's HTML before and after scrolling.  The page
    # loads content dynamically, so the two saved files differ.
    browser = webdriver.PhantomJS()
    try:
        time.sleep(3)

        # Architecture pictures listing.
        url = 'http://sc.chinaz.com/tupian/tesejianzhutupian.html'

        # Open the page and save its source as first loaded.
        browser.get(url)
        time.sleep(3)
        with open(r'html/jianzhu1.html', 'w', encoding='utf8') as fp:
            fp.write(browser.page_source)

        # Scroll down and save the source again.  window.scrollTo is used
        # instead of assigning document.body.scrollTop, which has no effect
        # on standards-mode pages in current browsers.
        js = 'window.scrollTo(0, 10000)'
        browser.execute_script(js)
        time.sleep(3)
        with open(r'html/jianzhu2.html', 'w', encoding='utf8') as fp:
            fp.write(browser.page_source)
    finally:
        # Always terminate the browser so the PhantomJS process is not leaked.
        browser.quit()

    五、例子-点击加载更多

    from selenium import  webdriver
    import time
    
    # Section 5: save a page's HTML before and after clicking its
    # "load more" button.
    browser = webdriver.PhantomJS()
    try:
        time.sleep(3)

        # Douban "classic" movie listing.
        url = 'https://movie.douban.com/explore#!type=movie&tag=%E7%BB%8F%E5%85%B8&sort=recommend&page_limit=20&page_start=20'

        # Open the page and save its source as first loaded.
        browser.get(url)
        time.sleep(3)
        with open(r'html/dianying1.html', 'w', encoding='utf8') as fp:
            fp.write(browser.page_source)

        # Find the "load more" button by its CSS class, click it, then save
        # the source again.
        show_more = browser.find_element_by_class_name('more')
        show_more.click()
        time.sleep(3)
        with open(r'html/dianying2.html', 'w', encoding='utf8') as fp:
            fp.write(browser.page_source)
    finally:
        # Always terminate the browser so the PhantomJS process is not leaked.
        browser.quit()

    六、headless chrome的使用

    from selenium import  webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    
    # Section 6: run Chrome without a visible window and take a screenshot.

    # Build an options object so the browser starts in headless mode.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # Location of the chromedriver executable.
    # NOTE(review): this literal contains no directory separators, so it does
    # not look like a usable Windows path (the backslashes were presumably
    # lost) -- replace it with the real location, written as a raw string.
    path = 'D:googlediverchromedriver.exe'

    # Create the browser object.
    # NOTE(review): newer Selenium releases prefer `options=` over
    # `chrome_options=` -- confirm against the installed version.
    browser = webdriver.Chrome(executable_path=path,chrome_options=chrome_options)
    try:
        # Visit the page.
        url = 'http://www.baidu.com/'
        browser.get(url)
        time.sleep(3)

        browser.save_screenshot('image/wu.png')
    finally:
        # Quit in a finally block so the headless browser is shut down even
        # when loading the page or saving the screenshot fails.
        browser.quit()
  • 相关阅读:
    ActiveMQ
    Solr学习笔记(4) —— SolrCloud的概述和搭建
    Solr学习笔记(3) —— SolrJ管理索引库&集群
    JAVA 平台
    JMS(Java平台上的专业技术规范)
    zookeeper 分布式管理
    java 类型转换
    聚集索引 非聚类索引 区别 二
    聚集索引 非聚类索引 区别
    阶乘
  • 原文地址:https://www.cnblogs.com/Finance-IT-gao/p/11130816.html
Copyright © 2020-2023  润新知