• python-selenium实现的简易下载器,及常见错误的解决方法


    简易下载器的实现

    支持设置代理、失败重试,并在返回前确保页面中已出现指定ID的元素(可根据需求自行修改)

    # coding: utf-8
    from Utils import logging
    from bs4 import BeautifulSoup as bs
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.proxy import ProxyType
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    
    class HtmlDownloader:
        """Minimal page downloader built on a PhantomJS WebDriver.

        Supports an optional HTTP proxy, waits for an element with a given ID
        to be present before returning the page, and can retry failed
        downloads.
        """

        def __init__(self):
            self.driver = webdriver.PhantomJS()

        def _restartSession(self, proxy):
            """Start a new driver session whose capabilities carry *proxy*."""
            # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared
            # class-level dict, and mutating it would leak the proxy setting
            # into every other driver created in this process.
            capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
            proxy.add_to_capabilities(capabilities)
            self.driver.start_session(capabilities)

        def setProxy(self, proxyStr):
            """Open a new session that routes HTTP traffic through *proxyStr*."""
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = proxyStr
            self._restartSession(proxy)

        def rmProxy(self):
            """Open a new session that connects directly (ProxyType.DIRECT)."""
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.DIRECT
            # The original called start_session on an undefined name `browser`;
            # the session must be restarted on this instance's own driver.
            self._restartSession(proxy)

        def download(self, returnType, url, ensureId, proxyStr=None, timeout=30):
            """Fetch *url* and return the page once element *ensureId* is present.

            Args:
                returnType: "html" for the raw page source, "bs" for a
                    BeautifulSoup tree parsed with lxml.
                url: Address to load.
                ensureId: ID of an element that must be present in the page.
                proxyStr: HTTP proxy to use; a false value means a direct
                    connection.
                timeout: Seconds to wait for *ensureId* to appear.

            Returns:
                The page in the requested form, or None when the wait or the
                conversion fails (the error is logged, not raised).
            """
            # Every download starts a fresh session (with or without a proxy).
            if proxyStr:
                self.setProxy(proxyStr)
            else:
                self.rmProxy()
            self.driver.get(url)
            # special for xxx.com
            # your code here
            # ensure for some element
            try:
                WebDriverWait(self.driver, timeout).until(
                    EC.presence_of_element_located((By.ID, ensureId)))
                pageSource = self.driver.page_source
                if returnType == "html":
                    downloadResult = pageSource
                elif returnType == "bs":
                    downloadResult = bs(pageSource, 'lxml')
                else:
                    # Fail with a clear message instead of the
                    # UnboundLocalError the original ran into here.
                    raise ValueError("unsupported returnType: %r" % (returnType,))
                logging("i", "download %s bytes" % len(pageSource))
                return downloadResult
            except Exception as e:
                logging("e", str(e))
            finally:
                # Closes the current window; the next download() opens a new
                # session through setProxy()/rmProxy() before loading a page.
                self.driver.close()

        def safeDownload(self, returnType, url, ensureId, proxyStr=None, maxTries=5):
            """Call download() until it yields a truthy result or tries run out.

            Args:
                returnType, url, ensureId, proxyStr: Passed through to
                    download().
                maxTries: Maximum number of attempts before giving up.

            Returns:
                The first truthy download result, or the last (falsy) result
                after *maxTries* failed attempts.
            """
            failTimes = 0
            while True:
                downloadResult = self.download(returnType, url, ensureId, proxyStr)
                if downloadResult:
                    return downloadResult
                failTimes += 1
                if failTimes >= maxTries:
                    logging("w", "failed %s times, will abort" % failTimes)
                    return downloadResult
                logging("w", "failed %s times, will retry" % failTimes)
    

    元素不可见导致不能操作的错误

    # ElementNotVisibleException: Message: {"errorMessage":"Element is not currently visible and may not be manipulated"
    # Screenshot: available via screen
    

    首先尝试设定窗口大小:

    self.driver.set_window_size(1024, 768)
    

    不行的话再尝试滚动页面,如滚动到底部:

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    本文原创发表于http://www.cnblogs.com/qijj,转载请保留此声明。
  • 相关阅读:
    .NET逻辑分层架构总结
    ASP.NET MVC 4 的JS/CSS打包压缩功能-------过滤文件
    c#实现浏览器端大文件分块上传
    fckeditor如何能实现直接粘贴把图片上传到服务器中?
    web编辑器直接粘贴图片实现
    富文本编辑器直接粘贴图片实现
    百度ueditor编辑器直接粘贴图片实现
    百度编辑器直接粘贴图片实现
    fckeditor直接粘贴图片实现
    wangEditor直接粘贴图片实现
  • 原文地址:https://www.cnblogs.com/qijj/p/6225459.html
Copyright © 2020-2023  润新知