- selenium模块在爬虫中的使用
  - 概念:是一个基于浏览器自动化的模块。
  - 与爬虫之间的关联:
    - 便捷地捕获动态加载的数据(可见即可得)
    - 实现模拟登录
  - 环境安装:pip install selenium
  - 基本使用:
    - 准备好某一款浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html
    - 版本的映射关系:https://blog.csdn.net/huilan_same/article/details/51896672
    - 实例化某一款浏览器对象
  - 动作链:
    - 一系列连续的动作
    - 在实现标签定位时,如果发现定位的标签存在于iframe标签之中,则在定位前必须先执行一个固定的操作:bro.switch_to.frame('id')
  - 无头浏览器的操作:无可视化界面的浏览器
    - PhantomJS:已停止更新
    - 谷歌无头浏览器
  - 让selenium规避检测
自动化京东搜索关键字
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# Automate a keyword search on jd.com and print the resulting page source.
# NOTE(review): `executable_path` is the Selenium 3 / early Selenium 4 way of
# pointing at the driver binary -- confirm against the installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    sleep(1)

    # Locate the search box and type the keyword into it.
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('mac pro')

    # Locate the search button and click it.
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # Execute JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    page_text = bro.page_source
    print(page_text)
    sleep(2)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
自动化抓取动态加载数据
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect the rendered HTML of the first page plus the next three pages, then
# parse every collected page and print "<title>:<num>" for each list entry.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('http://125.35.6.84:81/xk/')
    sleep(1)

    # page_source is the DOM after the browser has run the page's scripts.
    page_text_list = [bro.page_source]
    for _ in range(3):
        # Click the "next page" control, wait briefly, snapshot the page.
        bro.find_element(By.ID, 'pageIto_next').click()
        sleep(1)
        page_text_list.append(bro.page_source)

    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        for li in tree.xpath('//ul[@id="gzlist"]/li'):
            # Both values are read from `title` attributes; [0] raises
            # IndexError if an entry lacks the expected <dl>/<ol> child.
            title = li.xpath('./dl/@title')[0]
            num = li.xpath('./ol/@title')[0]
            print(title + ':' + num)
    sleep(2)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
动作链
from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Drag the "draggable" element of the demo page in five small steps.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The target element is inside an iframe, so the driver has to switch
    # into that frame before the element can be located.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element(By.ID, 'draggable')

    # Drag = press and hold + move + release.
    # Each step uses a fresh ActionChains object and is performed right away,
    # so actions queued by an earlier step cannot be replayed by a later
    # perform() call.
    ActionChains(bro).click_and_hold(div_tag).perform()
    for _ in range(5):
        ActionChains(bro).move_by_offset(17, 5).perform()
        sleep(0.5)

    # The release must be performed too; merely queueing it (as the original
    # did) never lets go of the mouse button.
    ActionChains(bro).release().perform()
    sleep(3)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
12306登录
超级鹰:
import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying image-recognition upload API.

    The constructor stores the account parameters that are sent with every
    request; ``PostPic`` uploads an image and ``ReportError`` reports a
    previously uploaded image by its ID.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Prepare the shared request parameters and headers.

        username: account name, sent as ``user``.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as ``pass2``).
        soft_id: software ID, sent as ``softid``.
        timeout: seconds to wait for each HTTP request before requests raises
            a timeout error; without it a stalled server would block forever.
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image and return the decoded JSON response.

        im: image bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report an image by ID and return the decoded JSON response.

        im_id: image ID of the question being reported as wrong
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
12306自动登录主体代码:
from time import sleep

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

from Cjy import Chaojiying_Client


def get_text(imgPath, imgType):
    """Send the image at imgPath to Chaojiying and return its 'pic_str' field.

    imgPath: path of the image file to upload.
    imgType: question type code passed through to ``PostPic``.
    """
    # NOTE(review): account credentials are hard-coded here; consider loading
    # them from the environment or a config file instead.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read through a context manager so the file handle is closed promptly.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(5)

    # Screenshot the whole page, then cut the captcha image out of it using
    # the element's reported position and size.
    bro.save_screenshot('main.png')
    code_img_tag = bro.find_element(
        By.XPATH, '//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = code_img_tag.location
    size = code_img_tag.size
    # Crop region as (left, upper, right, lower).
    # NOTE(review): assumes screenshot pixels map 1:1 to element coordinates;
    # verify on displays with scaling enabled.
    rangle = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )
    with Image.open('./main.png') as screenshot:
        frame = screenshot.crop(rangle)
        frame.save('code.png')

    # The service answers with click coordinates, e.g.
    # '55,70|267,133' -> [[55, 70], [267, 133]]
    result = get_text('./code.png', 9004)
    all_list = []
    # split('|') also yields a single item when there is no '|', so one loop
    # covers both the multi-point and the single-point answer.
    for pair in result.split('|'):
        parts = pair.split(',')
        all_list.append([int(parts[0]), int(parts[1])])
    print(all_list)

    for x, y in all_list:
        # A fresh chain per click so each one is performed on its own.
        # NOTE(review): the origin used for the offset differs between
        # Selenium versions -- confirm against the installed version.
        ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
        sleep(1)

    bro.find_element(By.ID, 'username').send_keys('123456')
    sleep(1)
    bro.find_element(By.ID, 'password').send_keys('67890000000')
    sleep(1)
    bro.find_element(By.ID, 'loginSub').click()
    sleep(5)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
selenium其他操作
from time import sleep

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options

# --- Using headless Chrome (no visible browser window) ---
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# `options=` replaces the deprecated `chrome_options=` keyword and matches
# the second snippet below.
driver = webdriver.Chrome(r'chromedriver.exe', options=chrome_options)
try:
    driver.get('https://www.cnblogs.com/')
    print(driver.page_source)
finally:
    # A headless browser has no window to close by hand, and `driver` is
    # rebound below, so quit it here or the process is leaked.
    driver.quit()

# --- How to avoid Selenium being detected ---
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(r'chromedriver.exe', options=option)
driver.get('https://www.taobao.com/')
# NOTE(review): as in the original, this visible browser is left open after
# the page loads; call driver.quit() once it is no longer needed.