Pyppeteer 模块:
安装:
pip3 install pyppeteer
pip3 install tushare --upgrade
案例:
import asyncio
from pyppeteer import launch
from lxml import etree
async def main():
    """Load the JS-rendered quotes page and print how many quotes it contains.

    The page is opened with the default launch options, its rendered source
    is parsed with lxml, and the number of ``div.quote`` elements is printed.
    The browser is always closed, even when navigation or parsing fails.
    """
    browser = await launch()  # create a new browser object
    try:
        page = await browser.newPage()  # open a new tab in the browser
        # Enter the URL in the tab (the counterpart of selenium's get()).
        await page.goto('http://quotes.toscrape.com/js/')
        page_text = await page.content()  # .content() returns the page source
        tree = etree.HTML(page_text)
        div_list = tree.xpath('//div[@class="quote"]')
        print(len(div_list))
    finally:
        # Close the browser on every path so a failure above does not leave
        # it running.
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())
执行js程序:
import asyncio
from pyppeteer import launch
# Viewport size applied to the tab opened in main().
width, height = 1366, 768


async def main():
    """Open the Douban ranking page, scroll to the bottom via JS and print the result.

    Runs with a visible browser window. The value printed is whatever
    ``page.evaluate`` returns for the scroll expression. The browser is always
    closed, even when a step fails.
    """
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        await page.setViewport({'width': width, 'height': height})
        await page.goto('https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action=')
        await asyncio.sleep(3)
        # evaluate() hands back the return value of the JS it runs.
        # NOTE(review): window.scrollTo itself yields no value, so this is
        # expected to print None.
        dimensions = await page.evaluate('window.scrollTo(0,document.body.scrollHeight)')
        await asyncio.sleep(3)
        print(dimensions)
    finally:
        # Close the browser on every path so a failure above does not leave
        # it running.
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())
规避检测:
import asyncio
from pyppeteer import launch
async def main():
    """Open the Taobao login page with ``navigator.webdriver`` reporting false.

    The override is registered with ``evaluateOnNewDocument`` *before*
    navigating, so it is in place when the page's own scripts run. Injecting
    it with ``evaluate`` after ``goto`` would only affect checks made after
    the page had already loaded. The window stays open for 10 seconds and the
    browser is then closed, even if a step fails.
    """
    browser = await launch(headless=False, args=['--disable-infobars'])
    try:
        page = await browser.newPage()
        # Must precede goto(): the script is installed for every new document
        # the page loads from here on.
        await page.evaluateOnNewDocument('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
        await page.goto('https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')
        await asyncio.sleep(10)
    finally:
        # The original never closed the browser; release it on every path.
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())
UA伪装:
# Spoof the User-Agent used by this page; 'xxx' is a placeholder for a real UA string.
await self.page.setUserAgent('xxx')
节点交互:
import asyncio
from pyppeteer import launch
async def main():
    """Drive the Baidu home page: type a query, search, then click a result tab.

    Runs with a visible browser window. The browser is always closed, even
    when an element is missing or a step raises.
    """
    # headless=False switches the browser to headed (visible) mode.
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        # Set the size of the page viewport.
        await page.setViewport(viewport={'width': 1280, 'height': 800})
        await page.goto('https://www.baidu.com/')
        # Node interaction: type into the search box, one keystroke per delay.
        await page.type('#kw', '周杰伦', {'delay': 1000})
        await asyncio.sleep(3)
        await page.click('#su')
        await asyncio.sleep(3)
        # Select elements with a CSS selector and click the fourth match.
        alist = await page.querySelectorAll('.s_tab_inner > a')
        a = alist[3]
        await a.click()
        await asyncio.sleep(3)
    finally:
        # Close the browser on every path so a failure above (e.g. fewer than
        # four tabs matched) does not leave it running.
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())
爬取头条/网易:
import asyncio
from pyppeteer import launch
from lxml import etree
async def main():
    """Fetch the rendered source of the Toutiao and NetEase news pages.

    Each site is opened in its own tab and scrolled to the bottom before its
    source is read. The browser is always closed, even when a step fails.

    Returns:
        A dict with the NetEase page source under ``'wangyi'`` and the
        Toutiao page source under ``'toutiao'``.
    """
    # headless=False switches the browser to headed (visible) mode.
    browser = await launch(headless=False)
    try:
        page1 = await browser.newPage()
        # Set the size of the page viewport.
        await page1.setViewport(viewport={'width': 1280, 'height': 800})
        await page1.goto('https://www.toutiao.com/')
        await page1.evaluate('window.scrollTo(0,document.body.scrollHeight)')
        await asyncio.sleep(2)
        # Grab the page source after the scroll.
        page_text = await page1.content()

        page2 = await browser.newPage()
        await page2.setViewport(viewport={'width': 1280, 'height': 800})
        await page2.goto('https://news.163.com/domestic/')
        await page2.evaluate('window.scrollTo(0,document.body.scrollHeight)')
        page_text1 = await page2.content()
    finally:
        # Close the browser on every path so a failure above does not leave
        # it running.
        await browser.close()
    return {'wangyi': page_text1, 'toutiao': page_text}
def parse(task):
    """Done-callback: print the headlines found in both fetched pages.

    Args:
        task: The finished task; its result must be a dict holding the page
            sources under the keys ``'wangyi'`` and ``'toutiao'``.

    Nodes that have no text child are skipped rather than raising IndexError,
    so one malformed entry does not abort the rest of the output.
    """
    content_dic = task.result()
    wangyi = content_dic['wangyi']
    toutiao = content_dic['toutiao']

    tree = etree.HTML(toutiao)
    a_list = tree.xpath('//div[@class="title-box"]/a')
    print("头条新闻爬取数量: ", len(a_list))
    for a in a_list:
        texts = a.xpath('./text()')
        if not texts:
            # Anchor without a direct text node: nothing to print.
            continue
        print('toutiao:', texts[0])

    tree = etree.HTML(wangyi)
    div_list = tree.xpath('//div[@class="data_row news_article clearfix "]')
    print("网易新闻爬取数量: ", len(div_list))
    for div in div_list:
        texts = div.xpath('.//div[@class="news_title"]/h3/a/text()')
        if not texts:
            # Article row without a title link text: nothing to print.
            continue
        print('wangyi:', texts[0])
# Schedule the crawl, attach the parser as its completion callback, and run
# the event loop until the task has finished.
task1 = asyncio.ensure_future(main())
task1.add_done_callback(parse)
tasks = [task1]
asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks))