• Pyppeteer 模块


    Pyppeteer 模块:

    安装:

    pip3 install pyppeteer 
    pip install tushare --upgrade
    

    案例:

    import asyncio
    from pyppeteer import launch
    from lxml import etree
    
    async def main():
        """Render the JS-driven quotes page and print how many quotes it holds.

        Launches a browser, loads http://quotes.toscrape.com/js/, parses the
        rendered page source with lxml and prints the number of
        ``<div class="quote">`` elements found.
        """
        browser = await launch()  # create a new browser object
        try:
            page = await browser.newPage()  # open a new tab in the browser
            # Navigate to the URL (the counterpart of selenium's get()).
            await page.goto('http://quotes.toscrape.com/js/')
            page_text = await page.content()  # page source after rendering
            tree = etree.HTML(page_text)
            div_list = tree.xpath('//div[@class="quote"]')
            print(len(div_list))
        finally:
            # Close the browser even when navigation or parsing fails, so that
            # no browser process is left behind.
            await browser.close()

    # asyncio.run() creates and closes the event loop itself; calling
    # asyncio.get_event_loop() when no loop is running is deprecated.
    asyncio.run(main())
    

    执行js程序:

    import asyncio
    from pyppeteer import launch
    # Viewport size applied to the page in main().
    width, height = 1366, 768

    async def main():
        """Open the Douban ranking page, scroll to the bottom and print evaluate()'s result."""
        browser = await launch(headless=False)
        try:
            page = await browser.newPage()
            await page.setViewport({'width': width, 'height': height})
            await page.goto('https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action=')
            await asyncio.sleep(3)

            # evaluate() hands back the value of the evaluated JS.
            # NOTE(review): window.scrollTo() itself has no return value, so
            # this presumably prints None - confirm.
            dimensions = await page.evaluate('window.scrollTo(0,document.body.scrollHeight)')
            await asyncio.sleep(3)
            print(dimensions)
        finally:
            # Always release the browser, including when a step above raises.
            await browser.close()

    # asyncio.run() replaces the deprecated
    # asyncio.get_event_loop().run_until_complete() pattern.
    asyncio.run(main())
    

    规避检测:

    import asyncio
    from pyppeteer import launch
    
    async def main():
        """Open the Taobao login page with navigator.webdriver reporting false.

        The page is kept open for 10 seconds and the browser is then closed.
        """
        browser = await launch(headless=False, args=['--disable-infobars'])
        try:
            page = await browser.newPage()
            # Register the override BEFORE navigating: evaluateOnNewDocument()
            # installs the script in every new document ahead of the page's own
            # scripts, whereas evaluate() after goto() only patches the page
            # once it has already loaded.
            await page.evaluateOnNewDocument('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
            await page.goto('https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')
            await asyncio.sleep(10)
        finally:
            # Close the browser explicitly instead of relying on interpreter exit.
            await browser.close()

    # asyncio.run() replaces the deprecated
    # asyncio.get_event_loop().run_until_complete() pattern.
    asyncio.run(main())
    

    UA伪装:

    # Set the User-Agent string used by this page; 'xxx' is a placeholder value.
    # NOTE(review): self.page is presumably a pyppeteer Page stored on an
    # enclosing class that is not shown here, and this statement must run
    # inside an async method - confirm against the surrounding code.
    await self.page.setUserAgent('xxx')
    
    节点交互:

    import asyncio
    from pyppeteer import launch
    async def main():
        """Search Baidu by typing into the query box, then click the fourth result tab."""
        # headless=False runs the browser with a visible window
        browser = await launch(headless=False)
        try:
            page = await browser.newPage()
            # set the size of the page viewport
            await page.setViewport(viewport={'width': 1280, 'height': 800})

            await page.goto('https://www.baidu.com/')
            # node interaction: type into the search box with a delay per key,
            # then click the search button
            await page.type('#kw','周杰伦',{'delay': 1000})
            await asyncio.sleep(3)
            await page.click('#su')
            await asyncio.sleep(3)
            # pick elements with a CSS selector and click one of them
            alist = await page.querySelectorAll('.s_tab_inner > a')
            # Raises IndexError when fewer than four tabs match; the finally
            # clause below still closes the browser in that case.
            a = alist[3]
            await a.click()
            await asyncio.sleep(3)
        finally:
            # Always release the browser, including when a step above raises.
            await browser.close()

    # asyncio.run() replaces the deprecated
    # asyncio.get_event_loop().run_until_complete() pattern.
    asyncio.run(main())
    

    爬取头条/网易:

    import asyncio
    from pyppeteer import launch
    from lxml import etree
    
    async def main():
        """Fetch the rendered sources of the Toutiao and NetEase news pages.

        Each page is opened in its own tab, scrolled to the bottom and given
        two seconds to load further content before its source is read.

        Returns:
            dict: the page sources under the keys 'wangyi' and 'toutiao'.
        """
        # headless=False runs the browser with a visible window
        browser = await launch(headless=False)
        try:
            page1 = await browser.newPage()

            # set the size of the page viewport
            await page1.setViewport(viewport={'width': 1280, 'height': 800})

            await page1.goto('https://www.toutiao.com/')
            await page1.evaluate('window.scrollTo(0,document.body.scrollHeight)')
            await asyncio.sleep(2)
            # grab the page source
            page_text = await page1.content()

            page2 = await browser.newPage()
            await page2.setViewport(viewport={'width': 1280, 'height': 800})
            await page2.goto('https://news.163.com/domestic/')
            await page2.evaluate('window.scrollTo(0,document.body.scrollHeight)')
            # Wait after scrolling, exactly as for page1, so that content
            # loaded by the scroll can make it into the source.
            await asyncio.sleep(2)
            page_text1 = await page2.content()
        finally:
            # Close the browser even when one of the pages fails to load.
            await browser.close()

        return {'wangyi':page_text1,'toutiao':page_text}
    
    def parse(task):
        """Done-callback for the crawl task: print the headline titles of both sites.

        Args:
            task: the finished task wrapping main(); its result is a dict
                holding the page sources under 'wangyi' and 'toutiao'.

        Note:
            task.result() re-raises the exception if main() failed.
        """
        content_dic = task.result()
        wangyi = content_dic['wangyi']
        toutiao = content_dic['toutiao']

        tree = etree.HTML(toutiao)
        a_list = tree.xpath('//div[@class="title-box"]/a')
        print("头条新闻爬取数量: ", len(a_list))
        for a in a_list:
            # An anchor without a direct text node yields an empty list;
            # skip it instead of failing with IndexError.
            texts = a.xpath('./text()')
            if texts:
                print('toutiao:',texts[0])

        tree = etree.HTML(wangyi)
        div_list = tree.xpath('//div[@class="data_row news_article clearfix "]')
        print("网易新闻爬取数量: ", len(div_list))
        for div in div_list:
            # Same guard for rows that carry no title link text.
            texts = div.xpath('.//div[@class="news_title"]/h3/a/text()')
            if texts:
                print('wangyi:',texts[0])
     
    # Create the event loop explicitly: asyncio.ensure_future() and
    # asyncio.get_event_loop() fall back to an implicit loop when none is
    # running, and that fallback is deprecated.
    # NOTE(review): the loop is deliberately left open, as in the original;
    # pyppeteer's exit-time cleanup presumably still needs it if the browser
    # was not closed - confirm.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    tasks = []
    task1 = loop.create_task(main())
    task1.add_done_callback(parse)  # parse() runs once main() has finished
    tasks.append(task1)

    loop.run_until_complete(asyncio.wait(tasks))
    
  • 相关阅读:
    flask虚拟环境
    db.Column
    flask_cors跨域请求
    app.config.from_object
    jquery链式原理.html
    swiper轮播
    jquery引用
    animate.html
    设置和获取html里面的内容.html
    jquery获取dom属性方法
  • 原文地址:https://www.cnblogs.com/shaozheng/p/12807686.html
Copyright © 2020-2023  润新知