• 通过 pyppeteer 库获取请求所携带的相关参数


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    #--author: Baozi
    
    import asyncio
    from pyppeteer import launch
    import time
    import re
    
    
    # Module-level state shared by the functions in this script.
    # url_params: the captured ajax request URL; empty until intercept_response
    #             stores the first matching URL in it.
    # doc_id:     initialised empty; not read or written elsewhere in the
    #             visible code.
    url_params, doc_id = '', ''
    async def intercept_response(res):
        """Store the first matching Facebook ajax URL in the global ``url_params``.

        ``res`` is any object exposing a ``url`` attribute. A URL matches when
        it contains both ``__dyn`` and ``https://www.facebook.com/ajax/bz``.
        Only the first match is kept: once ``url_params`` is non-empty, later
        events are ignored. The stored URL is also printed.
        """
        global url_params
        candidate = res.url
        is_ajax_call = '__dyn' in candidate and 'https://www.facebook.com/ajax/bz' in candidate
        # Check the URL first and the already-captured flag second.
        if not is_ajax_call or url_params:
            return
        url_params = candidate
        print(url_params)
    
    
    async def request_check(req):
        """Filter requests: abort some resource types, let the rest continue.

        Requests whose ``resourceType`` is ``image``, ``media``,
        ``eventsource`` or ``websocket`` are aborted; every other request is
        continued unchanged.

        NOTE(review): nothing in the visible code registers this handler or
        enables request interception -- confirm whether it is still needed.
        """
        blocked_types = ['image', 'media', 'eventsource', 'websocket']
        if req.resourceType not in blocked_types:
            await req.continue_()
            return
        await req.abort()
    
    async def main(url,proxy,ua):
        """Load ``url`` in a proxied browser so the request listener can capture the ajax URL.

        Launches a non-headless browser through ``proxy``, opens a page with
        user agent ``ua`` and registers ``intercept_response`` on the page's
        ``request`` event. The page is reloaded up to three more times while
        the global ``url_params`` is still empty. Finally the comment link is
        clicked, if present, and the page is given a moment to fire requests.

        Args:
            url: Page to open.
            proxy: Value passed to Chromium's ``--proxy-server`` flag.
            ua: User-agent string set on the page.

        Raises:
            Whatever pyppeteer raises (for example a navigation timeout).
            The browser is closed before the exception propagates.
        """
        global url_params
        # Launch pyppeteer with a visible browser window.
        browser = await launch({
            'headless': False,
            'args': ['--proxy-server={}'.format(proxy), '--disable-infobars'],
        })
        try:
            # All waits use asyncio.sleep: a blocking time.sleep would freeze
            # the event loop, so no browser events (including the request
            # events we are waiting for) could be handled during the pause.
            await asyncio.sleep(10)
            page = await browser.newPage()
            page.on('request', intercept_response)

            # Set the User-Agent request header.
            await page.setUserAgent(ua)
            await page.goto(url, {'timeout': 1000 * 20})

            # Reload until the listener has captured a URL (three retries max).
            for _ in range(3):
                if url_params:
                    break
                await asyncio.sleep(10)
                await page.goto(url, {'timeout': 1000 * 20})

            comment_click = await page.xpath('//form[@rel="async"]//div[@class="_4vn1"]/span[@class="_4vn2"]/a')
            # The link may be absent (layout change, login wall); skip the
            # click instead of failing with IndexError.
            if comment_click:
                await comment_click[0].click()
                await asyncio.sleep(2.5)
        finally:
            # Always release the browser process, even on timeout or error.
            await browser.close()
    
    def get_url(url,proxy,user_agent):
        """Run the browser session and extract request parameters from the captured URL.

        Drives ``main(url, proxy, user_agent)`` to completion, then reads the
        global ``url_params`` and pulls the query parameters out of it.
        Values are returned exactly as they appear in the URL (not decoded).

        Args:
            url: Page to open in the browser.
            proxy: Proxy server address handed to the browser.
            user_agent: User-agent string for the page.

        Returns:
            dict mapping parameter name to its raw string value. ``__user``
            and ``__a`` are the fixed values ``'0'`` and ``'1'``; the rest
            come from the captured URL.

        Raises:
            IndexError: if a required parameter is missing from the captured
                URL, including the case where nothing was captured.
        """
        # Function-scope import so the file's import block needs no change.
        import logging

        try:
            asyncio.get_event_loop().run_until_complete(main(url,proxy,user_agent))
        except Exception:
            # Best effort: the URL may have been captured before the failure,
            # so keep going, but record the error instead of hiding it.
            logging.getLogger(__name__).exception('pyppeteer session for %s failed', url)

        # Read the global without modifying it. Appending '&' to the global
        # would turn an empty capture into the truthy '&' and block later
        # captures.
        captured = url_params

        extracted_keys = (
            '__dyn', '__csr', '__req', '__beoa', '__pc', 'dpr', '__ccg',
            '__rev', '__s', '__hsi', '__comet_req', 'lsd', 'jazoest',
            '__spin_r', '__spin_b', '__spin_t',
        )
        params = {'__user': '0', '__a': '1'}
        for key in extracted_keys:
            # Anchor on '?', '&' or start of string so a key cannot match the
            # tail of a longer parameter name; the value runs to the next '&'.
            found = re.search(r'(?:^|[?&])' + re.escape(key) + r'=([^&]*)', captured)
            if found is None:
                raise IndexError(
                    'parameter {!r} not found in captured URL {!r}'.format(key, captured)
                )
            params[key] = found.group(1)

        return params
    
    if __name__ == '__main__':
        # Demo run: fetch the request parameters for one public post through
        # a fixed proxy and print the resulting dict.
        target_url = 'https://www.facebook.com/news.hkcd/posts/2966706433454938'
        proxy_server = 'http://172.16.7.14:13512'
        browser_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3965.0 Safari/537.36'

        extracted = get_url(target_url, proxy_server, browser_ua)
        print(extracted)
    做一枚奔跑的老少年!
  • 相关阅读:
    构造函数的继承
    创建一个不被销毁的空间 闭包小应用
    如何在Linux上恢复误删除的文件或目录
    一文详解 Ansible 自动化运维
    Shell 脚本编程最佳实践
    10 分钟看懂 Docker 和 K8S!
    BGP路由协议详解(完整版)
    浅析 Linux 中的零拷贝技术
    2020年DevOps工程师入门指南
    一条更新的SQL如何执行
  • 原文地址:https://www.cnblogs.com/xiaoshayu520ly/p/13543210.html
Copyright © 2020-2023  润新知