• 使用mitmproxy做今日头条爬虫链接分析


    import pickle
    
    import chardet
    from mitmproxy import ctx
    from pprint import pprint
    
    # Output file for request URLs that match the article/search filter.
    # In the code shown here it is referenced only from commented-out lines.
    heads_file = 'header.txt'
    
    # Output file for decoded response bodies.
    # In the code shown here it is referenced only from the commented-out
    # response() hook.
    body_file = 'body.txt'
    
    # Usage: mitmdump -s test.py
    # Sample User-Agent string (presumably from the client being inspected -- confirm):
    # Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/8.8.31)
    def request(flow):
        """Log the URL of an intercepted request and append it to ``other.txt``.

        Intended to be loaded as a mitmproxy script (``mitmdump -s test.py``),
        which calls this hook with each client request.

        Args:
            flow: The intercepted flow; only ``flow.request.url`` is read.
        """
        # Optional: override the User-Agent header before the request is forwarded.
        # flow.request.headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.7.1'

        # Debugging aids for inspecting the request object:
        # ctx.log.warn(str(flow.request.url))
        # ctx.log.info(str(flow.request.headers))
        # pprint(vars(flow.request))
        # ctx.log.error(str(dir(flow.request)))
        # ctx.log.info("data.content:" + str(flow.request.data.content))
        # ctx.log.info("data:" + str(dir(flow.request.data)))
        # ctx.log.info("content:" + str(flow.request.content))
        # ctx.log.info(flow.request.headers['User-Agent'])
        url = str(flow.request.url)
        ctx.log.info("url:" + url)

        # Optional: record only article/search URLs in heads_file.
        # if 'pstatp.com/article' in url or 'snssdk.com/article' in url or 'snssdk.com/api/search' in url:
        #     with open(heads_file, encoding="utf-8", mode="a") as file:
        #         file.write(url + "\n")

        # One URL per line; the context manager closes the file even if the
        # write fails.
        with open("other.txt", encoding="utf-8", mode="a") as fileother:
            fileother.write(url + "\n")

        # Alternative: pickle the URL instead (pickle.dump needs a binary-mode file).
        # with open(heads_file, 'ab') as handle:
        #     pickle.dump(flow.request.url, handle)
    
    
    # def response(flow):
    #      response = flow.response
    #      info = ctx.log.info
    #      info(str(response.status_code))
    #      info(str(response.headers))
    #      info(str(response.cookies))
    #      # info(str(response.encoding))
    #      detRes = chardet.detect(response.content)  # 返回编码结果
    #      charset = detRes["encoding"]
    #      info(str(charset))
    #      # text = response.content.decode(charset, "ignore")
    #      if not charset:
    #           charset = 'utf-8'
    #      text = str(response.content,encoding=charset)
    #      info(text)
    #      file = open(body_file,encoding=charset,mode="a")
    #      file.write(text)
    #      file.close()
         # with open(body_file, 'a') as handle:
         #      pickle.dump(text, handle)
  • 相关阅读:
    idea连接数据库和版本控制(Version Control)
    Idea新手入门-部署tomcat
    Redis 列表(List)
    Redis 集合(Set)
    Redis中的哈希(Hash)
    Redis初步整理
    C#中的集合之ArrayList
    linux中pip安装步骤与使用详解
    搭建 LAMP 环境
    搭建WordPress 个人博客
  • 原文地址:https://www.cnblogs.com/procedureMonkey/p/10320322.html
Copyright © 2020-2023  润新知