• 【python】获取http响应


    一个相对完整的http请求,输入ip和端口,输出响应码,响应头,响应体,是否超时,以及出错时的错误信息

    处理包括:

    1.协议处理,如果是443用https,其他用http

    2.HTTPError处理,HTTPError一般是401,403,404之类的错误,虽然报错,但是也有响应头。注意获取错误信息时要用str(e);repr(e)得到的是异常对象的表示形式,而不是可读的错误信息;e.read()得到的是响应体,也不是错误原因

    3.URLError处理,一般是Connection refused之类的错误。注意获取错误信息时要用str(e.reason)

    4.响应体gzip解压

    5.响应体编码转换

    # coding=utf8
    
    import urllib2
    import chardet
    import traceback
    import StringIO
    import re
    import gzip
    
    
    def plugin_homepage(data, timeout):
        """Request the home page of ``ip:port`` and collect the response details.

        Args:
            data: mapping with the keys ``"ip"`` and ``"port"``.
            timeout: timeout passed through to ``get_html``.

        Returns:
            dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
            ``code``, ``title``, ``is_timeout`` and ``error_reason``.  ``ip`` and
            ``port`` are echoed back exactly as they were given.
        """
        ip = data["ip"]
        port = data["port"]
        # Compare as text so that both 443 and "443" select https.
        scheme = "https" if str(port) == "443" else "http"
        # An IPv6 literal must be wrapped in brackets inside a URL, otherwise
        # its colons are indistinguishable from the port separator.
        host = ip
        if ":" in str(ip) and not str(ip).startswith("["):
            host = "[%s]" % ip
        url = "%s://%s:%s/" % (scheme, host, port)
        is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
        return {
            "ip": ip,
            "port": port,
            "rsp_header": header,
            "rsp_body": body,
            "code": code,
            "title": title,
            "is_timeout": is_timeout,
            "error_reason": error_reason,
        }
    
    
    def get_html(url, timeout):
        """Fetch ``url`` and return status, headers, body and page title.

        Args:
            url: URL to request.
            timeout: timeout handed to ``urllib2.urlopen``.

        Returns:
            A 6-tuple ``(is_timeout, error_reason, code, header, body, title)``:

            * ``is_timeout`` - True when the request failed with a timeout.
            * ``error_reason`` - error text, or None when no error occurred.
            * ``code`` - HTTP status code (also set for HTTPError responses),
              or None when no response was received.
            * ``header`` - response headers as a string, or None.
            * ``body`` - response body, gunzipped and re-encoded to UTF-8 when
              the encoding could be determined; None when nothing was read.
            * ``title`` - content of the ``<title>`` element, or None.
        """
        import socket
        import zlib

        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        is_timeout = False
        error_reason = None
        code = None
        header = None
        body = None
        title = None
        response = None
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=timeout)
            code = response.getcode()
            body = response.read()
            # Keep the header *object* for now: the lookups below need
            # mapping access, which a plain string does not offer.
            header = response.headers
        except urllib2.HTTPError as e:
            # HTTP-level errors (401, 403, 404, ...) still carry a status code,
            # headers and a body.  str(e) is the error text; e.read() is the
            # response body, not the reason.
            error_reason = str(e)
            code = e.code
            body = e.read()
            header = e.headers
        except urllib2.URLError as e:
            # Connection-level failures such as "Connection refused".
            traceback.print_exc()
            error_reason = str(e.reason)
            is_timeout = (isinstance(e.reason, socket.timeout)
                          or error_reason == "timed out")
            return is_timeout, error_reason, code, header, body, title
        except Exception as e:
            traceback.print_exc()
            error_reason = str(e)
            # A timeout while reading the body surfaces as a bare
            # socket.timeout instead of being wrapped in URLError.
            is_timeout = isinstance(e, socket.timeout)
            return is_timeout, error_reason, code, header, body, title
        finally:
            if response is not None:
                response.close()

        if not header:
            header_text = str(header) if header is not None else None
            return is_timeout, error_reason, code, header_text, body, title

        # Decompress a gzip-encoded body; on failure keep the raw bytes.
        if 'gzip' in (header.get('Content-Encoding') or '').lower():
            try:
                body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
            except (IOError, EOFError, zlib.error):
                pass

        # Convert the body to UTF-8 (best effort).
        try:
            html_encode = (get_encode(header, body) or '').strip()
            # Implausibly long "encoding names" are regex mis-captures.
            if html_encode and len(html_encode) < 12:
                body = body.decode(html_encode).encode('utf-8')
        except (LookupError, UnicodeError, TypeError, AttributeError):
            # Unknown codec or undecodable bytes: leave the body untouched.
            pass

        # Extract the page title; re.S lets the title span several lines.
        if body:
            try:
                found = re.search(r'<title[^>]*>(.*?)</title>', body,
                                  flags=re.I | re.S)
            except TypeError:
                found = None
            if found:
                title = found.group(1).strip()
        return is_timeout, error_reason, code, str(header), body, title
    
    
    # Determine the character encoding of an HTML response
    def get_encode(header, body):
        """Guess the character encoding of ``body``.

        The sources are tried in this order:

        1. a ``charset`` declared in a ``<meta>`` tag of the body,
        2. the ``charset`` parameter of the ``Content-Type`` response header,
        3. detection with ``chardet``.

        Args:
            header: response headers, either a mapping-like object offering
                ``get`` or the raw header text; None skips the header lookup.
            body: response body.

        Returns:
            The encoding name, or None when it could not be determined.
        """
        charset_re = r'charset\s*=\s*["\']?\s*([\w.:-]+)'

        # 1. <meta charset=...> or <meta http-equiv=... content="...; charset=...">,
        #    quoted with either quote style or not quoted at all.
        try:
            m = re.search(r'<meta[^>]*?' + charset_re, body, flags=re.I)
        except TypeError:
            m = None
        if m:
            return m.group(1)

        # 2. Content-Type header.
        content_type = None
        if header is not None:
            if hasattr(header, 'get'):
                content_type = header.get('Content-Type')
            else:
                # Header given as raw text: pick out the Content-Type line.
                try:
                    line = re.search(r'^content-type\s*:(.*)$', header,
                                     flags=re.I | re.M)
                except TypeError:
                    line = None
                if line:
                    content_type = line.group(1)
        if content_type:
            m = re.search(charset_re, content_type, flags=re.I)
            if m:
                return m.group(1)

        # 3. Fall back to statistical detection; nothing to detect in an
        #    empty body.
        if not body:
            return None
        detected = chardet.detect(body)
        return detected['encoding']
    
    
    if __name__ == "__main__":
        # Manual smoke test: fetch the home page served on localhost port 80
        # with a 3 second timeout and print the collected result.
        data = {"ip": "127.0.0.1", "port": 80}
        res = plugin_homepage(data, timeout=3)
        print(res)
  • 相关阅读:
    Gym 100553B Burrito King 无脑背包
    BestCoder Round #85 A B C
    poj 1687 Buggy Sat 简单计算几何
    HDU 1863 Kruskal求最小生成树
    记2016商大ACM省赛
    COMP9517 Week7 Tracking
    COMP9517 week7 Motion
    COMP9313 week7b Spark SQL
    COMP9313 Week 7 Product Quantization and K-Means Clustering
    COMP9517 lab3 image segementation
  • 原文地址:https://www.cnblogs.com/dplearning/p/7641828.html
Copyright © 2020-2023  润新知