一个相对完整的http请求,输入ip和端口,输出响应码,响应头,响应体,是否超时,以及出错时的错误信息
处理包括:
1.协议处理,如果是443用https,其他用http
2.HTTPError处理,HTTPError一般是401,403,404之类的错误,虽然报错,但是也有响应头。注意获取错误信息时要用str(e);repr(e)得到的只是对象的表示形式(Python 2下类似"HTTPError()"),不包含错误原因;e.read()是响应体,不是错误原因
3.URLError处理,一般是Connection refused之类的错误。注意获取错误信息时要用str(e.reason)
4.响应体gzip解压
5.响应体编码转换
# coding=utf8
"""Probe the home page of an ``ip:port`` endpoint over HTTP(S).

Given an IP and a port the module fetches ``/`` and reports the status code,
the response headers, the response body (gunzipped and re-encoded as UTF-8
when possible), the page title, whether the request timed out and, on
failure, a textual error reason.

The code runs on Python 2.7 and Python 3.  Bodies and titles are always
byte strings (``str`` on Python 2, ``bytes`` on Python 3).
"""
import gzip
import io
import re
import socket
import struct
import traceback
import zlib

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request exposes the same names
    import urllib.request as urllib2

try:
    import chardet
except ImportError:  # optional; only the last-resort encoding guess uses it
    chardet = None


USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# <meta charset=...> or <meta http-equiv=... content="...; charset=...">,
# with double-quoted, single-quoted or unquoted values, confined to one tag.
_META_CHARSET_RE = re.compile(
    br'<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([\w.:-]+)', re.I)
# charset parameter of a Content-Type header value, quotes optional.
_HEADER_CHARSET_RE = re.compile(
    r'charset\s*=\s*["\']?\s*([^\s;"\']+)', re.I)
# re.S lets the title span several lines; attributes on <title> are allowed.
_TITLE_RE = re.compile(br'<title(?:\s[^>]*)?>(.*?)</title>', re.I | re.S)
# Everything the gzip module raises for corrupt or truncated input across
# Python 2.7 and Python 3.
_GZIP_ERRORS = (IOError, OSError, EOFError, zlib.error, struct.error)


def plugin_homepage(data, timeout):
    """Fetch the home page of ``data["ip"]:data["port"]`` and describe it.

    Port 443 (given as an int or a numeric string) is fetched over https,
    every other port over http.  IPv6 literals are wrapped in brackets so the
    resulting URL is valid.

    Args:
        data: mapping with the keys ``"ip"`` and ``"port"``.
        timeout: socket timeout in seconds, passed through to ``get_html``.

    Returns:
        dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``.
    """
    ip = data["ip"]
    port = data["port"]

    scheme = "https" if str(port).strip() == "443" else "http"
    host = str(ip)
    if ":" in host and not host.startswith("["):
        # A bare IPv6 address would be ambiguous with the port separator.
        host = "[%s]" % host
    url = "%s://%s:%s/" % (scheme, host, port)

    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def get_html(url, timeout):
    """Request *url* and return a summary tuple of the outcome.

    HTTP error statuses (401, 403, 404, ...) still carry a status code,
    headers and a body, so they are processed like a normal response with
    ``error_reason`` set to ``str(error)``.  Connection-level failures
    (refused, DNS, timeout, ...) return early with only ``error_reason`` and
    ``is_timeout`` filled in (plus ``code`` if it was already received).

    Args:
        url: absolute URL to fetch.
        timeout: socket timeout in seconds.

    Returns:
        ``(is_timeout, error_reason, code, header, body, title)`` where
        ``header`` is the header block as text (or ``None``), ``body`` is the
        gunzipped body re-encoded as UTF-8 when its encoding could be
        determined, and ``title`` is the stripped ``<title>`` content or
        ``None``.
    """
    request_headers = {'User-Agent': USER_AGENT}
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None

    try:
        request = urllib2.Request(url, headers=request_headers)
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            code = response.getcode()
            body = response.read()
            # Keep the message object (not its text) so the header lookups
            # below are real, case-insensitive header lookups.
            header = response.headers
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # str(e) is the readable reason ("HTTP Error 404: Not Found");
        # e.read() is the response body, not the reason.
        error_reason = str(e)
        code = e.code
        header = e.headers
        try:
            body = e.read()
            e.close()
        except Exception:
            # The error body is best-effort; status and headers are known.
            traceback.print_exc()
    except urllib2.URLError as e:
        traceback.print_exc()
        error_reason = str(e.reason)
        is_timeout = _is_timeout(e.reason)
        return is_timeout, error_reason, code, header, body, title
    except Exception as e:
        # Includes a bare socket.timeout raised while reading the body.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = _is_timeout(e)
        return is_timeout, error_reason, code, header, body, title

    if not header:
        header_text = None if header is None else str(header)
        return is_timeout, error_reason, code, header_text, body, title

    if body:
        body = _gunzip(header, body)
        body = _to_utf8(header, body)
        title = _extract_title(body)

    return is_timeout, error_reason, code, str(header), body, title


def get_encode(header, body):
    """Guess the character encoding of an HTML document.

    Sources are tried in this order: a ``<meta>`` charset declaration in
    *body*, the ``charset`` parameter of the ``Content-Type`` header, and
    finally ``chardet`` (when that package is installed).

    Args:
        header: response headers; any object with a ``get`` method (message
            object or dict).  Other values (``None``, plain text) are ignored.
        body: document bytes; text is accepted and encoded as UTF-8 first.

    Returns:
        The encoding name as a native ``str``, or ``None`` if unknown.
    """
    if body is None:
        body = b''
    elif not isinstance(body, bytes):
        body = body.encode('utf-8')

    match = _META_CHARSET_RE.search(body)
    if match:
        return _native_str(match.group(1))

    content_type = header.get('Content-Type') if hasattr(header, 'get') else None
    if content_type:
        match = _HEADER_CHARSET_RE.search(_native_str(content_type))
        if match:
            return match.group(1)

    if chardet is None or not body:
        return None
    return chardet.detect(body)['encoding']


def _is_timeout(error):
    """Return True if *error* (exception or reason) denotes a timeout."""
    return isinstance(error, socket.timeout) or 'timed out' in str(error)


def _native_str(value):
    """Return *value* as the interpreter's native ``str`` type."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):  # Python 3 bytes
        return value.decode('ascii', 'replace')
    return value.encode('utf-8')  # Python 2 unicode


def _gunzip(header, body):
    """Return *body* decompressed if the headers declare gzip encoding."""
    content_encoding = _native_str(header.get('Content-Encoding') or '')
    if 'gzip' not in content_encoding.lower():
        return body
    try:
        with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
            return gz.read()
    except _GZIP_ERRORS:
        # A mislabelled or truncated body is still worth returning as-is.
        traceback.print_exc()
        return body


def _to_utf8(header, body):
    """Return *body* re-encoded as UTF-8, or unchanged if that fails."""
    encoding = get_encode(header, body)
    if not encoding:
        return body
    try:
        return body.decode(encoding.strip()).encode('utf-8')
    except Exception:
        # The charset name comes from the remote side; unknown names, wrong
        # declarations and (on Python 2) non-text codecs can raise many
        # different error types.  Conversion is best-effort by design.
        return body


def _extract_title(body):
    """Return the stripped content of the first ``<title>`` tag, or None."""
    match = _TITLE_RE.search(body)
    return match.group(1).strip() if match else None


if __name__ == "__main__":
    data = {"ip": "127.0.0.1", "port": 80}
    res = plugin_homepage(data, 3)
    print(res)