用urllib2抓取被限制的网站页面
-
# coding:utf-8
# Request the page with urllib2's default headers and print the response body.
# urllib2.urlopen raises urllib2.HTTPError for HTTP error statuses, so a 403
# answer surfaces as an exception before anything is printed.
import urllib2

target = "http://blog.csdn.net/troubleshooter"
reply = urllib2.urlopen(target)
# Single-argument parenthesised print behaves the same as the print statement.
print(reply.read())
运行后返回 403 错误（Forbidden）：使用 urllib2 默认请求头发出的请求很可能被站点拒绝。
- 模拟用户访问
# coding:utf-8
# Request the page again, this time sending browser-like headers, and print
# the HTTP status code of the response.
import urllib2

url = "http://blog.csdn.net/troubleshooter"

# Header fields sent with the request. The former 'GET': url entry was dropped:
# the method and target belong to the request line, which urllib2 builds
# itself, so that entry only produced a meaningless "Get:" header field.
url_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
    'Referer': 'http://www.cnblogs.com/evilxr/p/4038902.html',
    'Host': 'blog.csdn.net',
}

req = urllib2.Request(url, headers=url_headers)
html = urllib2.urlopen(req)
# Single-argument parenthesised print behaves the same as the print statement.
print(html.getcode())
200 [Finished in 0.4s]
-
获取Cookie信息
# Collect the cookies a site sets by routing the request through an opener
# that carries an HTTPCookieProcessor bound to a CookieJar.
import urllib2
import cookielib

cookie = cookielib.CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_handler)
response = opener.open('http://www.baidu.com')

# Print one "name value" line per cookie stored in the jar.
for entry in cookie:
    print("%s %s" % (entry.name, entry.value))

# Output recorded in the original interactive session:
# BAIDUID 4722B044786BAE8B1E484C0535706271:FG=1
# BIDUPSID 4722B044786BAE8B1E484C0535706271
# H_PS_PSSID 10299_16540_1430_16474_12824_10812_12868_14669_16520_16326_16662_16424_16514_15050_12386_13932
# PSTM 1438398244
# BDSVRTM 0
# BD_HOME 0
-
打开调试功能
# Turn on urllib2's wire-level debug output: handlers created with
# debuglevel=1 print the request sent and the reply/headers received.
import urllib2

httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)

# install_opener makes this opener the one used by plain urllib2.urlopen().
urllib2.install_opener(opener)

# Issued once: the recorded trace below covers a single exchange, so the
# repeated urlopen line (an echo of the session transcript) was removed.
response = urllib2.urlopen('http://www.baidu.com')

# Debug trace recorded in the original session:
# send: 'GET / HTTP/1.1 Accept-Encoding: identity Host: www.baidu.com Connection: close User-Agent: Python-urllib/2.7 '
# reply: 'HTTP/1.1 200 OK '
# header: Date: Sat, 01 Aug 2014 03:14:07 GMT
# header: Content-Type: text/html; charset=utf-8
# header: Transfer-Encoding: chunked
# header: Connection: Close
# header: Vary: Accept-Encoding
# header: Set-Cookie: BAIDUID=0E3FD673DED07D3DBB4D6048AB469A32:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
# header: Set-Cookie: BIDUPSID=0E3FD673DED07D3DBB4D6048AB469A32; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
# header: Set-Cookie: PSTM=1438398847; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
# header: Set-Cookie: BDSVRTM=0; path=/
# header: Set-Cookie: BD_HOME=0; path=/
# header: Set-Cookie: H_PS_PSSID=13289_1441_10813_14432_12867_14667_16521_14951_16663_16427_16514_15291_12315_13932_10634; path=/; domain=.baidu.com
# header: P3P: CP=" OTI DSP COR IVA OUR IND COM "
# header: Cache-Control: private
# header: Cxy_all: baidu+d4d7821ea11368a1cad938a4de84b7ab
# header: Expires: Sat, 01 Aug 2015 03:13:12 GMT
# header: X-Powered-By: HPHP
# header: Server: BWS/1.1
# header: X-UA-Compatible: IE=Edge,chrome=1
# header: BDPAGETYPE: 1
# header: BDQID: 0x8824b3dc0001bdbb
# header: BDUSERID: 0