#coding=utf-8
# urllib helper class

import urllib.request
import urllib.parse
from urllib.error import HTTPError, URLError
import sys


class myUrllib:

    @staticmethod
    def get_headers(headers):
        default_headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            #'Referer': r'http://www.baidu.com/',
            'Connection': 'keep-alive',
            'Cookie': 'uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308'
        }
        # merge caller-supplied headers over the defaults
        return dict(default_headers, **headers) if headers else default_headers

    @staticmethod
    def get(url, headers=None):
        headers = myUrllib.get_headers(headers)
        # string prefixes: r/R = raw (escapes not processed), u/U = unicode, b = bytes
        request = urllib.request.Request(url, headers=headers, method='GET')
        page = ''
        try:
            html = urllib.request.urlopen(request).read()
            page = html.decode('utf-8')
        except HTTPError as e:
            print(e.code, e.reason)
        except URLError as e:
            print(e.reason)
        return page

    @staticmethod
    def post(url, data=None, headers=None):
        headers = myUrllib.get_headers(headers)
        data = urllib.parse.urlencode(data or {})
        binary_data = data.encode('utf-8')
        # send the request with the form data as the body
        request = urllib.request.Request(url, data=binary_data, headers=headers, method='POST')
        # info(): an object holding the headers returned by the remote server
        # getcode(): the HTTP status code; 200 = request completed, 404 = URL not found
        # geturl(): the real URL that was ultimately fetched (after redirects)
        page = ''
        try:
            html = urllib.request.urlopen(request).read()
            page = html.decode('utf-8')
        except HTTPError as e:
            print(e.code, e.reason)
        except URLError as e:
            print(e.reason)
        return page


getInfo = myUrllib.get('http://localhost:88/test/c.php?act=category',
                       {'Referer': r'https://www.baidu.com/'})
print(getInfo)
sys.exit()  # stop here; remove this line to run the POST demo below

postInfo = myUrllib.post('http://localhost:88/test/c.php', {'id': 1010},
                         {'Referer': r'https://www.baidu.com/'})
print(postInfo)
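As a side note, urlencode() is the step that turns the POST dict into a form body; a quick sketch of what post() does with its data argument, using the same values as the demo call above:

import urllib.parse

data = urllib.parse.urlencode({'id': 1010})   # -> 'id=1010'
binary_data = data.encode('utf-8')            # -> b'id=1010', passed as Request(data=...)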
d:\pythoncrawler>python urllib01.py
HTTP_HOST:
localhost:88
HTTP_USER_AGENT:
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/63.0.3239.108 Safari/537.36
HTTP_COOKIE:
uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308
HTTP_REFERER:
https://www.baidu.com/
REQUEST_METHOD:
GET
GET DATA:
array(1) {
["act"]=>
string(8) "category"
}
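The wrapper returns only the decoded body, but the response object also carries the metadata noted in the comments above (getcode(), geturl(), info()). A minimal sketch of reading it directly, against the same local test endpoint; the timeout value is an addition here so a dead server fails fast:

#coding=utf-8
import urllib.request

# the local test endpoint from above is assumed; any reachable URL works
with urllib.request.urlopen('http://localhost:88/test/c.php?act=category', timeout=10) as response:
    print(response.getcode())  # HTTP status code, e.g. 200
    print(response.geturl())   # the real URL after any redirects
    print(response.info())     # the response headers
    page = response.read().decode('utf-8')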
# Set up a proxy
#coding=utf-8
import urllib.request
import random
from urllib.error import HTTPError, URLError


def proxy_handler(url, iplist, wfile):
    # ip = random.choice(iplist)  # alternative: pick a single proxy at random
    for ip in iplist:
        try:
            print('*' * 20, ' ip:', ip)
            proxy_support = urllib.request.ProxyHandler({'http': ip})
            opener = urllib.request.build_opener(proxy_support)
            opener.addheaders = [('User-Agent', r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36')]
            urllib.request.install_opener(opener)
            response = urllib.request.urlopen(url)
            code = response.getcode()
            url = response.geturl()
            print('*' * 20, ' url:', url)
            print('*' * 20, ' code:', code)
            info = response.info()
            print('*' * 20, ' info:', info)
            if code == 200:
                # write the fetched page to a file
                page = str(response.read(), encoding='utf-8')
                with open(wfile, 'w', encoding='utf-8') as fw:
                    fw.write(page)
                print('*' * 20, ' write file:', wfile)
                break
        except HTTPError as e:
            print(e.code, e.reason)
            continue
        except URLError as e:
            print(e.reason)
            continue


url = r'http://ip.chinaz.com/'
iplist = ['182.42.244.169:808', '122.72.18.34:80', '52.44.16.168:3129']
wfile = 'page.txt'
proxy_handler(url, iplist, wfile)

d:\pythoncrawler>python proxy01.py
********************
ip: 182.42.244.169:808
[WinError 10061] No connection could be made because the target machine actively refused it.
********************
ip: 122.72.18.34:80
********************
url: http://ip.chinaz.com/
********************
code: 200
********************
info: Cache-Control: private
Content-Length: 33900
Content-Type: text/html; charset=utf-8
Server: Microsoft-IIS/7.5
X-AspNet-Version: 4.0.30319
Set-Cookie: qHistory=aHR0cDovL2lwLmNoaW5hei5jb20rSVAv5pyN5Yqh5Zmo5Zyw5Z2A5p+l6K
i; domain=.chinaz.com; expires=Tue, 05-Feb-2019 15:03:42 GMT; path=/
X-Powered-By: ASP.NET
Date: Mon, 05 Feb 2018 15:03:42 GMT
X-Cache: MISS from GD-SZ-WEB-01
X-Cache-Lookup: MISS from GD-SZ-WEB-01:80
Connection: close
********************
write file: page.txt
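Note that install_opener() swaps the proxy-enabled opener in globally, so every later urllib.request.urlopen() call in the process goes through the proxy. If the proxy should apply to just one request, the opener can be used directly without installing it; a minimal sketch, reusing one of the proxy addresses above (whether it is still alive is of course not guaranteed):

#coding=utf-8
import urllib.request

proxy_support = urllib.request.ProxyHandler({'http': '122.72.18.34:80'})
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
# opener.open() routes only this request through the proxy;
# the global opener used by urllib.request.urlopen() is left untouched
response = opener.open('http://ip.chinaz.com/', timeout=10)
print(response.getcode())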