• Python crawlers: urllib


    #coding=utf-8    
    #urllib wrapper class
    
    import time
    import urllib.request
    import urllib.parse
    from urllib.error import HTTPError, URLError
    import sys
    class myUrllib:
    
    	@staticmethod
    	def get_headers(headers):
    		default_headers = {
    			'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    			#'Referer': r'http://www.baidu.com/',
    			'Connection': 'keep-alive',
    			'Cookie':'uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308'
    		}
    		#merge caller-supplied headers over the defaults
    		headers = dict(default_headers, **headers) if headers else default_headers
    		return headers
    
    
    	@staticmethod
    	def get(url,headers={}):
    		headers = myUrllib.get_headers(headers)
    		#data=urllib.parse.urlencode(query_data).encode('utf-8')
    		#r/R: raw (non-escaped) string
    		#u/U: unicode string
    		#b: bytes
    		url = r'%s' % url
    		request = urllib.request.Request(url, headers=headers, method='GET')
    		page = None  #so the function returns None instead of raising UnboundLocalError on failure
    		try:
    			html = urllib.request.urlopen(request).read()
    			page = html.decode('utf-8')
    		except HTTPError as e:
    			print(e.code, e.reason)
    		except URLError as e:
    			print(e.reason)
    		return page
    
    	@staticmethod
    	def post(url,data={},headers={}):
    		headers = myUrllib.get_headers(headers)
    		data=urllib.parse.urlencode(data)
    		binary_data=data.encode('utf-8')
    		url=r'%s'%url
    		request = urllib.request.Request(url, data=binary_data, headers=headers, method='POST')  #send the request with the form data
    		# response = urllib.request.urlopen(request)  #get the response
    		# data = response.read()  #read the response body
    		# data = data.decode('utf-8')
    		#print(data.encode('gb18030'))
    		#print(response.geturl())  #the actual URL that was fetched
    		#info(): returns an object holding the headers sent back by the remote server.
    		#getcode(): returns the HTTP status code; 200 means the request succeeded, 404 means the URL was not found.
    		#geturl(): returns the URL of the request.
    		
    		page = None  #same guard as in get(): return None when the request fails
    		try:
    			html = urllib.request.urlopen(request).read()
    			page = html.decode('utf-8')
    		except HTTPError as e:
    			print(e.code, e.reason)
    		except URLError as e:
    			print(e.reason)
    		return page
    
    		
    
    getInfo = myUrllib.get('http://localhost:88/test/c.php?act=category',{'Referer': r'https://www.baidu.com/'})
    print(getInfo)
    
    sys.exit()  #stop here; comment this out to run the POST example below
    
    postInfo = myUrllib.post('http://localhost:88/test/c.php',{'id':1010},{'Referer': r'https://www.baidu.com/'})
    print(postInfo)


    d:\pythoncrawler>python urllib01.py
    HTTP_HOST:
     localhost:88


    HTTP_USER_AGENT:
     Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
     Chrome/63.0.3239.108 Safari/537.36


    HTTP_COOKIE:
     uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308


    HTTP_REFERER:
     https://www.baidu.com/


    REQUEST_METHOD:
     GET


    GET DATA:
    array(1) {
      ["act"]=>
      string(8) "category"
    }
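
The commented-out notes inside post() mention info(), getcode() and geturl(). These are standard urllib.request response methods; a minimal sketch of them in action, assuming the same localhost test endpoint as above:

    #coding=utf-8
    import urllib.request
    
    response = urllib.request.urlopen(r'http://localhost:88/test/c.php?act=category')
    print(response.geturl())   #the actual URL that was fetched (after any redirects)
    print(response.getcode())  #HTTP status code, e.g. 200
    print(response.info())     #headers returned by the remote server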

    # Setting up a proxy

    #coding=utf-8
    import urllib.request
    import random
    from urllib.error import HTTPError, URLError
    
    def proxy_handler(url, iplist, wfile):
    	#ip = random.choice(iplist)  #alternatively, pick one proxy at random
    	for ip in iplist:
    		try:
    			print('*' * 20, '\n ip:', ip)
    			proxy_support = urllib.request.ProxyHandler({'http': ip})
    			opener = urllib.request.build_opener(proxy_support)
    			opener.addheaders = [('User-Agent', r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36')]
    			urllib.request.install_opener(opener)
    			response = urllib.request.urlopen(url)
    			code = response.getcode()
    			url = response.geturl()
    			print('*' * 20, '\n url:', url)
    			print('*' * 20, '\n code:', code)
    			info = response.info()
    			print('*' * 20, '\n info:', info)
    			if code == 200:
    				page = response.read()
    				#write the page to a file
    				page = str(page, encoding='utf-8')
    				with open(wfile, 'w', encoding='utf-8') as fw:
    					fw.write(page)
    				print('*' * 20, '\n write file:', wfile)
    				break
    		except HTTPError as e:
    			print(e.code, e.reason)
    			continue
    		except URLError as e:
    			print(e.reason)
    			continue
    	
    
    url = r'http://ip.chinaz.com/'
    iplist = ['182.42.244.169:808','122.72.18.34:80','52.44.16.168:3129']
    wfile = 'page.txt'
    proxy_handler(url,iplist,wfile)

    d:\pythoncrawler>python proxy01.py
    ********************
     ip: 182.42.244.169:808
    [WinError 10061] No connection could be made because the target machine actively refused it.
    ********************
     ip: 122.72.18.34:80
    ********************
     url: http://ip.chinaz.com/
    ********************
     code: 200
    ********************
     info: Cache-Control: private
    Content-Length: 33900
    Content-Type: text/html; charset=utf-8
    Server: Microsoft-IIS/7.5
    X-AspNet-Version: 4.0.30319
    Set-Cookie: qHistory=aHR0cDovL2lwLmNoaW5hei5jb20rSVAv5pyN5Yqh5Zmo5Zyw5Z2A5p+l6K
    i; domain=.chinaz.com; expires=Tue, 05-Feb-2019 15:03:42 GMT; path=/
    X-Powered-By: ASP.NET
    Date: Mon, 05 Feb 2018 15:03:42 GMT
    X-Cache: MISS from GD-SZ-WEB-01
    X-Cache-Lookup: MISS from GD-SZ-WEB-01:80
    Connection: close

    ********************
     write file: page.txt
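
In the run above, the first proxy simply blocks until Windows gives up on the connection. Both urlopen and opener.open accept a timeout argument, so a dead proxy can be made to fail after a few seconds instead; a sketch of that variation (not in the original script; the proxy IPs are the same sample addresses and are likely stale by now), which also uses a per-call opener rather than install_opener's global state:

    #coding=utf-8
    #Sketch: the same proxy loop, but failing fast on dead proxies
    import socket
    import urllib.request
    from urllib.error import URLError
    
    def fetch_via_proxy(url, iplist, timeout=5):
    	for ip in iplist:
    		opener = urllib.request.build_opener(
    			urllib.request.ProxyHandler({'http': ip}))
    		try:
    			#timeout bounds the blocking socket operations
    			response = opener.open(url, timeout=timeout)
    			if response.getcode() == 200:
    				return response.read().decode('utf-8')
    		except (URLError, socket.timeout) as e:
    			print(ip, 'failed:', e)
    	return None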



  • Original article: https://www.cnblogs.com/fonyer/p/8871447.html