python爬虫之趟雷整理
雷一:URLError
问题具体描述:urllib.error.URLError: <urlopen error [Errno 11004] getaddrinfo failed>
import urllib.request


def load_message():
    """Fetch the Baidu homepage over plain HTTP.

    Returns a 3-tuple: (response headers, request headers, decoded body).
    Raises urllib.error.URLError when the URL cannot be resolved/reached
    (the Errno 11004 "getaddrinfo failed" case discussed above).
    """
    url = 'http://www.baidu.com'

    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    body = resp.read().decode('utf-8')

    return resp.headers, req.headers, body


response_header, request_header, response_data = load_message()
print(request_header)
print('----------------------------------------')
print(response_header)
print('----------------------------------------')
print(response_data)
分析:报错原因为URLError,产生原因为URL,简单来说,就是URL资源无法访问或者访问不了。具体问题出在三个方向,URL本身,客户端,服务器。
解决办法:第一点,检查URL书写是否正确;第二点,检查客户端网络连接状态;第三点,使用URL在浏览器地址栏访问验证服务器是否存在。
问题具体描述:urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)>
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# Author: Snow


import urllib.request


def create_cookie():
    """Fetch a logged-in member page by replaying a captured session Cookie.

    Returns the decoded HTML body. With an HTTPS URL this may raise
    urllib.error.URLError (CERTIFICATE_VERIFY_FAILED) — see the analysis
    in the surrounding post.
    """
    url = 'https://www.yaozh.com/member/'
    headers = {
        # Fixed: the original post broke this literal across two lines
        # (a syntax error) and dropped the ")" after "like Gecko".
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        # Session cookie captured from a logged-in browser session.
        'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    }

    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    response_data = response.read().decode('utf-8')

    return response_data


result = create_cookie()
with open('cookies.html', 'w', encoding='utf-8') as f:
    f.write(result)
分析:问题产生原因python使用urllib.request,urlopen()打开https链接时,需要验证SSL证书,如果网站使用自签名的证书会抛出异常。
解决办法:第一点,使用SSL创建context验证上下文,传入urlopen()中context上下文参数;第二点,取消证书验证。
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# Author: Snow


import urllib.request
import ssl  # needed to build a custom SSL context


def create_cookie():
    """Fetch a logged-in member page over HTTPS, skipping certificate checks.

    Passes an *unverified* SSL context to urlopen() so that a self-signed
    or untrusted certificate no longer raises CERTIFICATE_VERIFY_FAILED.
    Returns the decoded HTML body.
    """
    url = 'https://www.yaozh.com/member/'
    headers = {
        # Fixed: the original post broke this literal across two lines
        # (a syntax error) and dropped the ")" after "like Gecko".
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        # Session cookie captured from a logged-in browser session.
        'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    }

    # Create an UNVERIFIED SSL context (the original comment wrongly said
    # "verified"): certificate validation is disabled — do not use against
    # untrusted networks.
    context = ssl._create_unverified_context()

    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request, context=context)  # pass the context in
    response_data = response.read().decode('utf-8')

    return response_data


result = create_cookie()
with open('cookies.html', 'w', encoding='utf-8') as f:
    f.write(result)
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# Author: Snow


import urllib.request
import ssl


def create_cookie():
    """Fetch a logged-in member page over HTTPS with cert verification off.

    Globally replaces the default HTTPS context factory with the
    unverified one, so every subsequent urlopen() skips certificate
    validation. Returns the decoded HTML body.
    """
    url = 'https://www.yaozh.com/member/'
    headers = {
        # Fixed: the original post broke this literal across two lines
        # (a syntax error) and dropped the ")" after "like Gecko".
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        # Session cookie captured from a logged-in browser session.
        'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    }

    # Make the default HTTPS context unverified: urlopen() without an
    # explicit context= will no longer validate SSL certificates.
    # Process-wide side effect — affects ALL later HTTPS requests.
    ssl._create_default_https_context = ssl._create_unverified_context

    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    response_data = response.read().decode('utf-8')

    return response_data


result = create_cookie()
with open('cookies.html', 'w', encoding='utf-8') as f:
    f.write(result)
雷二:HTTPError
问题具体描述:urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# Author: Snow

import urllib.request


def fee_proxy():
    """Fetch a page through an authenticated (paid) proxy and return the body.

    Demonstrates the two urllib ways of supplying proxy credentials.
    May raise urllib.error.HTTPError (e.g. 503) when the proxy or the
    target site rejects the request.
    """
    url = 'https://www.xicidaili.com/nn/'

    # Paid-proxy option 1: credentials embedded in the proxy URL.
    # Fixed: the scheme prefix is required by ProxyHandler —
    # the format is 'http://user:password@host:port' (the original
    # example omitted 'http://' and misspelled "password").
    # proxy_1 = {
    #     'http': 'http://user_name:password@121.61.1.222:9999'
    # }

    # Paid-proxy option 2: a password manager plus an auth handler.
    user_name = 'admin'
    password = '123456'
    proxy_ip = '121.61.1.222:9999'
    proxy_manage = urllib.request.HTTPPasswordMgrWithDefaultRealm()  # password manager
    # realm=None: these credentials match any authentication realm.
    proxy_manage.add_password(None, proxy_ip, user_name, password)

    # proxy_handler = urllib.request.ProxyHandler(proxy_1)
    proxy_handler = urllib.request.ProxyBasicAuthHandler(proxy_manage)  # proxy auth handler
    proxy_opener = urllib.request.build_opener(proxy_handler)  # fixed "openner" typo

    response = proxy_opener.open(url)
    response_str = response.read().decode('utf-8')

    return response_str


data = fee_proxy()
print(data)
分析:HTTP Error 503 表示服务器暂时无法处理请求。在爬虫场景下,常见原因有:代理IP已失效或用户名、密码、端口填写错误,代理认证失败;目标网站识别出爬虫特征(如缺少 User-Agent 等请求头)而主动拒绝服务;请求频率过高触发了反爬限制。
解决办法:第一点,更换可用的代理IP,并核对用户名、密码与端口是否正确;第二点,在请求中补充 User-Agent 等请求头模拟浏览器访问;第三点,降低请求频率或加入重试机制,稍后再试。