• python爬虫之趟雷


    python爬虫之趟雷整理

    雷一:URLError

      问题具体描述:urllib.error.URLError: <urlopen error [Errno 11004] getaddrinfo failed>

     1 import urllib.request
     2 
     3 
     4 def load_message():
     5     url = 'http://www.baidu.com'
     6 
     7     request = urllib.request.Request(url)
     8     response = urllib.request.urlopen(request)
     9     response_str = response.read().decode('utf-8')
    10 
    11     return response.headers, request.headers, response_str
    12 
    13 
    14 response_header, request_header, response_data = load_message()
    15 print(request_header)
    16 print('----------------------------------------')
    17 print(response_header)
    18 print('----------------------------------------')
    19 print(response_data)
    View Code

      分析:报错类型为URLError,其中 getaddrinfo failed 表示域名解析失败,简单来说,就是URL指向的资源无法访问。问题可能出在三个方向:URL本身、客户端、服务器。

      解决办法:第一点,检查URL书写是否正确;第二点,检查客户端网络连接状态;第三点,使用URL在浏览器地址栏访问验证服务器是否存在。

      问题具体描述:urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)>

     1 #!/usr/bin/env python
     2 # -*- coding=utf-8 -*-
     3 # Author: Snow
     4 
     5 
     6 import urllib.request
     7 
     8 
     9 def create_cookie():
    10     url = 'https://www.yaozh.com/member/'
    11     headers = {
    12         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    13                       'Chrome/69.0.3497.92 Safari/537.36',
    14         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    15     }
    16 
    17     request = urllib.request.Request(url, headers=headers)
    18     response = urllib.request.urlopen(request)
    19     response_data = response.read().decode('utf-8')
    20 
    21     return response_data
    22 
    23 
    24 result = create_cookie()
    25 with open('cookies.html', 'w', encoding='utf-8') as f:
    26     f.write(result)
    View Code

      分析:问题产生的原因是,Python使用urllib.request.urlopen()打开https链接时需要验证SSL证书;如果网站使用自签名证书,或者本地找不到签发该证书的CA证书(即报错信息中的 unable to get local issuer certificate),就会抛出该异常。

      解决办法:第一点,使用ssl模块创建不验证证书的context上下文,并传入urlopen()的context参数(只影响本次请求);第二点,替换默认的https上下文,全局取消证书验证。注意:两种办法都会跳过证书校验,存在被中间人攻击的风险,只适合学习或调试时使用。

     1 #!/usr/bin/env python
     2 # -*- coding=utf-8 -*-
     3 # Author: Snow
     4 
     5 
     6 import urllib.request
     7 import ssl    #导入ssl模块
     8 
     9 
    10 def create_cookie():
    11     url = 'https://www.yaozh.com/member/'
    12     headers = {
    13         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    14                       'Chrome/69.0.3497.92 Safari/537.36',
    15         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    16     }
    17 
    18     context = ssl._create_unverified_context()  # 创建不验证证书的SSL上下文
    19 
    20     request = urllib.request.Request(url, headers=headers)
    21     response = urllib.request.urlopen(request, context=context)  # 传入context参数
    22     response_data = response.read().decode('utf-8')
    23 
    24     return response_data
    25 
    26 
    27 result = create_cookie()
    28 with open('cookies.html', 'w', encoding='utf-8') as f:
    29     f.write(result)
    View Code
     1 #!/usr/bin/env python
     2 # -*- coding=utf-8 -*-
     3 # Author: Snow
     4 
     5 
     6 import urllib.request
     7 import ssl
     8 
     9 
    10 def create_cookie():
    11     url = 'https://www.yaozh.com/member/'
    12     headers = {
    13         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    14                       'Chrome/69.0.3497.92 Safari/537.36',
    15         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
    16     }
    17 
    18     ssl._create_default_https_context = ssl._create_unverified_context  # 缺省context参数不做验证,取消验证ssl证书
    19 
    20     request = urllib.request.Request(url, headers=headers)
    21     response = urllib.request.urlopen(request)
    22     response_data = response.read().decode('utf-8')
    23 
    24     return response_data
    25 
    26 
    27 result = create_cookie()
    28 with open('cookies.html', 'w', encoding='utf-8') as f:
    29     f.write(result)
    View Code

    雷二:HTTPError

      问题具体描述:urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable

     1 #!/usr/bin/env python
     2 # -*- coding=utf-8 -*-
     3 # Author: Snow
     4 
     5 import urllib.request
     6 
     7 
     8 def fee_proxy():
     9     url = 'https://www.xicidaili.com/nn/'
    10 
    11     # 付费代理IP第一种方式
    12     # proxy_1 = {
    13     #     'http': 'user_name:password@121.61.1.222:9999'
    14     # }
    15 
    16     # 付费代理IP第二种方式
    17     user_name = 'admin'
    18     password = '123456'
    19     proxy_ip = '121.61.1.222:9999'
    20     proxy_manage = urllib.request.HTTPPasswordMgrWithDefaultRealm()  # 密码管理器
    21     proxy_manage.add_password(None, proxy_ip, user_name, password)
    22 
    23     # proxy_handler = urllib.request.ProxyHandler(proxy_1)
    24     proxy_handler = urllib.request.ProxyBasicAuthHandler(proxy_manage)  # 代理IP验证处理器
    25     proxy_openner = urllib.request.build_opener(proxy_handler)
    26 
    27     response = proxy_openner.open(url)
    28     response_str = response.read().decode('utf-8')
    29 
    30     return response_str
    31 
    32 
    33 data = fee_proxy()
    34 print(data)
    View Code

      分析

      解决办法

  • 相关阅读:
    cogs 775. 山海经
    [HZOI 2016][Tyvj 1729]文艺平衡树 这道题我真是哭了,调了一下午,一晚上
    几种平衡树
    bzoj1124 [POI2008]枪战Maf
    [Usaco2007 Open]Fliptile 翻格子游戏
    团队冲刺08
    团队冲刺07
    团队冲刺06
    团队冲刺05
    团队冲刺04
  • 原文地址:https://www.cnblogs.com/snow-lanuage/p/10361844.html
Copyright © 2020-2023  润新知