• Step by Step of "Web scraping with Python" ----Richard Lawson ---3/n


    When running the sample code "link_crawler3.py", it always fails with the message below:

    /usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
    Downloading:http://example.webscraping.com
    Downloading--2
    Downloading:http://example.webscraping.com
    Downloading --- 5
    {'User-agent': {'User-agent': 'GoodCrawler'}}
    http://example.webscraping.com
    Traceback (most recent call last):
      File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 150, in <module>
        link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
      File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 36, in link_crawler
        html = download(url, headers, proxy=proxy, num_retries=num_retries)
      File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/common.py", line 75, in download5
        htmlrsp = opener.open(requestnew)
      File "/usr/lib/python3.5/urllib/request.py", line 466, in open
        response = self._open(req, data)
      File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
        '_open', req)
      File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
        result = func(*args)
      File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
        return self.do_open(http.client.HTTPConnection, req)
      File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
        h.request(req.get_method(), req.selector, req.data, headers)
      File "/usr/lib/python3.5/http/client.py", line 1107, in request
        self._send_request(method, url, body, headers)
      File "/usr/lib/python3.5/http/client.py", line 1147, in _send_request
        self.putheader(hdr, value)
      File "/usr/lib/python3.5/http/client.py", line 1083, in putheader
        if _is_illegal_header_value(values[i]):
    TypeError: expected string or bytes-like object
    

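      I have searched the internet several times, and the code of download5 itself looks right (note, however, that the headers printed above are nested, {'User-agent': {'User-agent': 'GoodCrawler'}}, because link_crawler passes a headers dict where download5 expects a user-agent string):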

    def download5(url, user_agent='wswp', proxy=None, num_retries=2):
        """Download *url* and return its body decoded as UTF-8, or None on failure.

        Args:
            url: Address to fetch.
            user_agent: Value for the ``User-agent`` header. A dict is also
                accepted and is then used as the complete set of request
                headers, because link_crawler passes its ``headers`` dict in
                this position.
            proxy: Optional proxy address, registered for the scheme of *url*.
            num_retries: How many more attempts are allowed after a 5XX HTTP
                error.

        Returns:
            The decoded page text, or None when the download failed.
        """
        print('Downloading:%s'%url)
        print('Downloading --- 5')
        if isinstance(user_agent, dict):
            # Wrapping a dict again would give {'User-agent': {...}}; http.client
            # then rejects the nested dict with
            # "TypeError: expected string or bytes-like object".
            headers = dict(user_agent)
        else:
            headers = {'User-agent': user_agent}
        print(headers)
        print(url)
        requestnew = request.Request(url, headers=headers)
        opener = request.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(request.ProxyHandler(proxy_params))
        try:
            # The context manager closes the response even if read/decode fails.
            with opener.open(requestnew) as htmlrsp:
                html = htmlrsp.read().decode('utf-8')
        except request.URLError as e:
            print('Download error:%s'%e.reason)
            html = None
            if num_retries > 0:
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    # retry 5XX HTTP errors
                    html = download5(url, user_agent, proxy, num_retries-1)
        return html
    

      Then we check "/usr/lib/python3.5/http/client.py":

    # the patterns for both name and value are more lenient than RFC
    # definitions to allow for backwards compatibility
    # Both are bytes patterns (rb'...'), so they only accept bytes-like input.
    # A legal name starts with a byte that is neither ':' nor whitespace and
    # contains no ':', CR or LF afterwards.
    _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch
    # A value is illegal when it contains an LF not followed by space/tab, or a
    # CR not followed by space/tab/LF.
    _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
    
    
        def putheader(self, header, *values):
            """Send a request header line to the server.

            For example: h.putheader('Accept', 'text/html')

            ``header`` is ASCII-encoded when it has an ``encode`` method. Each
            value is converted to bytes when it has an ``encode`` method
            (latin-1) or is an int (ASCII digits); a value of any other type
            is left unchanged. Multiple values are joined with CR LF TAB.

            Raises CannotSendHeader when the connection state is not
            _CS_REQ_STARTED, and ValueError for a header name or value that
            fails validation.
            """
            if self.__state != _CS_REQ_STARTED:
                raise CannotSendHeader()

            if hasattr(header, 'encode'):
                header = header.encode('ascii')

            if not _is_legal_header_name(header):
                raise ValueError('Invalid header name %r' % (header,))

            values = list(values)
            for i, one_value in enumerate(values):
                if hasattr(one_value, 'encode'):
                    values[i] = one_value.encode('latin-1')
                elif isinstance(one_value, int):
                    values[i] = str(one_value).encode('ascii')

                # A value that was neither str-like nor int (for example a dict)
                # reaches the validator unconverted; this is the call that fails
                # with TypeError in the traceback for a dict-valued header.
                if _is_illegal_header_value(values[i]):
                    raise ValueError('Invalid header value %r' % (values[i],))

            value = b'\r\n\t'.join(values)
            header = header + b': ' + value
            self._output(header)
    

      #

    >>> _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
    >>> _is_illegal_header_value('identity')
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: cannot use a bytes pattern on a string-like object
    >>> vl='identity'
    >>> type(vl)
    <class 'str'>
    >>> _is_illegal_header_value(vl)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: cannot use a bytes pattern on a string-like object
    

      ##匿名分组(a),(c)

    >>> p1=re.compile('(a)b(c)')                         #匿名分组(a),(c)

    #  re.match(pattern, string, flags=0)

    If zero or more characters at the beginning of string match the regular expression pattern, return a corresponding match object. Return None if the string does not match the pattern; note that this is different from a zero-length match.

    Note that even in MULTILINE mode, re.match() will only match at the beginning of the string and not at the beginning of each line

    >>> m = re.match('a','ASDasd')
    >>> print(m)
    None
    >>> m = re.match('a','aSDasd')
    >>> print(m)
    <_sre.SRE_Match object; span=(0, 1), match='a'>
    >>> m = re.match('a','aaaaaSDasd')
    >>> print(m)
    <_sre.SRE_Match object; span=(0, 1), match='a'>
    

      

    #Pattern.search(string[, pos[, endpos]])

    Scan through string looking for the first location where this regular expression produces a match, and return a corresponding match object. Return None if no position in the string matches the pattern;

    note that this is different from finding a zero-length match at some point in the string.

    >>> pattern = re.compile("d")
    >>> pattern.search("dog")
    <_sre.SRE_Match object; span=(0, 1), match='d'>
    >>> pattern.search("ogdd")
    <_sre.SRE_Match object; span=(2, 3), match='d'>
    >>> pattern.search("ogddddd")
    <_sre.SRE_Match object; span=(2, 3), match='d'>
    >>> pattern.search("ogddddd",1)
    <_sre.SRE_Match object; span=(2, 3), match='d'>
    >>> pattern.search("ogddddd",2)
    <_sre.SRE_Match object; span=(2, 3), match='d'>
    >>> pattern = re.compile("ddd")
    >>> pattern.search("dog")
    >>> pattern.search("ogdd")
    >>> pattern.search("ogddddd")
    <_sre.SRE_Match object; span=(2, 5), match='ddd'>
    >>> pattern.search("ogddddd",2)
    <_sre.SRE_Match object; span=(2, 5), match='ddd'>
    

      

  • 相关阅读:
    js中apply的用法(转)
    JS匿名函数的理解
    winform 添加“设置文件”
    用VMware安装虚拟系统时出现Invalid system disk,Replace the disk and then press any key
    Easy Multiple Copy to Clipboard by ZeroClipboard
    SaltStack配置管理-状态间关系
    Docker容器之Nginx
    CentOS7.2升级默认yum安装的php版本
    升级PHP版本导致zabbix无法访问解决办法
    Piwik网站访问统计软件安装
  • 原文地址:https://www.cnblogs.com/winditsway/p/12598829.html
Copyright © 2020-2023  润新知