• urllib源码简单分析


    对下面这段代码做分析

    import urllib
    params = urllib.urlencode({'wd': 'python'})
    f = urllib.urlopen("http://www.baidu.com/s?%s" % params)
    print f.read()

    这是一段简单读取url内容的代码

    此处最关键的是urlopen,通过查看,可以看到urlopen的代码如下

    def urlopen(url, data=None, proxies=None):
        """Create a file-like object for the specified URL to read from.

        A py3k deprecation warning is issued via warnpy3k() on every call.

        When ``proxies`` is given, a new FancyURLopener is built for this
        call only.  Otherwise a single module-level opener (``_urlopener``)
        is created on first use and reused on later calls.

        Returns whatever ``opener.open()`` returns; ``data`` is forwarded
        only when it is not None.
        """
        from warnings import warnpy3k
        warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
                 "favor of urllib2.urlopen()", stacklevel=2)
    
        global _urlopener
        if proxies is not None:
            # Explicit proxies: use a private opener, do not touch the cache.
            opener = FancyURLopener(proxies=proxies)
        elif not _urlopener:
            # First call without proxies: create and remember the opener.
            opener = FancyURLopener()
            _urlopener = opener
        else:
            # Reuse the opener created by an earlier call.
            opener = _urlopener
        if data is None:
            return opener.open(url)
        else:
            return opener.open(url, data)

    urlopen通过一个FancyURLopener的实例opener来完成请求。因为这里没有传入proxies参数,而且是第一次调用、全局的_urlopener还没有创建,所以会执行opener = FancyURLopener()这一句。

    然后返回opener.open(url),绑定到f实例上。在这里,有两个关键,一个是opener实例,一个是open方法。

    先来说说opener,opener是FancyURLopener()的对象,而FancyURLopener的父类是URLopener基类,而FancyURLopener这个类本身只做了一些http的异常响应处理,因此我们需要了解核心的基类,也就是看看URLopener到底做了什么?

    URLopener:通过查看源码,发现URLopener的主要处理方法是open。

        def open(self, fullurl, data=None):
            """Use URLopener().open(file) instead of open(file, 'r').

            Dispatches to the method named 'open_<scheme>' (dashes in the
            scheme replaced by underscores).  ``data`` is passed to that
            method only when it is not None.  A socket.error raised by the
            handler is re-raised as IOError.
            """
            fullurl = unwrap(toBytes(fullurl))
            # percent encode url, fixing lame server errors for e.g, like space
            # within url paths.
            fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
            # Cache hit: reopen the recorded local file and wrap it together
            # with the stored headers instead of dispatching to a handler.
            if self.tempcache and fullurl in self.tempcache:
                filename, headers = self.tempcache[fullurl]
                fp = open(filename, 'rb')
                return addinfourl(fp, headers, fullurl)
            urltype, url = splittype(fullurl)
            # A URL without a scheme is handled as a 'file' URL.
            if not urltype:
                urltype = 'file'
            if urltype in self.proxies:
                # A proxy is configured for this scheme: dispatch on the
                # proxy's own scheme and hand the handler a
                # (proxy host, full target URL) tuple instead of a string.
                proxy = self.proxies[urltype]
                urltype, proxyhost = splittype(proxy)
                host, selector = splithost(proxyhost)
                url = (host, fullurl) # Signal special case to open_*()
            else:
                proxy = None
            name = 'open_' + urltype
            self.type = urltype
            # Scheme names may contain '-', which is not valid in a method name.
            name = name.replace('-', '_')
            if not hasattr(self, name):
                # No handler method exists for this scheme.
                if proxy:
                    return self.open_unknown_proxy(proxy, fullurl, data)
                else:
                    return self.open_unknown(fullurl, data)
            try:
                if data is None:
                    return getattr(self, name)(url)
                else:
                    return getattr(self, name)(url, data)
            except socket.error, msg:
                # Re-raise as IOError, keeping the original traceback.
                raise IOError, ('socket error', msg), sys.exc_info()[2]

    open通过处理url,将name拼接成name = 'open_' + urltype的格式,也就是说,如果是http请求,则name为open_http。在上文那段代码里,最后调用返回的是getattr(self, name)(url),而由于name变成了open_http,则继续调用open_http方法。在这里可以看出,urllib根据你的type来给出不同的方法作处理。

    那么又要看看open_http干了什么.

    通过debug发现,open_http其实是用httplib来做底层处理的

        def open_http(self, url, data=None):
            """Use HTTP protocol.

            ``url`` is either a string (direct request) or a
            (proxy host, full URL) tuple (request through a proxy).
            Sends a POST when ``data`` is not None, otherwise a GET.
            Raises IOError when no host can be determined.
            """
            import httplib
            user_passwd = None
            proxy_passwd= None
            if isinstance(url, str):
                # Direct request: split off the host, then any user:passwd
                # prefix embedded in the host part.
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                # Proxy request: url is a (proxy host, full target URL) tuple.
                host, selector = url
                # check whether the proxy contains authorization information
                proxy_passwd, host = splituser(host)
                # now we proceed with the url we want to obtain
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'http':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        # Rebuild the selector from the host with the
                        # credentials already split off.
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                    if proxy_bypass(realhost):
                        # Connect to the target host itself, not the proxy.
                        host = realhost
    
                #print "proxy via http:", host, selector
            if not host: raise IOError, ('http error', 'no host given')
    
            # Credentials are unquoted and base64-encoded for the
            # 'Basic' authorization headers sent below.
            if proxy_passwd:
                proxy_passwd = unquote(proxy_passwd)
                proxy_auth = base64.b64encode(proxy_passwd).strip()
            else:
                proxy_auth = None
    
            if user_passwd:
                user_passwd = unquote(user_passwd)
                auth = base64.b64encode(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTP(host)
            if data is not None:
                # A request body is present: POST it as form-encoded data.
                h.putrequest('POST', selector)
                h.putheader('Content-Type', 'application/x-www-form-urlencoded')
                h.putheader('Content-Length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            # Extra headers configured on the opener, one argument tuple each.
            for args in self.addheaders: h.putheader(*args)
            h.endheaders(data)
            # Fetch the reply status and headers, then the file object for the body.
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()

    原文中以红色加粗标出了核心处理部分(此处格式已丢失)。可以看出,这是通过切割host和请求参数后来对服务器发起请求并处理response的过程。

    到目前为止,我们可以发现,开始构造请求的是这一句:h.putrequest('GET', selector)(此时请求行只是被写入缓冲区,到h.endheaders(data)时才真正发送给服务器)。

    那么继续追踪定位,

    在putrequest内部可以看到这样一句:hdr = '%s %s %s' % (method, url, self._http_vsn_str),本例中hdr的实际值为:'GET /s?wd=python HTTP/1.0'。接着调用self._output(hdr),而这个_output的作用是向当前请求缓冲区添加一行输出。请求发出之后,open_http中的h.getreply()通过下面这段代码取得服务器的响应,响应内容最终放在一个fp对象里。

            try:
                if not buffering:
                    response = self._conn.getresponse()
                else:
                    #only add this keyword if non-default for compatibility
                    #with other connection classes
                    response = self._conn.getresponse(buffering)
            except BadStatusLine, e:
                ### hmm. if getresponse() ever closes the socket on a bad request,
                ### then we are going to have problems with self.sock
    
                ### should we keep this behavior? do people use it?
                # keep the socket open (as a file), and return it
                self.file = self._conn.sock.makefile('rb', 0)
    
                # close our socket -- we want to restart after any protocol error
                self.close()
    
                self.headers = None
                return -1, e.line, None

    最后,返回的对象由addbase对fp做了一层包装:它把read、readline以及迭代器相关的方法直接转发给fp,调用方就可以通过read()或迭代的方式不断读回文件内容。

    class addbase:
        """Base class for addinfo and addclosehook.

        Wraps a file-like object ``fp`` and exposes its reading methods
        directly as attributes of the wrapper instance.
        """
    
        def __init__(self, fp):
            # Keep the wrapped object and bind its bound methods onto this
            # instance, so calls go straight to fp.
            self.fp = fp
            self.read = self.fp.read
            self.readline = self.fp.readline
            # readlines is forwarded only when fp provides it.
            if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
            if hasattr(self.fp, "fileno"):
                self.fileno = self.fp.fileno
            else:
                # fp has no fileno(): provide one that returns None.
                self.fileno = lambda: None
            # Iteration support is forwarded only when fp provides it.
            if hasattr(self.fp, "__iter__"):
                self.__iter__ = self.fp.__iter__
                if hasattr(self.fp, "next"):
                    self.next = self.fp.next
    
    ...

     

  • 相关阅读:
    em,pt和px之间的换算
    css中 中文字体(fontfamily)的标准英文名称
    HTML css面试题
    css实现的透明三角形
    JavaScript经典面试题系列
    C++ template 学习笔记(第二章)
    C++ template 学习笔记 (第五章)
    20120906
    C++ template 学习笔记(第十六章) 16.1 命名模版参数
    C++ template 学习笔记(第三,四章)
  • 原文地址:https://www.cnblogs.com/alexkn/p/4822707.html
Copyright © 2020-2023  润新知