爬虫基础 urllib
库的学习记录
说明:urllib 是一个汇集了若干用于处理 URL 的模块的包
urllib.request
用于打开和读取 URL
urllib.error
包含 urllib.request 抛出的异常
urllib.parse
用于解析 URL
urllib.robotparser
用于解析 robots.txt 文件
1 urllib.request
1.1 发送请求 urllib.request
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
url
: 需要打开的网址
data
: post提交的数据(必须是 bytes 类型)
timeout
: 设置网站的访问超时时间(单位:秒)
1.1.1 发送 get
请求
基本的使用
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
# response.read() -- > bytes
print(response.read().decode('utf-8'))
1.1.2 发送 post
请求
# 1. 对 data 进行处理
import urllib.parse
data = bytes(urllib.parse.urlencode({'hello':'word'}),encoding='utf-8')
# 处理后的结果 b'hello=word'
# 2. 使用request.urlopen() 发送请求
response = urllib.request.urlopen('http://www.httpbin.org/post',data=data)
response._method # 获取请求方法
'POST'
response.url # 获取请求的url
'http://www.httpbin.org/post'
1.1.3 超时设置
import urllib.request
response = urllib.request.urlopen('http://httpbin.org/get',timeout=1)
print(response.read())
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://httpbin.org/get',timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason,socket.timeout):
        print('TIME OUT')
1.2 响应对象 Response
1.2.1 响应对象
print(dir(response))
['__abstractmethods__',
'__class__',
'__del__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__enter__',
'__eq__',
'__exit__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__iter__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__next__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'_abc_cache',
'_abc_negative_cache',
'_abc_negative_cache_version',
'_abc_registry',
'_checkClosed',
'_checkReadable',
'_checkSeekable',
'_checkWritable',
'_check_close',
'_close_conn',
'_get_chunk_left',
'_method',
'_peek_chunked',
'_read1_chunked',
'_read_and_discard_trailer',
'_read_next_chunk_size',
'_read_status',
'_readall_chunked',
'_readinto_chunked',
'_safe_read',
'_safe_readinto',
'begin',
'chunk_left',
'chunked',
'close',
'closed',
'code',
'debuglevel',
'detach',
'fileno',
'flush',
'fp',
'getcode',
'getheader',
'getheaders',
'geturl',
'headers',
'info',
'isatty',
'isclosed',
'length',
'msg',
'peek',
'read',
'read1',
'readable',
'readinto',
'readinto1',
'readline',
'readlines',
'reason',
'seek',
'seekable',
'status',
'tell',
'truncate',
'url',
'version',
'will_close',
'writable',
'write',
'writelines'
]
1.2.2 常用的几个方法
response.status # 200
response.getcode() # 200
response.code # 200
response.url # 'http://www.baidu.com'
response._method # 'GET'
response.getheaders()
[
('Access-Control-Allow-Credentials', 'true'),
('Access-Control-Allow-Origin', '*'),
('Content-Type', 'application/json'),
('Date', 'Fri, 14 Jun 2019 02:33:18 GMT'),
('Referrer-Policy', 'no-referrer-when-downgrade'),
('Server', 'nginx'),
('X-Content-Type-Options', 'nosniff'),
('X-Frame-Options', 'DENY'),
('X-XSS-Protection', '1; mode=block'),
('Content-Length', '226'),
('Connection', 'Close')
]
response.getheader('Server') # nginx
1.3 构造请求 Request
class Request:
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False,
method=None):
In [3]: url = 'https://www.baidu.com/'
In [4]: req = urllib.request.Request(url=url)
In [5]: dir(req)
Out[5]:
['__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'__weakref__',
'_data',
'_full_url',
'_parse',
'_tunnel_host',
'add_header',
'add_unredirected_header',
'data',
'fragment',
'full_url',
'get_full_url',
'get_header',
'get_method',
'has_header',
'has_proxy',
'header_items',
'headers',
'host',
'origin_req_host',
'remove_header',
'selector',
'set_proxy',
'type',
'unredirected_hdrs',
'unverifiable']
data
post数据的处理
headers={}
构造请求头信息
1.3.1 基本的使用
import urllib.request
import urllib.parse
url = 'http://httpbin.org/post'
# 请求头
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
headers['Host'] = 'httpbin.org'
# data -- bytes
dict = {'name': 'Germey'}
data = urllib.parse.urlencode(dict).encode('utf-8')
# 实例化请求对象 传入参数
request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
print(request)
# <urllib.request.Request object at 0x000002404A9689E8>
##############################################################
# --------以上只不过是创建请求对象但并没有发送请求------------
#############################################################
# 发送请求对象并返回响应对象
response = urllib.request.urlopen(request)
print(response)
# <http.client.HTTPResponse object at 0x000002404AFBC358>
1.3.2 添加Header
- 构造请求时进行添加
# 增加header
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Host':'httpbin.org'
}
# 构造POST表格
dict = {
'name':'Germey'
}
data = bytes(parse.urlencode(dict),encoding='utf8')
req = request.Request(url=url,data=data,headers=headers,method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
- 使用
add_header
方法添加
import urllib.request
req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
# Customize the default User-Agent header value:
req.add_header('User-Agent', 'urllib-example/0.1 (Contact: . . .)')
r = urllib.request.urlopen(req)
1.3.3 设置Cookie
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name+"="+item.value)
# 保存cookie为文本
import http.cookiejar, urllib.request
filename = "cookie.txt"
# 保存类型有很多种
## 类型1
cookie = http.cookiejar.MozillaCookieJar(filename)
## 类型2
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
# 将cookie写入 filename 指定的文件(不调用 save 则不会生成文件)
cookie.save(ignore_discard=True, ignore_expires=True)
# 使用相应的方法读取
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt',ignore_discard=True,ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
1.3.4 设置代理
from urllib import request
url = 'http://httpbin.org/ip'
proxy = {'http':'218.18.232.26:80','https':'218.18.232.26:80'}
# 创建代理处理器
proxies = request.ProxyHandler(proxy)
# 创建opener对象
opener = request.build_opener(proxies)
resp = opener.open(url)
print(resp.read().decode())
2 url解析 urllib.parse
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
"urlsplit", "urlunsplit", "urlencode", "parse_qs",
"parse_qsl", "quote", "quote_plus", "quote_from_bytes",
"unquote", "unquote_plus", "unquote_to_bytes",
"DefragResult", "ParseResult", "SplitResult",
"DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
urlparse
urlunparse
quote/quote_plus
unquote/unquote_plus
urljoin
urlencode
parse_qs/parse_qsl
2.1 url
解析 urlparse
In [8]: from urllib.parse import urlparse
In [9]: o = urlparse('https://docs.python.org/3/library/urllib.parse.html')
'''
将url分成六个部分,返回一个包含6个字符串项目的具名元组:
协议 : scheme
位置 : netloc
路径 : path
参数 : params
查询 : query
片段 : fragment
输出结果如下
'''
In [10]: o
Out[10]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')
In [11]: dir(o)
Out[11]:
['__add__',
'__class__',
'__contains__',
'__delattr__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__getitem__',
'__getnewargs__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__iter__',
'__le__',
'__len__',
'__lt__',
'__module__',
'__mul__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__rmul__',
'__setattr__',
'__sizeof__',
'__slots__',
'__str__',
'__subclasshook__',
'_asdict',
'_encoded_counterpart',
'_fields',
'_hostinfo',
'_make',
'_replace',
'_source',
'_userinfo',
'count',
'encode',
'fragment',
'geturl',
'hostname',
'index',
'netloc',
'params',
'password',
'path',
'port',
'query',
'scheme',
'username']
In [12]: o.path
Out[12]: '/3/library/urllib.parse.html'
In [13]: o.scheme
Out[13]: 'https'
In [14]: o.geturl()
Out[14]: 'https://docs.python.org/3/library/urllib.parse.html'
url = "https://docs.python.org/3.5/library/urllib.parse.html?highlight=parse#module-urllib.parse"
result = parse.urlparse(url)
print(result.query) # 获取返回结果参数内容
print(parse.parse_qs(result.query)) # 结果转换成字典
print(parse.parse_qsl(result.query)) # 结果转换成列表
2.2 url
反向解析 urlunparse
In [15]: o
Out[15]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')
In [16]: from urllib.parse import urlunparse
In [17]: urlunparse(o)
Out[17]: 'https://docs.python.org/3/library/urllib.parse.html'
# list(o)
In [18]: urlunparse(list(o))
Out[18]: 'https://docs.python.org/3/library/urllib.parse.html'
2.3 url
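解析 parse_qs/parse_qsl
注意:parse_qs/parse_qsl 只接受查询字符串(即 ? 之后的部分)。若像下面这样直接传入完整 URL,? 之前的内容会被当作键名的一部分;正确做法是先用 urlparse(url).query 取出查询字符串再解析。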
In [52]: parse_qs('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[52]: {'https://i.cnblogs.com/EditPosts.aspx?opt': ['1']}
In [53]: parse_qsl('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[53]: [('https://i.cnblogs.com/EditPosts.aspx?opt', '1')]
2.4 url
编码/解码 quote/unquote
Help on function quote in module urllib.parse:
quote(string, safe='/', encoding=None, errors=None)
quote('abc def') -> 'abc%20def'
Each part of a URL, e.g. the path info, the query, etc., has a
different set of reserved characters that must be quoted.
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
the following reserved characters.
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
"$" | ","
Each of these characters is reserved in some component of a URL,
but not necessarily in all of them.
By default, the quote function is intended for quoting the path
section of a URL. Thus, it will not encode '/'. This character
is reserved, but in typical usage the quote function is being
called on a path where the existing slash characters are used as
reserved characters.
string and safe may be either str or bytes objects. encoding and errors
must not be specified if string is a bytes object.
The optional encoding and errors parameters specify how to deal with
non-ASCII characters, as accepted by the str.encode method.
By default, encoding='utf-8' (characters are encoded with UTF-8), and
errors='strict' (unsupported characters raise a UnicodeEncodeError).
In [26]: search = '搜索内容'
In [27]: quote(search)
Out[27]: '%E6%90%9C%E7%B4%A2%E5%86%85%E5%AE%B9'
2.5 url
编码对比 quote/quote_plus(对应的反向解码为 unquote/unquote_plus)
In [41]: from urllib import parse
In [42]: parse.quote('a&b/c')
Out[42]: 'a%26b/c' # 未编码斜线
In [43]: parse.quote_plus('a&b/c')
Out[43]: 'a%26b%2Fc' # 编码了斜线
2.6 url
urlencode
In [44]: query = {
...:
...: 'name': 'Lee',
...:
...: 'age': 19,
...:
...: }
In [45]: type(query)
Out[45]: dict
In [46]: parse.urlencode(query)
Out[46]: 'name=Lee&age=19'
2.6.1 GET
的请求方式
>>> import urllib.request
>>> import urllib.parse
>>> params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
>>> with urllib.request.urlopen(url) as f:
...     print(f.read().decode('utf-8'))
2.6.2 POST
的请求方式
>>> import urllib.request
>>> import urllib.parse
>>> data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> data = data.encode('ascii')
>>> with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
...     print(f.read().decode('utf-8'))
2.7 其余方法-见参考文档
本文参考链接
GitHub源码: https://github.com/python/cpython/blob/3.7/Lib/urllib/request.py
博主Hoptop : https://www.jianshu.com/u/9ea40b5f607a