urllib 是 Python 中用于获取 URL(Uniform Resource Locator,统一资源定位符)的标准库模块,我们可以利用它抓取远程数据并保存。下面整理了 urllib 使用中关于 header、代理、超时、认证、异常处理的一些方法,一起来看看。
Python 3 抓取网页资源的 N 种方法
1、最简单
import urllib.request

# Simplest possible fetch: open the URL and read the whole body as bytes.
# The ``with`` block closes the HTTP response (and its socket) as soon as
# the body has been read instead of leaving that to garbage collection.
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()
2、使用 Request
import urllib.request

# Build an explicit Request object first, then hand it to urlopen().
req = urllib.request.Request('http://python.org/')

# Close the response deterministically once the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
3、发送数据
#! /usr/bin/env python3

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'

# Form fields to submit; passing data to Request makes this a POST.
values = {
    'act': 'login',
    'login[email]': 'abc@abc.com',
    'login[password]': '123456',
}

# urlencode() returns str, but in Python 3 the request body must be bytes;
# passing str makes urlopen() raise TypeError, so encode it first.
data = urllib.parse.urlencode(values).encode('utf-8')

req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')

# Read the body inside ``with`` so the connection is closed promptly.
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))
4、发送数据和header
#! /usr/bin/env python3

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# Form fields to submit; passing data to Request makes this a POST.
values = {
    'act': 'login',
    'login[email]': 'abc@abc.com',
    'login[password]': '123456',
}

# Extra request headers, supplied as the third Request argument.
headers = {'User-Agent': user_agent}

# urlencode() returns str, but in Python 3 the request body must be bytes;
# passing str makes urlopen() raise TypeError, so encode it first.
data = urllib.parse.urlencode(values).encode('utf-8')

req = urllib.request.Request(url, data, headers)

# Read the body inside ``with`` so the connection is closed promptly.
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))
5、http 错误
#! /usr/bin/env python3

# Import urllib.error explicitly rather than relying on urllib.request
# happening to import it as a side effect.
import urllib.error
import urllib.request

req = urllib.request.Request('http://python.org/')

try:
    # Only the failure path matters here, so close the response right away.
    urllib.request.urlopen(req).close()
except urllib.error.HTTPError as e:
    # The exception carries the HTTP status code and the error page body.
    print(e.code)
    print(e.read().decode("utf8"))
6、异常处理1
#! /usr/bin/env python3

from urllib.error import URLError, HTTPError
from urllib.request import Request, urlopen

req = Request('http://www.python.org/')

try:
    response = urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    # Double quotes are required here: the message contains an apostrophe.
    print("The (www.python.org)server couldn't fulfill the request.")
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    # Success path: the request went through without raising.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
7、异常处理2
#! /usr/bin/env python3

from urllib.error import URLError
from urllib.request import Request, urlopen

req = Request("http://www.python.org/")

try:
    response = urlopen(req)
except URLError as e:
    # HTTPError (a URLError subclass) exposes both ``code`` and ``reason``,
    # so ``code`` must be tested first; otherwise HTTP errors would be
    # misreported as connection failures and this branch never reached.
    if hasattr(e, 'code'):
        # Double quotes are required: the message contains an apostrophe.
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    # Success path: the request went through without raising.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
8、HTTP 认证
#! /usr/bin/env python3

import urllib.request

# URL the credentials are registered for, and the URL that is fetched.
top_level_url = "https://www.python.org/"
a_url = "https://www.python.org/"

# Password manager holding the username/password pair.
# realm=None is used because the realm is not known; a known realm
# could be passed instead.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(
    realm=None,
    uri=top_level_url,
    user='rekfan',
    passwd='xxxxxx',
)

# Wrap the password manager in a basic-auth handler and build an
# opener (OpenerDirector instance) around it.
handler = urllib.request.HTTPBasicAuthHandler(password_mgr=password_mgr)
opener = urllib.request.build_opener(handler)

# First fetch: go through the opener directly and print the raw bytes.
x = opener.open(a_url)
raw_body = x.read()
print(raw_body)

# Install the opener so that every later urllib.request.urlopen() call
# goes through it as well.
urllib.request.install_opener(opener)

# Second fetch: plain urlopen(), this time decoded to text.
installed_response = urllib.request.urlopen(a_url)
a = installed_response.read().decode('utf8')
print(a)
9、使用代理
#! /usr/bin/env python3

import urllib.request

# ProxyHandler maps a request URL scheme to the proxy used for that scheme.
# The keys must therefore be schemes such as 'http' or 'https'; any other
# key (for example 'sock5') never matches a request, and the proxy is
# silently bypassed.
# NOTE: urllib has no built-in SOCKS support. This assumes an HTTP proxy is
# listening on localhost:1080; a SOCKS5 proxy needs a third-party library.
proxy_support = urllib.request.ProxyHandler({
    'http': 'http://localhost:1080',
    'https': 'http://localhost:1080',
})

opener = urllib.request.build_opener(proxy_support)

# Make every later urllib.request.urlopen() call go through the proxy.
urllib.request.install_opener(opener)

# Read the body inside ``with`` so the connection is closed promptly.
with urllib.request.urlopen("http://www.python.org/") as response:
    a = response.read().decode("utf8")

print(a)
10、超时
#! /usr/bin/env python3

import socket
import urllib.request

# Timeout in seconds.
timeout = 2

# Set the default timeout in the socket module; the urlopen() call below
# then uses this default without being given a timeout of its own.
socket.setdefaulttimeout(timeout)

req = urllib.request.Request('http://www.python.org/')

# Fetch the page and print the raw bytes that come back.
response = urllib.request.urlopen(req)
a = response.read()
print(a)