urllib的介绍
# Send a request to the given URL and persist the server's response.
import urllib.request

url = 'http://www.baidu.com/'  # fixed typo: original had 'http://www/baidu.com/'
response = urllib.request.urlopen(url)

# Way 1: read the whole body as bytes.
data = response.read()
# Way 2: read a single line:        data = response.readline()
# Way 3: read all lines into a list: data = response.readlines()
# NOTE: readlines() returns a list of bytes objects; a list cannot be passed
# to f.write() (TypeError) — use f.writelines() for that. Here we save the
# bytes obtained from read().

with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file.html', 'wb') as f:
    f.write(data)
response 常用的属性
# Commonly used attributes/methods of the response object
# (assumes `response` from the urlopen() example above).
print(response.info())     # response headers (http.client.HTTPMessage)
print(response.getcode())  # HTTP status code
print(response.geturl())   # the URL that was actually fetched (after redirects)
# Percent-decoding / encoding of URL text:
print(urllib.request.unquote('url'))  # decode %-escapes (e.g. Chinese characters)
print(urllib.request.quote('url'))    # percent-encode
爬取到的网页直接写入文件
# Write the crawled page straight to a file with urlretrieve():
# urllib.request.urlretrieve('http://www.baidu.com',
#                            filename=r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file2.html')
# Clear the temporary cache left behind by urlretrieve():
# urllib.request.urlcleanup()
模拟浏览器
# 3. Impersonate a browser by sending a User-Agent request header.
# url = 'http://www.baidu.com'
# # Request headers:
# headers = {
#     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
#                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
# }
# # Build a Request object carrying the headers.
# req = urllib.request.Request(url, headers=headers)
# # Send the request.
# response = urllib.request.urlopen(req)
# data = response.read()

# Timeout mechanism: if a page takes too long to respond, treat it as timed
# out and move on to the next fetch.
# for i in range(1, 100):
#     try:
#         response = urllib.request.urlopen(
#             "http://www.baidu.com", timeout=0.5)
#         print(len(response.read().decode('utf-8')))
#     except:  # NOTE(review): bare except — should catch urllib.error.URLError / TimeoutError
#         print('请求超时, 继续下一个爬取')
参数打包
# POST form data: urlencode the fields and attach them as the request body.
# url = "http://www.sunck.wang:8085/form"
# data = {
#     "username": "sunck",
#     "passwd": "666"
# }
# # Pack the data to be sent — note it must be encoded to bytes.
# postData = urllib.parse.urlencode(data).encode('utf-8')
# # Request headers:
# headers = {
#     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
#                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
# }
# # Build the request with headers and body (data= makes it a POST).
# req = urllib.request.Request(url, headers=headers, data=postData)
# # Send the request.
# response = urllib.request.urlopen(req)
# data = response.read()
# print(data.decode('utf-8'))
urlencode 编码
举例:
# urlencode: percent-encode a mapping into a query string.
# `import urllib` alone does not guarantee the `parse` submodule is loaded;
# import it explicitly (this also binds the top-level `urllib` name).
import urllib.parse

param = {'name': '王月'}
print(urllib.parse.urlencode(param))  # -> name=%E7%8E%8B%E6%9C%88
结果:name=%E7%8E%8B%E6%9C%88
parse_qs 解码
举例:
# parse_qs: decode a query string back into a dict mapping each key to a
# list of values (the inverse of urlencode for repeated keys).
encoded = urllib.parse.urlencode(param)
print(urllib.parse.parse_qs(encoded))  # {'name': ['王月']}
{'name': ['王月']}
处理HTTPS的网址
# Handling HTTPS URLs: skip certificate verification with an unverified
# SSL context (demo code only — never do this in production).
import ssl
import json
import urllib.request

# def ajaxCrawler(url):
#     """Fetch a JSON (AJAX) endpoint and return the parsed data."""
#     headers = {
#         'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
#                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
#     }
#     req = urllib.request.Request(url, headers=headers)
#     # Use an unverified SSL context so the HTTPS handshake skips cert checks.
#     context = ssl._create_unverified_context()
#     response = urllib.request.urlopen(req, context=context)
#     json_str = response.read().decode('utf-8')
#     json_data = json.loads(json_str)
#     return json_data
#
# url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
# info = ajaxCrawler(url)
# print(info)
#
# # Automatically crawl the following pages.
# for i in range(10):
#     url2 = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=" + str(
#         i * 20) + "&limit=20"
#     info = ajaxCrawler(url2)
#     print(len(info))


def joyCrawler(url):
    """Download an HTTPS page with certificate checks disabled and save it.

    The page is written to a fixed local HTML file. Returns the decoded
    HTML text (the original returned None implicitly, so the final print
    below printed 'None').
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    # Unverified context: disables certificate validation (demo only).
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    HTML = response.read().decode('utf-8')
    with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file3.html', 'w') as f:
        f.write(HTML)
    return HTML


url = 'https://www.qiushibaike.com/8hr/page/2/'
print(joyCrawler(url))
urlparse和urlsplit 函数:
# urlparse vs urlsplit: break a URL into its named components.
import urllib.parse

url = 'http://www.baidu.com/s?wd=python&username=abc#1'

result = urllib.parse.urlparse(url)
print(result)
# ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='',
#             query='wd=python&username=abc', fragment='1')

# Components are accessed by attribute:
print('path:', result.path)

result2 = urllib.parse.urlsplit(url)
print(result2)
# SplitResult(scheme='http', netloc='www.baidu.com', path='/s',
#             query='wd=python&username=abc', fragment='1')
# Difference: urlsplit has no `params` field; urlparse does.