用到的主要知识点:
requests.get 获取网页HTML
etree.HTML 使用lxml解析器解析网页
xpath 使用xpath获取网页标签信息、图片地址
request.urlretrieve 下载图片(注:该网站使用 urlretrieve 下载图片时,返回 403 错误。原因尚未确认,推测是 urlretrieve 默认不携带浏览器的 User-Agent 请求头,请求被网站拒绝。)
因此改用 requests.get 获取图片内容,再通过 with open(...) as f 把内容写入文件来下载图片:
with open('文件地址及名字', 'wb') as f:
    f.write(res.content)
详细代码如下:
#!/usr/bin/env python
# author: Simple-Sir
# time: 2019/7/17 10:14
# Scrape wallpaper images from a website.
import requests
from lxml import etree
from urllib import request
import urllib
import time

# Pretend to be a desktop browser when requesting pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}


def getUrlText(url, timeout=10):
    """Fetch a page and return it parsed as an lxml HTML tree.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up;
        without it ``requests.get`` can block indefinitely
    :return: the tree produced by ``etree.HTML`` from the response text
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(response.text)


def getWallUrl(url):
    """Return the link addresses found in the thumbnail listing of a page.

    :param url: address of the listing page
    :return: list of ``href`` values under the ``<ul>`` of the first
        ``<section class="thumb-listing-page">``; an empty list when the
        page contains no such section
    """
    page = getUrlText(url)
    sections = page.xpath('//section[@class="thumb-listing-page"]')
    if not sections:
        # No listing section on this page: report "no links" instead of
        # failing with IndexError on the [0] lookup.
        return []
    return sections[0].xpath('./ul//@href')


def getTime():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # With no time tuple, strftime formats the current local time.
    return time.strftime('%Y-%m-%d %H:%M:%S')


# NOTE(review): the downWall definition below is cut off in this excerpt, so
# it is kept exactly as found (still collapsed) rather than reconstructed.
# 解析壁纸下载地址 def downWall(url,page): ''' :param url: 网页地址 :param page: 下载页数 :return: 下载结束提醒 ''' m = 0 page += 1 for i in range(1,page): hrefList = getWallUrl(url+str(i)) n = 0 print('