import requests
from PIL import Image
from io import BytesIO
import re
from requests.exceptions import HTTPError
root = "http://pic.netbian.com/index_%d.html"
# URL pattern shared by every listing page except the front page
uni = "http://pic.netbian.com"
# Prefix prepended to the partial sub-page and full-size-image paths
AllPage = []
# Listing-page URLs to crawl
AllImgHTML = []
# Partial URLs of the sub-page behind each thumbnail, format /tupian/*.html
AllImgURL = []
# Partial URL of each full-size image, format /uploads/allimg/*.jpg
def GetPageURL(root, Start, counts):
    """Append the URLs of *counts* listing pages, starting at page
    *Start*, to the module-level AllPage list.

    The site's front page is plain index.html rather than index_1.html,
    so page 1 is special-cased; every later page follows the
    index_%d.html pattern in *root*.
    """
    if Start == 1:
        # Front page does not follow the index_%d.html pattern.
        AllPage.append("http://pic.netbian.com/index.html")
        Start += 1
        counts -= 1
    for i in range(Start, Start + counts):
        AllPage.append(root.replace("%d", str(i)))
def GetImgHTML(AllPage):
    """Fetch every listing page in *AllPage* and collect the partial
    sub-page paths (/tupian/*.html) behind each thumbnail into the
    module-level AllImgHTML list.

    A page that fails to download is reported and skipped, so one bad
    page does not abort the whole crawl.
    """
    for PageURL in AllPage:
        try:
            res = requests.get(PageURL)
            res.raise_for_status()
        except HTTPError:
            print("HTTP Error!")
            continue  # `res` is unusable; move on to the next page
        except requests.exceptions.ConnectionError:
            # requests' ConnectionError does not derive from the builtin
            # ConnectionError, so catch the requests one explicitly.
            print("Failed to connect!")
            continue
        # The site serves GBK-encoded HTML while requests guesses
        # ISO-8859-1; set the encoding instead of round-tripping the
        # payload through a temporary file.
        res.encoding = "gbk"
        # HTML attribute values are quoted, so splitting on '"'
        # isolates each href value as its own token.
        for token in res.text.split('"'):
            if re.fullmatch(r"/tupian/.*\.html", token):
                AllImgHTML.append(token)
def GetImgURL():
    """Visit every sub-page collected in AllImgHTML, scrape the first
    full-size image path (/uploads/allimg/*.jpg) on each, record it in
    the module-level AllImgURL list, and return the list of complete
    downloadable image URLs.

    A sub-page that fails to download is reported and skipped.
    """
    for partial in AllImgHTML:
        try:
            htmlres = requests.get(uni + partial)
            htmlres.raise_for_status()
        except HTTPError:
            print("HTTP Error!")
            continue  # `htmlres` is unusable; skip this sub-page
        except requests.exceptions.ConnectionError:
            # requests' ConnectionError does not derive from the builtin
            # ConnectionError, so catch the requests one explicitly.
            print("Failed to connect!")
            continue
        # Decode the GBK page directly instead of writing it to disk.
        htmlres.encoding = "gbk"
        # Attribute values are quoted; split on '"' to isolate them.
        for token in htmlres.text.split('"'):
            if re.fullmatch(r"/uploads/allimg/.*\.jpg", token):
                AllImgURL.append(token)
                break  # only the first full-size image per sub-page
    # Join each partial path with the site prefix to get usable URLs.
    return [uni + partial for partial in AllImgURL]
def DownloadWallpaper(url, path):
    """Download the image at *url* and save it to *path*.

    Errors are reported rather than raised so that one bad image does
    not abort the whole batch.
    """
    try:
        res = requests.get(url)
        res.raise_for_status()
        Image.open(BytesIO(res.content)).save(path)
        print("Done...")
    except HTTPError:
        print("HTTP Error!")
    except requests.exceptions.ConnectionError:
        # requests' ConnectionError does not derive from the builtin
        # ConnectionError, so the builtin name would let real
        # connection failures escape uncaught.
        print("Failed to connect!")
if __name__ == "__main__":
    # Crawl 2 listing pages starting from page 1.
    GetPageURL(root, 1, 2)
    GetImgHTML(AllPage)
    UsefulImgURL = GetImgURL()
    for i, img_url in enumerate(UsefulImgURL):
        print(i, end=" ")
        # Forward slashes are valid in Windows paths and avoid the
        # broken "F:\1\" literal (\1 is an octal escape, and a trailing
        # backslash before the closing quote is a syntax error).
        DownloadWallpaper(img_url, "F:/1/" + str(i) + ".jpg")
    print("Task completed!")