Scrape the image listings from the netbian (彼岸) site, download the images, and save them sorted by category.
Approach: first collect the links of the different categories, then the links of each category's paginated list pages, then open each image's detail page to get the actual download address, and finally download the image into a folder named after its category.
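The one non-obvious URL detail in this approach is pagination: page 1 of a category is the category index itself, while every later page lives at index_<n>.html under the same path (this is the pattern the full script below relies on). A quick sketch of that scheme; the 4kfengjing path is only a hypothetical example category:

def page_urls(category_url, pages=4):
    """Yield the first `pages` list-page URLs of one category."""
    for n in range(1, pages + 1):
        # Page 1 is the category index; pages 2+ are index_<n>.html.
        yield category_url if n == 1 else "{}index_{}.html".format(category_url, n)

# Hypothetical example:
# page_urls("http://pic.netbian.com/4kfengjing/") yields
#   http://pic.netbian.com/4kfengjing/
#   http://pic.netbian.com/4kfengjing/index_2.html  ... up to index_4.html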
The full script:
# -*- coding: utf-8 -*-
import os
import time
from urllib.parse import urljoin

import requests
from lxml import etree

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
}
base_url = "http://pic.netbian.com/"
count = 1


def send_request(url):
    """Fetch a URL and return the raw bytes (works for both HTML pages and images)."""
    time.sleep(2)  # throttle requests to be polite to the server
    return requests.get(url=url, headers=header).content


def get_category_links():
    """Step 1: collect the link and title of every category on the home page."""
    html = etree.HTML(send_request(base_url))
    for link in html.xpath("//div[@class='classify clearfix']/a"):
        url = urljoin(base_url, link.xpath("./@href")[0])
        title = link.xpath("./@title")[0]
        crawl_category(title, url)


def crawl_category(dir_name, link):
    """Step 2: walk the first 4 list pages of one category."""
    for i in range(1, 5):
        # Page 1 is the category index itself; later pages are index_<n>.html.
        url = link if i == 1 else urljoin(link, "index_{}.html".format(i))
        html = etree.HTML(send_request(url))
        for li in html.xpath("//div[@class='slist']/ul/li"):
            img_name = li.xpath("./a/img/@alt")[0]
            img_link = urljoin(base_url, li.xpath("./a/@href")[0])
            crawl_detail_page(dir_name, img_name, img_link)


def crawl_detail_page(dir_name, img_name, img_link):
    """Step 3: open the detail page and extract the real image address."""
    html = etree.HTML(send_request(img_link))
    image_url = urljoin(base_url, html.xpath("//*[@id='img']/img/@src")[0])
    download(dir_name, img_name, image_url)


def download(dir_name, img_name, image_url):
    """Step 4: download the image and save it under its category folder."""
    global count
    path = "彼岸图库/{}".format(dir_name)
    if not os.path.exists(path):
        os.makedirs(path)
        print("------- folder [{}] created, starting download -------".format(dir_name))
    print("downloading {}, image #{}".format(img_name, count))
    data = send_request(image_url)
    with open("{}/{}.jpg".format(path, img_name), "wb") as f:
        f.write(data)
    print("{} downloaded successfully, image #{}".format(img_name, count))
    count += 1


if __name__ == "__main__":
    get_category_links()
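Two fragile points are worth hardening if you run this for long: the alt text that becomes the file name can contain characters that are illegal in Windows paths, and a crash midway forces a full re-download. A minimal sketch of both fixes, as a drop-in replacement for download above; sanitize_name and the skip-if-exists check are my additions, not part of the original script:

import os
import re


def sanitize_name(name):
    # Replace characters that are illegal in Windows file names.
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()


def download(dir_name, img_name, image_url):
    path = "彼岸图库/{}".format(dir_name)
    os.makedirs(path, exist_ok=True)  # no need to test for the folder first
    file_path = "{}/{}.jpg".format(path, sanitize_name(img_name))
    if os.path.exists(file_path):     # resume: skip images already on disk
        print("skipping {}, already downloaded".format(img_name))
        return
    with open(file_path, "wb") as f:
        f.write(send_request(image_url))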