说明:
1. 这个其实是在下载漫画之前写的,比那个稍微简单点,之前忘放到博客备份了。
2. 不想说啥了,总结放到漫画那个里面吧!
import urllib.request
import re
import os

# Scrape recent pages of jandan.net/ooxx and save the Sina-hosted .jpg images.
# Sample markup this scraper targets:
#   http://jandan.net/ooxx/page-2381#comments
#   <span class="current-comment-page">[2381]</span>
#   <img src="//wx4.sinaimg.cn/orj360/797ccd21gy1fdcjecuo1jj20qo0usacj.jpg" ...>
#   <a href="//ww1.sinaimg.cn/large/6715afcfgw1ef4zrjdaswj20js0qotag.jpg" target="_blank" class="view_img_link">[查看原图]</a>
url = "http://jandan.net/ooxx/page-2381#comments"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}

# Fetch the landing page to discover the current comment-page number.
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
html = response.read().decode("utf-8")

# The current page number appears as:
#   <span class="current-comment-page">[2381]</span>
# Offset +23 skips past the marker text up to the digits.
page = html.find("current-comment-page")
page = html[page+23:page+27]   # NOTE(review): assumes a 4-digit page number — brittle

# Concatenate the HTML of the 10 pages preceding the current one.
htmlPages = ""
for i in range(int(page)-10, int(page)):
    urlPage = "http://jandan.net/ooxx/page-" + str(i) + "#comments"
    reqPage = urllib.request.Request(url=urlPage, headers=headers)
    responsePage = urllib.request.urlopen(reqPage)
    htmlPages += responsePage.read().decode("utf-8")

# BUG FIX: dots are now escaped. The original pattern used bare '.', which
# matches any character and could match unintended hosts or extensions.
regImg = r"//[0-9a-z]+\.sinaimg\.cn/large/[0-9a-z]+\.jpg"
imgUrl = re.findall(regImg, htmlPages)
imgNum = len(imgUrl)

# Create the output folder (don't crash if it already exists from a previous
# run — the original os.mkdir raised FileExistsError) and cd into it.
os.makedirs("test", exist_ok=True)
os.chdir("test")

# Download every image. Addresses are protocol-relative ('//host/...'),
# so a scheme must be prefixed before opening them.
for i, addr in enumerate(imgUrl):
    req = urllib.request.Request(url="http:" + addr, headers=headers)
    responseImg = urllib.request.urlopen(req)
    # BUG FIX: the original ended with `img.close` (no parentheses), which
    # referenced the method without calling it, so the file was never closed.
    # A with-block guarantees the handle is closed even on error.
    with open(str(i) + ".jpg", "wb") as img:
        img.write(responseImg.read())
小甲鱼源码(论坛里复制来的,其实是可以运行的,每个图片地址加上http:就可以了):
1 import urllib.request 2 import os 3 import random 4 # 煎蛋网已经禁用爬虫了,所以此程序无法运行 5 def url_open(url): 6 req = urllib.request.Request(url) 7 req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36') 8 # iplist = ['111.197.141.57:9797','116.228.236.219:8080','120.26.51.101:8118','113.222.80.216:3128','117.90.1.88:9000'] 9 # proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)}) 10 # opener = urllib.request.build_opener(proxy_support) 11 # urllib.request.install_opener(opener) 12 response = urllib.request.urlopen(url) 13 html = response.read() 14 return html 15 16 def get_page(url): 17 html = url_open(url).decode('utf-8') 18 a = html.find('current-comment-page') + 23 19 b = html.find(']',a) 20 return html[a:b] 21 22 def find_imgs(url): 23 html = url_open(url).decode('utf-8') 24 img_addrs = [] 25 a = html.find('img src=') 26 while a != -1: 27 b = html.find('.jpg', a, a + 100) 28 29 if b != -1: 30 img_addrs.append(html[a+9:b+4]) 31 print('图片地址:'+html[a+9:b+4]) 32 else: 33 b = a + 9 34 a = html.find('img src=', b) 35 return img_addrs 36 def save_imgs(folder, img_addrs): 37 for each in img_addrs: 38 filename = each.split('/')[-1] 39 with open(filename, 'wb') as f: 40 img = url_open("http:"+each) 41 f.write(img) 42 43 def download_mm(folder = 'Xman', pages = 1): 44 os.mkdir(folder) 45 os.chdir(folder) 46 url = "http://jandan.net/ooxx/" 47 page_num = int(get_page(url)) 48 for i in range(pages): 49 page_num -= i 50 page_url = url + 'page-' + str(page_num) + '#comments' 51 img_addrs = find_imgs(page_url) 52 save_imgs(folder, img_addrs) 53 if __name__ == '__main__': 54 download_mm()