• Python Study Notes (12) -- Crawler That Downloads Images from jandan.net


    Notes:

    1. I actually wrote this before the comic downloader; it is a bit simpler. I forgot to back it up to the blog at the time.

    2. Nothing more to add here; the wrap-up goes in the comic post.

    import urllib.request
    import re
    import os

    # Sample page URL: http://jandan.net/ooxx/page-2381#comments
    # Page marker in the HTML: <span class="current-comment-page">[2381]</span>
    # Image markup looks like:
    # <img src="//wx4.sinaimg.cn/orj360/797ccd21gy1fdcjecuo1jj20qo0usacj.jpg" style="max-width: 480px; max-height: 750px; background-color: rgb(246, 161, 181);">
    # <a href="//ww1.sinaimg.cn/large/6715afcfgw1ef4zrjdaswj20js0qotag.jpg" target="_blank" class="view_img_link">[查看原图]</a>
    url = "http://jandan.net/ooxx/page-2381#comments"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}

    # Open the main URL and read the whole page as one HTML string
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    # print(html[0:1000])

    # Parse the HTML for the current page number; the digits start
    # 23 characters after the "current-comment-page" marker
    page = html.find("current-comment-page")
    page = html[page+23:page+27]
    # print(page)

    # Fetch the 10 pages before the current one and concatenate their HTML
    htmlPages = ""
    for i in range(int(page)-10, int(page)):
        urlPage = "http://jandan.net/ooxx/page-" + str(i) + "#comments"
        reqPage = urllib.request.Request(url=urlPage, headers=headers)
        responsePage = urllib.request.urlopen(reqPage)
        htmlPages += responsePage.read().decode("utf-8")

    # Match the full-size image addresses (dots escaped so "." is literal)
    regImg = r"//[0-9a-z]+\.sinaimg\.cn/large/[0-9a-z]+\.jpg"
    imgUrl = re.findall(regImg, htmlPages)
    # print(imgUrl)
    imgNum = len(imgUrl)
    # print(imgNum)

    # Create a folder and switch into it
    os.mkdir("test")
    os.chdir("test")

    # Open each image address and save the image locally; the matched
    # addresses are protocol-relative, so prepend "http:"
    for i in range(imgNum):
        req = urllib.request.Request(url="http:"+imgUrl[i], headers=headers)
        responseImg = urllib.request.urlopen(req)
        img = open(str(i)+".jpg", "wb")
        img.write(responseImg.read())
        img.close()  # note the (): img.close without parentheses does nothing
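
    The fixed-offset slice above (html[page+23:page+27]) only works while the page number is exactly four digits. A slightly more robust sketch, assuming jandan.net keeps the same current-comment-page markup, pulls the number out with a regular expression instead:

    import re

    # Extract the page number from <span class="current-comment-page">[2381]</span>,
    # however many digits it has
    match = re.search(r'current-comment-page">\[(\d+)\]', html)
    if match:
        page = int(match.group(1))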

     小甲鱼's source code (copied from the forum; it actually does run once you prepend http: to each image address):

    import urllib.request
    import os
    import random

    # jandan.net has started blocking naive crawlers, so this program may fail as-is

    def url_open(url):
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36')
        # Optional: route requests through a random proxy
        # iplist = ['111.197.141.57:9797','116.228.236.219:8080','120.26.51.101:8118','113.222.80.216:3128','117.90.1.88:9000']
        # proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
        # opener = urllib.request.build_opener(proxy_support)
        # urllib.request.install_opener(opener)
        response = urllib.request.urlopen(req)  # pass req, not url, so the User-Agent header is actually sent
        html = response.read()
        return html

    def get_page(url):
        # Pull the current page number out of
        # <span class="current-comment-page">[NNNN]</span>
        html = url_open(url).decode('utf-8')
        a = html.find('current-comment-page') + 23
        b = html.find(']', a)
        return html[a:b]

    def find_imgs(url):
        # Walk the HTML collecting every img src="..." address ending in .jpg
        html = url_open(url).decode('utf-8')
        img_addrs = []
        a = html.find('img src=')
        while a != -1:
            b = html.find('.jpg', a, a + 100)
            if b != -1:
                img_addrs.append(html[a+9:b+4])
                print('Image address: ' + html[a+9:b+4])
            else:
                b = a + 9
            a = html.find('img src=', b)
        return img_addrs

    def save_imgs(folder, img_addrs):
        for each in img_addrs:
            filename = each.split('/')[-1]
            with open(filename, 'wb') as f:
                # the scraped addresses are protocol-relative, so prepend "http:"
                img = url_open("http:" + each)
                f.write(img)

    def download_mm(folder='Xman', pages=1):
        os.mkdir(folder)
        os.chdir(folder)
        url = "http://jandan.net/ooxx/"
        page_num = int(get_page(url))
        for i in range(pages):
            # step back one page per iteration (the original page_num -= i
            # skipped pages whenever pages > 1)
            page_url = url + 'page-' + str(page_num - i) + '#comments'
            img_addrs = find_imgs(page_url)
            save_imgs(folder, img_addrs)

    if __name__ == '__main__':
        download_mm()
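
    As noted above, the one real fix this code needs is turning the protocol-relative //...sinaimg.cn/... addresses into absolute URLs. Prepending "http:" works; a minimal standard-library sketch that resolves them against the page URL instead, so the scheme is not hard-coded, would be:

    from urllib.parse import urljoin

    # A protocol-relative address inherits the scheme of the page it came from;
    # urljoin resolves it against the page URL
    page_url = "http://jandan.net/ooxx/"
    addr = "//ww1.sinaimg.cn/large/6715afcfgw1ef4zrjdaswj20js0qotag.jpg"
    print(urljoin(page_url, addr))  # http://ww1.sinaimg.cn/large/6715afcfgw1ef4zrjdaswj20js0qotag.jpg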