• python3爬虫.4.下载煎蛋网妹子图


    开始我学习爬虫的目标 ----> 煎蛋网

    通过设置User-Agent获取网页,发现本该是图片链接的地方被一个js函数代替了

    于是全局搜索到该函数

    function jandan_load_img(b){
        var d = $(b);
        var f = d.next("span.img-hash");
        var e = f.text();
        f.remove();
        var c = jdPwA5ybKhQWGy2rZybAF2StIBxrQ6NvYC(e, "pAqWmGD1GsmY5kVokg1a2eyQ3Shj1Usq");
        var a = $('<a href = "'+c.replace(/(//w+.sinaimg.cn/)(w+)(/.+.(gif|jpg|jpeg))/, "$1large$3")+'" target = "_blank" class = "view_img_link">[查看原图]</a>');
        d.before(a);
        d.before("<br>");
        d.removeAttr("onload");
        d.attr("src", location.protocol+c.replace(/(//w+.sinaimg.cn/)(w+)(/.+.gif)/, "$1thumb180$3"));
        if(/.gif$/.test(c)){
            d.attr("org_src", location.protocol+c);
            b.onload = function(){
                add_img_loading_mask(this, load_sina_gif)
            }
        }
    }
    View Code

    该函数提取span.img-hash传到另一个函数中,继续查找,有两个定义,于是选择靠后的那个

     1 var jdTzcXZnL0V2WZZ8eq9786xeOdkyoBXlDR=function(m,r,d){
     2     var e="DECODE";
     3     var r=r?r:"";
     4     var d=d?d:0;
     5     var q=4;
     6     r=md5(r);
     7     var o=md5(r.substr(0,16));
     8     var n=md5(r.substr(16,16));
     9     if(q){if(e=="DECODE"){var l=m.substr(0,q)}}
    10     else{var l=""}
    11     var c=o+md5(o+l);
    12     var k;
    13     if(e=="DECODE"){m=m.substr(q);
    14     k=base64_decode(m)}
    15     var h=new Array(256);
    16     for(var g=0;g<256;g++){h[g]=g}
    17     var b=new Array();
    18     for(var g=0;g<256;g++){b[g]=c.charCodeAt(g%c.length)}
    19     for(var f=g=0;g<256;g++){f=(f+h[g]+b[g])%256;
    20     tmp=h[g];
    21     h[g]=h[f];
    22     h[f]=tmp}
    23     var t="";
    24     k=k.split("");
    25     for(var p=f=g=0;
    26     g<k.length;
    27     g++){p=(p+1)%256;
    28     f=(f+h[p])%256;
    29     tmp=h[p];
    30     h[p]=h[f];
    31     h[f]=tmp;
    32     t+=chr(ord(k[g])^(h[(h[p]+h[f])%256]))}
    33     if(e=="DECODE"){if((t.substr(0,10)==0||t.substr(0,10)-time()>0)&&t.substr(10,16)==md5(t.substr(26)+n).substr(0,16)){t=t.substr(26)}
    34     else{t=""}
    35     }
    36     return t
    37 };
    View Code

    参考文章:  http://www.tendcode.com/article/jiandan-meizi-spider/

    其中有对js的函数的改写

    最后代码如下

      1 # -*- coding: utf-8 -*-
      2 '''
      3 目标:煎蛋网妹子图
      4 2018/4/22
      5 环境:python3
      6 
      7 '''
      8 
      9 
     10 import urllib.request           #使用url处理包,urllib.request模块是用来打开和读取URLs的
     11 import re                       #使用正则表达式
     12 import hashlib                  #
     13 import base64                   #
     14 from bs4 import BeautifulSoup   #
     15 import time                     #time
     16 import logging                  #log
     17 import sys                      #
     18 
     19 '''
     20 下载单张图片到指定的文件夹下
     21 '''
     22 def load_img(imgurl, file):
     23     name = imgurl.split('/')[-1]
     24     item = urllib.request.urlretrieve('http:'+imgurl,
     25         #'C:\Users\74172\Pictures\jandan2\%s'%(name))
     26         file+'\%s'%(name))
     27     print(name+' is loaded')
     28 
     29 '''
     30 md5加密
     31 '''
     32 def _md5(value):
     33     m = hashlib.md5()
     34     m.update(value.encode('utf-8'))
     35     return m.hexdigest()
     36 
     37 '''
     38 base64解码
     39 注意 原字符串长度报错问题
     40 '''
     41 def _base64_decode(data):
     42     missing_padding = 4 - len(data) % 4
     43     if missing_padding:
     44         data += '=' * missing_padding
     45     return base64.b64decode(data)
     46 
     47 '''
     48 解密获取图片链接
     49 '''
     50 def get_imgurl(m, r='', d=0):
     51     e = "DECODE"
     52     q = 4
     53     r = _md5(r)
     54     o = _md5(r[0:0 + 16])
     55     n = _md5(r[16:16 + 16])
     56     l = m[0:q]
     57     c = o + _md5(o + l)
     58     m = m[q:]
     59     k = _base64_decode(m)
     60     h = list(range(256))
     61     b = [ord(c[g % len(c)]) for g in range(256)]
     62 
     63     f = 0
     64     for g in range(0, 256):
     65         f = (f + h[g] + b[g]) % 256
     66         tmp = h[g]
     67         h[g] = h[f]
     68         h[f] = tmp
     69 
     70     t = ""
     71     p, f = 0, 0
     72     for g in range(0, len(k)):
     73         p = (p + 1) % 256
     74         f = (f + h[p]) % 256
     75         tmp = h[p]
     76         h[p] = h[f]
     77         h[f] = tmp
     78         t += chr(k[g] ^ (h[(h[p] + h[f]) % 256]))
     79     t = t[26:]
     80     return t
     81 
     82 '''
     83 获取关键字符串
     84 '''
     85 def get_r(js_url):
     86     js_respon = urllib.request.urlopen(js_url)
     87     js = js_respon.read().decode('utf-8')
     88     _r = re.findall('c=[wd]+(e,"(.*?)")', js)
     89     return _r
     90 
     91 '''
     92 获取一个页面的所有图片的链接
     93 '''
     94 def get_urls(url,pages,file):
     95     page = 0
     96     imagNum = 0
     97     headers = {
     98         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
     99         'Host': 'jandan.net'
    100     }
    101     #########################################
    102     while page < pages:
    103         req = urllib.request.Request(url, headers=headers)
    104         respon = urllib.request.urlopen(req)
    105         html = respon.read().decode('utf-8')
    106         ##########################################
    107         js_url = 'http:' + re.findall('<script src="(//cdn.jandan.net/static/min/[wd]+.d+.js)"></script>', html)[-1]
    108         _r = get_r(js_url)[0]
    109         soup = BeautifulSoup(html, 'lxml')
    110         tags = soup.select('.img-hash')
    111         for tag in tags:
    112             img_hash = tag.text
    113             img_url = get_imgurl(img_hash,_r)        
    114             print(imagNum,'------>',img_url)
    115             imagNum = imagNum+1
    116             load_img(img_url,file)
    117         ############################################
    118         nextUrl = re.findall(r'Older Comments" href=".+?.#comments"',html)[0]
    119         print('page#',90-page,'---->done!')
    120         url = 'http:' + nextUrl[22:-1]
    121         page += 1
    122         time.sleep(10)
    123     print('done all!')
    124     print('located---->',file)
    125     
    126 if __name__ == '__main__':
    127     url = 'http://jandan.net/ooxx/'
    128     pages = 1      
    129     file = 'C:\jandan_meizi'
    130     get_urls(url,pages,file)

     //************2018.05.03*******************************

    刚下班回来爬图,发现中间有个报错,

    是get_imgurl一次返回了多张图片链接,于是修改了该段代码

    1     for tag in tags:
    2             img_hash = tag.text
    3             img_urls = get_imgurl(img_hash,_r)        
    4             img_urls = re.findall(r'//wx.+?.jpg',img_urls)
    5             for img_url in img_urls:
    6                 print(imagNum,'------>',img_url)
    7                 imagNum = imagNum+1
    8                 load_img(img_url,file)    
    View Code

     //************2018.5.23***********************************

    又报错了,暂时没有解决。

  • 相关阅读:
    正则表达式收藏
    c#mvc实现登录
    jQuery获取URL中的参数
    TortoiseSVN新人使用指南
    CheckBox获取一组及全选
    day45
    day44
    day43
    day43
    day42
  • 原文地址:https://www.cnblogs.com/protogenoi/p/8881182.html
Copyright © 2020-2023  润新知