• [网络]_[批量下载网站文件]



    场景:

    1.有时候需要下载某个网站上提供的所有pdf文件,貌似没发现有下载工具提供。


    #! encoding=utf-8
    
    import urllib2
    import re
    import os
    
    def Download(url,output):
        """Fetch `url` and save the response body to the file `output`.

        url    -- address handed to urllib2.urlopen
        output -- path of the file to write; opened in "wb" mode, so an
                  existing file is overwritten

        Errors raised by urllib2.urlopen or by opening/writing the file are
        not caught here and propagate to the caller.
        """
        print("downloading..."+url)
        response = urllib2.urlopen(url)
        try:
            with open(output,"wb") as resourceFile:
                # Copy in chunks so a large file is never held in memory whole.
                while True:
                    chunk = response.read(64*1024)
                    if not chunk:
                        break
                    resourceFile.write(chunk)
        finally:
            # Close explicitly so the connection is released even when the
            # read or the file write fails.
            response.close()
        print("downloaded")
    
    def Action(url,ext = "pdf",output = "."):
        """Download every file referenced on a page whose link ends in `.ext`.

        The page at `url` is fetched and scanned for double-quoted strings
        that end with "." + ext. Each match is treated as a link and passed to
        Download().

        url    -- address of the page to scan
        ext    -- file extension to look for, with or without a leading dot
        output -- directory that receives the files

        Links without a host part are resolved against `url` and saved under
        `output` using the link's own directory layout. Links that carry a
        host (http://, https://, //host/...) are saved directly in `output`
        under their last path segment.

        Errors from fetching the page, creating directories or downloading a
        file are not caught and propagate to the caller.
        """
        # Function-scope import: urlparse is only needed by this function.
        import urlparse

        print(url)

        #1.content
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()

        #2.resource
        # Escape the extension and require the dot, so ext="pdf" matches
        # "a.pdf" but not "xyzpdf", and regex metacharacters in ext are literal.
        suffix = re.escape("." + ext.lstrip("."))
        links = re.findall('"([^"]+' + suffix + ')"', content)
        print("file num: "+str(len(links)))

        #3.download
        for link in links:
            # urljoin handles relative, root-relative and absolute links alike.
            fileUrl = urlparse.urljoin(url, link)

            if urlparse.urlparse(link).netloc:
                # Link names its own host: keep only the file name locally.
                parts = [link[link.rfind("/")+1:]]
            else:
                # Mirror the link's directories under output. Empty, "." and
                # ".." segments are dropped so the file stays inside output.
                parts = [p for p in link.split("/") if p not in ("", ".", "..")]
            if not parts:
                # Nothing usable as a file name (only possible for odd ext values).
                continue

            fileOutput = os.path.join(output, *parts)
            targetDir = os.path.dirname(fileOutput)
            if targetDir and not os.path.isdir(targetDir):
                # Let a real failure (permissions, bad path) surface here
                # instead of being swallowed and failing later in open().
                os.makedirs(targetDir)

            print(fileUrl)
            print(fileOutput)
            Download(fileUrl,fileOutput)
    
    if __name__ == "__main__":
        # Script entry point: announce the run, then fetch every jpg linked
        # from the page below into the current directory.
        print("download")
        # Example page with pdf files; assigned but not used by the call below.
        url = "http://compgeom.cs.uiuc.edu/~jeffe/teaching/algorithms/"
        Action("http://tech.qq.com/", "jpg")
        
        
        
    
        
        
        



  • 相关阅读:
    input只允许输入正整数
    CSS如何作小于1PX的边
    时间戳的处理
    图片转base64上传,视频同理。
    APIcloud微信支付和支付宝支付(方案2,主要在后台进行)
    H5滑条(input type=range)
    checkbox/radio 样式修改
    APIcloud制作APP 微信支付与支付宝支付
    JS获取鼠标左(右)滑事件
    DOM(Document Object Model) 文档对象模型: 元素节点 文本节点 属性节点
  • 原文地址:https://www.cnblogs.com/jiangu66/p/3194162.html
Copyright © 2020-2023  润新知