无比强大！Python抓取cssmoban网站的模版并下载

Python实现抓取http://www.cssmoban.com/cssthemes网站的模版并下载

实现代码

# -*- coding: utf-8 -*-
import urlparse
import urllib2
import re
import os
import os.path
# Listing page the crawl starts from; also used as the prefix when building
# the URL of each following listing page.
URL='http://www.cssmoban.com/cssthemes'
# Global timeout setting: default socket timeout, in seconds, for sockets
# created after this call (this includes the urllib2 requests made below).
urllib2.socket.setdefaulttimeout(500)
# Fetch the content behind a URL
def getUrlContent(url):
    """Return the raw body of the HTTP response for ``url``.

    The body is returned exactly as read from the response; no decoding is
    performed. Any exception raised by ``urllib2.urlopen`` or ``read``
    propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails part-way.
        response.close()
# Collect the <a> tags of the form <a href="/cssthemes/<digits>.shtml">...</a>
def getAllUrl(html):
    """Return every template-detail anchor found in ``html``.

    Each returned item is the complete ``<a ...>...</a>`` text of a link whose
    href is ``/cssthemes/<digits>.shtml``. An empty list is returned when no
    such link exists.
    """
    # \d+ must be an escaped digit class and the dot before "shtml" a literal
    # dot; without the backslashes the pattern only matched runs of the
    # letter "d" and any character before "shtml".
    return re.findall(r'<a\s+href="/cssthemes/\d+\.shtml">.*?</a>', html)
# Extract the title of the file to download
def getDownTitle(html):
    """Return the inner text of every ``<h1>...</h1>`` element in ``html``.

    The result is a list (possibly empty); the caller uses its first item as
    the title.
    """
    titlePattern = re.compile('<h1>(.*?)</h1>')
    return titlePattern.findall(html)
# Find the anchor that carries the file download URL
def getDownUrl(html):
    """Return every download-button anchor found in ``html``.

    Each returned item is the complete ``<a ...>...</a>`` text of an anchor
    whose opening tag contains ``class="button btn-down"``. An empty list is
    returned when there is none.
    """
    # [^>]* keeps the class attribute inside a single opening tag. The looser
    # "<a.*?" could begin at an unrelated earlier anchor on the same line,
    # and the caller would then read that anchor's href.
    return re.findall(r'<a\s[^>]*?class="button btn-down"[^>]*>.*?</a>', html)
# Find the anchor that links to the next page
def getNextUrl(html):
    """Return every anchor in ``html`` whose text ends with "下一页" (next page).

    Each returned item is the complete ``<a ...>...</a>`` text of one anchor.
    An empty list is returned when the page has no next-page link.
    """
    # (?:(?!</a>).)*? stops the match from running across a closing </a>.
    # Without it, "<a.*?" started at the first anchor on the line and
    # swallowed every pagination link up to the "next page" one.
    return re.findall(r'<a\s(?:(?!</a>).)*?下一页</a>', html)
# Download a file
def download(title,url):
    """Fetch ``url`` and save it as ``template/<title>.<extension>``.

    ``title`` is a UTF-8 encoded byte string (it is decoded before being used
    as the file name). The extension is whatever follows the last ``.`` in
    ``url``. The ``template/`` directory is created when it does not exist.
    Network and file-system errors propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()
    if not os.path.exists("template/"):
        os.makedirs("template/")
    newname = "template/" + title.decode('utf-8')
    newname = newname + '.' + url[url.rfind('.')+1:]
    # Close the file deterministically so the data is flushed to disk before
    # the next download starts.
    with open(newname, "wb") as fileobj:
        fileobj.write(result)
# Write an informational log entry
def i(msg):
    """Append ``msg`` as one line to ``info.log`` and echo it to stdout."""
    with open('info.log','a') as fileobj:
        # One entry per line; a space separator ran all entries together.
        fileobj.write(msg+'\n')
    print(msg)
# Write an error log entry
def e(msg):
    """Append ``msg`` as one line to ``error.log`` and echo it to stdout."""
    with open('error.log','a') as fileobj:
        # One entry per line; a space separator ran all entries together.
        fileobj.write(msg+'\n')
    print(msg)
if __name__ == '__main__':
    # Walk the listing pages starting at URL. For every template linked from
    # a page: open its detail page, read the title and the download link,
    # and save the file. Continue with the "next page" link until a page has
    # none.
    pageUrl = URL
    html = getUrlContent(pageUrl)
    i('开始下载：%s' % (URL))
    while True:
        lista = getAllUrl(html)
        nextPage = getNextUrl(html)
        # Process this page's templates before checking for a next page, so
        # the templates on the final page are downloaded too.
        for a in lista:
            downGotoUrl = ''
            try:
                # The href is site-absolute ("/cssthemes/NNN.shtml"), so it
                # is appended to the host name only.
                downGotoUrl = 'http://www.cssmoban.com' + a[a.index('href="')+6:a.index('">')]
                downHtml = getUrlContent(downGotoUrl)
                downTitleList = getDownTitle(downHtml)
                downTitle = downTitleList[0] if downTitleList else ''
                downUrlList = getDownUrl(downHtml)
                downUrl = downUrlList[0] if downUrlList else ''
                # Raises ValueError when no download link was found; the
                # handler below logs it and moves on to the next template.
                downUrl = downUrl[downUrl.index('href="')+6:downUrl.index('" target')]
                i('开始下载：%s,文件名：%s' % (downUrl, downTitle))
                download(downTitle, downUrl)
                i('%s下载完成，保存文件名：%s' % (downUrl, downTitle))
            except Exception as err:
                # The exception must not be bound to the name "e": that would
                # replace the error-logging function e() used right here.
                e('地址：%s下载失败，失败信息：' % (downGotoUrl))
                e(str(err))
        if len(nextPage) <= 0:
            e('地址：%s，未找到下一页，程序退出' % (pageUrl))
            break
        nextUrl = nextPage[0]
        # The next-page href is appended to the listing URL as-is.
        nextUrl = URL + '/' + nextUrl[nextUrl.index('href="')+6:nextUrl.index('" target')]
        i('-----------------------------------------')
        i('执行下一页：%s' % (nextUrl))
        pageUrl = nextUrl
        html = getUrlContent(nextUrl)

# -*- coding: utf-8 -*-
import urlparse
import urllib2
import re
import os  
import os.path
# Listing page the crawl starts from.
URL='http://www.cssmoban.com/cssthemes'
# Global timeout setting: default socket timeout, in seconds.
urllib2.socket.setdefaulttimeout(500)
# Fetch the content behind a URL
def getUrlContent(url):
    """Return the raw body of the HTTP response for ``url``.

    The body is returned exactly as read from the response; no decoding is
    performed. Any exception raised by ``urllib2.urlopen`` or ``read``
    propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails part-way.
        response.close()
# Collect the <a> tags of the form <a href="/cssthemes/<digits>.shtml">...</a>
def getAllUrl(html):
    """Return every template-detail anchor found in ``html``.

    Each returned item is the complete ``<a ...>...</a>`` text of a link whose
    href is ``/cssthemes/<digits>.shtml``. An empty list is returned when no
    such link exists.
    """
    # \s and \d need their backslashes and the dot before "shtml" must be
    # literal; "[s]+" and "d+" only matched the letters "s" and "d".
    return re.findall(r'<a\s+href="/cssthemes/\d+\.shtml">.*?</a>', html)
# Extract the title of the file to download
def getDownTitle(html):
    """Return the inner text of every ``<h1>...</h1>`` element in ``html``.

    The result is a list (possibly empty); the caller uses its first item as
    the title.
    """
    # The pattern must contain a literal "<"; an HTML-escaped "&lt;" never
    # matches real markup.
    return re.findall('<h1>(.*?)</h1>', html)
# Find the anchor that carries the file download URL
def getDownUrl(html):
    """Return every download-button anchor found in ``html``.

    Each returned item is the complete ``<a ...>...</a>`` text of an anchor
    whose opening tag contains ``class="button btn-down"``. An empty list is
    returned when there is none.
    """
    # [^>]* keeps the class attribute inside a single opening tag, so the
    # match cannot begin at an unrelated earlier anchor. ".?" allowed at most
    # one character there and never matched a real tag.
    return re.findall(r'<a\s[^>]*?class="button btn-down"[^>]*>.*?</a>', html)
# Find the anchor that links to the next page
def getNextUrl(html):
    """Return every anchor in ``html`` whose text ends with "下一页" (next page).

    Each returned item is the complete ``<a ...>...</a>`` text of one anchor.
    An empty list is returned when the page has no next-page link.
    """
    # (?:(?!</a>).)*? stops the match from running across a closing </a>.
    # Without it, "<a.*?" started at the first anchor on the line and
    # swallowed every pagination link up to the "next page" one.
    return re.findall(r'<a\s(?:(?!</a>).)*?下一页</a>', html)
# Download a file
def download(title,url):
    """Fetch ``url`` and save it as ``template/<title>.<extension>``.

    ``title`` is a UTF-8 encoded byte string (it is decoded before being used
    as the file name). The extension is whatever follows the last ``.`` in
    ``url``. The ``template/`` directory is created when it does not exist.
    Network and file-system errors propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()
    if not os.path.exists("template/"):
        os.makedirs("template/")
    newname = "template/" + title.decode('utf-8')
    newname = newname + '.' + url[url.rfind('.')+1:]
    # Close the file deterministically so the data is flushed to disk before
    # the next download starts.
    with open(newname, "wb") as fileobj:
        fileobj.write(result)
# Write an informational log entry
def i(msg):
    """Append ``msg`` as one line to ``info.log`` and echo it to stdout."""
    with open('info.log','a') as fileobj:
        # The line terminator is written as an escape sequence; a raw line
        # break inside a single-quoted literal is a syntax error.
        fileobj.write(msg+'\n')
    print(msg)
# Write an error log entry
def e(msg):
    """Append ``msg`` as one line to ``error.log`` and echo it to stdout."""
    with open('error.log','a') as fileobj:
        # The line terminator is written as an escape sequence; a raw line
        # break inside a single-quoted literal is a syntax error.
        fileobj.write(msg+'\n')
    print(msg)

if __name__ == '__main__':
    # Walk the listing pages starting at URL. For every template linked from
    # a page: open its detail page, read the title and the download link,
    # and save the file. Continue with the "next page" link until a page has
    # none.
    pageUrl = URL
    html = getUrlContent(pageUrl)
    i('开始下载：%s' % (URL))
    while True:
        lista = getAllUrl(html)
        nextPage = getNextUrl(html)
        # Process this page's templates before checking for a next page, so
        # the templates on the final page are downloaded too.
        for a in lista:
            downGotoUrl = ''
            try:
                # The href is site-absolute ("/cssthemes/NNN.shtml"), so it
                # is appended to the host name only.
                downGotoUrl = 'http://www.cssmoban.com' + a[a.index('href="')+6:a.index('">')]
                downHtml = getUrlContent(downGotoUrl)
                downTitleList = getDownTitle(downHtml)
                downTitle = downTitleList[0] if downTitleList else ''
                downUrlList = getDownUrl(downHtml)
                downUrl = downUrlList[0] if downUrlList else ''
                # Raises ValueError when no download link was found; the
                # handler below logs it and moves on to the next template.
                downUrl = downUrl[downUrl.index('href="')+6:downUrl.index('" target')]
                i('开始下载：%s,文件名：%s' % (downUrl, downTitle))
                download(downTitle, downUrl)
                i('%s下载完成，保存文件名：%s' % (downUrl, downTitle))
            except Exception as err:
                # The exception must not be bound to the name "e": that would
                # replace the error-logging function e() used right here.
                e('地址：%s下载失败，失败信息：' % (downGotoUrl))
                e(str(err))
        if len(nextPage) <= 0:
            e('地址：%s，未找到下一页，程序退出' % (pageUrl))
            break
        nextUrl = nextPage[0]
        # The next-page href is appended to the listing URL as-is.
        nextUrl = URL + '/' + nextUrl[nextUrl.index('href="')+6:nextUrl.index('" target')]
        i('-----------------------------------------')
        i('执行下一页：%s' % (nextUrl))
        pageUrl = nextUrl
        html = getUrlContent(nextUrl)

原文地址：https://blog.csdn.net/wiker_yong/article/details/25844349

相关阅读:
168. 吹气球
 395. 硬币排成线 II
436. 最大正方形
 362. 滑动窗口的最大值(单调队列)
python-网络安全编程第二天（文件操作）
重闯Sqli-labs关卡第一天(1-4关)
python-网络安全编程第一天（requests模块）
PHP代码审计入门(SQL注入漏洞挖掘基础)
PHP代码审计入门(敏感函数回溯参数过程)
PHP核心配置基础解读
原文地址：https://www.cnblogs.com/jpfss/p/9227475.html