Python实现抓取 http://www.cssmoban.com/cssthemes 网站的模版并下载
实现代码
- # -*- coding: utf-8 -*-
- import urlparse
- import urllib2
- import re
- import os
- import os.path
# Listing page the crawl starts from.
URL = 'http://www.cssmoban.com/cssthemes'

# Default timeout (in seconds) applied to sockets created from here on,
# set through the socket module that urllib2 exposes.
urllib2.socket.setdefaulttimeout(500)
def getUrlContent(url):
    """Fetch ``url`` and return the response body.

    Args:
        url: Address to request; handed unchanged to ``urllib2.urlopen``.

    Returns:
        The body exactly as returned by the response's ``read()``.

    Raises:
        Nothing is caught here: any exception raised by ``urlopen`` or
        ``read`` propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, including when read() fails.
        response.close()
def getAllUrl(html):
    """Return the anchors in ``html`` that link to a template detail page.

    An anchor is matched when it has the form
    ``<a href="/cssthemes/<digits>.shtml">...</a>`` (whitespace between
    ``<a`` and ``href``, and ``href`` as the only attribute).

    Args:
        html: Markup of a listing page.

    Returns:
        A list with the full markup of every matching anchor, in document
        order; an empty list when there is none.
    """
    # ``\d+`` (not a literal run of "d") so numeric page ids match, and
    # ``\.`` so only a real dot is accepted before the extension.
    return re.findall(r'<a[\s]+href="/cssthemes/\d+\.shtml">.*?</a>', html)
def getDownTitle(html):
    """Collect the text found between ``<h1>`` and ``</h1>`` in ``html``.

    Args:
        html: Markup of a template detail page.

    Returns:
        A list with the inner text of each ``<h1>...</h1>`` pair, in
        document order; an empty list when there is none. The pattern is
        not compiled with DOTALL, so a heading that spans several lines is
        not matched.
    """
    heading = re.compile('<h1>(.*?)</h1>')
    return heading.findall(html)
def getDownUrl(html):
    """Return the download-button anchors found in ``html``.

    A download button is an ``<a>`` element whose opening tag contains
    ``class="button btn-down"``.

    Args:
        html: Markup of a template detail page.

    Returns:
        A list with the full markup (opening tag through ``</a>``) of each
        download button, in document order; an empty list when there is
        none.
    """
    # ``[^>]*`` keeps the attribute search inside one opening tag. With
    # ``.*?`` the match could begin at an earlier ``<a`` on the same line,
    # and callers that read the first ``href`` would get the wrong link.
    return re.findall(r'<a[^>]*class="button btn-down"[^>]*>.*?</a>', html)
def getNextUrl(html):
    """Return the pagination anchors labelled "下一页" (next page).

    Args:
        html: Markup of a listing page.

    Returns:
        A list with the full markup of every ``<a ...>下一页</a>`` element,
        in document order; an empty list when the page has no such link.
    """
    # ``[^>]*`` confines the match to the single anchor whose text is the
    # label. With ``.*?`` a "previous page" link on the same line would be
    # swallowed into the match and its href would be read instead.
    return re.findall('<a[^>]*>下一页</a>', html)
def download(title, url):
    """Download ``url`` and store it as ``template/<title>.<extension>``.

    The extension is whatever follows the last ``.`` in ``url``. The
    ``template/`` directory is created when it does not exist yet.

    Args:
        title: Base name for the saved file; it is decoded from UTF-8
            before being used in the path.
        url: Address of the file to download.

    Raises:
        Nothing is caught here: network, decoding and file-system errors
        propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    if not os.path.exists("template/"):
        os.makedirs("template/")

    newname = "template/" + title.decode('utf-8')
    newname = newname + '.' + url[url.rfind('.') + 1:]

    # The context manager closes (and flushes) the file on every path.
    with open(newname, "wb") as target:
        target.write(result)
def i(msg):
    """Append ``msg`` to ``info.log`` and echo it on standard output.

    Args:
        msg: Text to record.
    """
    # NOTE(review): entries are separated by a single space, not a newline;
    # confirm that is the intended log format.
    with open('info.log', 'a') as fileobj:
        # The context manager closes the file even if the write fails.
        fileobj.write(msg + ' ')
    # Single-argument call form prints the same text under Python 2's print
    # statement and as a function call.
    print(msg)
def e(msg):
    """Append ``msg`` to ``error.log`` and echo it on standard output.

    Args:
        msg: Text to record.
    """
    # NOTE(review): entries are separated by a single space, not a newline;
    # confirm that is the intended log format.
    with open('error.log', 'a') as fileobj:
        # The context manager closes the file even if the write fails.
        fileobj.write(msg + ' ')
    # Single-argument call form prints the same text under Python 2's print
    # statement and as a function call.
    print(msg)
if __name__ == '__main__':
    # Crawl the listing pages one after another: download every template
    # linked from the current page, then follow its "下一页" (next page)
    # link; stop when a page has no such link.
    pageUrl = URL
    html = getUrlContent(pageUrl)
    i('开始下载:%s' %(URL))
    while True:
        # Handle the current page BEFORE looking for the next one, so the
        # last page (which has no next link) is still downloaded.
        for a in getAllUrl(html):
            downGotoUrl = ''
            try:
                # The matched href is a site-absolute path
                # (/cssthemes/<id>.shtml), so it is appended to the host.
                downGotoUrl = 'http://www.cssmoban.com' + a[a.index('href="')+6:a.index('">')]
                downHtml = getUrlContent(downGotoUrl)

                downTitleList = getDownTitle(downHtml)
                downTitle = downTitleList[0] if downTitleList else ''

                downUrlList = getDownUrl(downHtml)
                downUrl = ''
                if downUrlList:
                    downUrl = downUrlList[0]
                    downUrl = downUrl[downUrl.index('href="')+6:downUrl.index('" target')]

                i('开始下载:%s,文件名:%s' %(downUrl,downTitle))
                download(downTitle,downUrl)
                i('%s下载完成,保存文件名:%s' %(downUrl,downTitle))
            except Exception as err:
                # Bind the exception to ``err``: naming it ``e`` would shadow
                # the error logger and make the calls below fail.
                e('地址:%s下载失败,失败信息:' %(downGotoUrl))
                e(str(err))
            # NOTE(review): the source's indentation was lost; the separator
            # is assumed to be written once per template.
            i('-----------------------------------------')

        nextPage = getNextUrl(html)
        if not nextPage:
            # Report the page that has no next link (the original formatted
            # the empty result list here).
            e('地址:%s,未找到下一页,程序退出' %(pageUrl))
            break
        nextUrl = nextPage[0]
        nextUrl = URL+'/'+nextUrl[nextUrl.index('href="')+6:nextUrl.index('" target')]
        i('执行下一页:%s' %(nextUrl))
        pageUrl = nextUrl
        html = getUrlContent(nextUrl)
原文地址:https://blog.csdn.net/wiker_yong/article/details/25844349