# -*- coding:utf-8 -*-
# *************************************
# Program: first example for learning the spider (crawler) protocol
# Version: 1.0
# Author: Silence
# Language: Python 2.7 (the import fallback below also lets it run on Python 3)
# Date: 2014-03-15
# Operation: download every page of a given Baidu Tieba forum and
#            store each page as an html file
# *************************************

import re
import string  # kept from the original import list; no longer used below

try:  # Python 2
    import urllib2
    from urllib2 import HTTPError, URLError
    _urlopen = urllib2.urlopen
    _read_line = raw_input
except ImportError:  # Python 3
    from urllib.request import urlopen as _urlopen
    from urllib.error import HTTPError, URLError
    _read_line = input

# An address that already ends in "pn=" (with something in front of it).
_PN_SUFFIX = re.compile(r'.+pn=$')


def ensure_pn_suffix(url):
    """Return ``url`` ending in ``pn=`` so a page number can be appended.

    An address that already ends in ``pn=`` is returned unchanged.
    Otherwise ``pn=`` is added with the separator the address needs:
    ``?`` when it has no query string yet, ``&`` when it already has one,
    and nothing extra when it already ends in ``?`` or ``&``.
    """
    if _PN_SUFFIX.match(url) is not None:
        return url
    if url.endswith(('?', '&')):
        return url + 'pn='
    separator = '&' if '?' in url else '?'
    return url + separator + 'pn='


def baidu_tieba(url, begin_page, end_page):
    """Download pages ``begin_page``..``end_page`` (inclusive) to html files.

    Page ``i`` is fetched from ``url + str(i)`` and written, as raw bytes,
    to a file in the current working directory whose name is the page
    number zero-padded to five digits plus ``.html`` (e.g. ``00001.html``).

    A page that cannot be fetched (``URLError``, which includes
    ``HTTPError``) or cannot be stored (``IOError``/``OSError``) is
    reported on stdout and skipped; the remaining pages are still
    processed. Other exceptions propagate to the caller.
    """
    for page in range(begin_page, end_page + 1):
        file_name = str(page).zfill(5) + '.html'
        print(' '.join(['正在下载第', str(page),
                        '个网页,并将其存储为', file_name, '.....']))

        try:
            response = _urlopen(url + str(page))
            try:
                content = response.read()
            finally:
                response.close()
        except URLError as err:
            print('亲,你给的地址出问题了。')
            reason = getattr(err, 'reason', '')
            # Only HTTPError carries a status code; a plain URLError
            # (DNS failure, refused connection, ...) has just a reason.
            code = getattr(err, 'code', None)
            if code is not None:
                print('Code: %s ;Reason %s' % (code, reason))
            else:
                print('Reason %s' % (reason,))
            # Nothing was downloaded for this page, so there is nothing
            # to write; move on instead of saving stale or missing data.
            continue

        try:
            # Binary mode: the response body is raw bytes.
            with open(file_name, 'wb') as out:
                out.write(content)
        except (IOError, OSError):
            print(' '.join(['存储网页', file_name, '出错!']))


if __name__ == '__main__':
    entered_url = _read_line('请输入贴吧的地址,去掉pn=后面的数字: ')
    # Tieba now only shows "pn=" after logging in and clicking a page
    # number, so check for it here and complete it ourselves if missing.
    bdurl = ensure_pn_suffix(entered_url)
    if bdurl != entered_url:
        print(bdurl)
    begin_page = int(_read_line('请输入开始的页数: '))
    end_page = int(_read_line('请输入终点的页数: '))

    baidu_tieba(bdurl, begin_page, end_page)