# 主要是想把某一个网页中的所有链接都保存下来,便于离线查看。使用python语言为了练手,看了挺多教程,却很少实战。
# (Save every page linked from one web page locally, for offline viewing.
# Written in Python as a hands-on exercise after reading many tutorials.)
import urllib2
import re
import os
import time
master_url = 'http://www.catb.org/esr/faqs/hacker-howto.html'
master_page_name = 'How to become a hacker.html'
start = time.time()
master_page = urllib2.urlopen(master_url).read()
urls_patt = re.compile(r'href="(https?://[^#]+?)"') # re for hrefs
urls = re.findall(urls_patt,master_page)
dir_pages = './Pages'
if not os.path.exists(dir_pages):
os.mkdir(dir_pages)
for i in range(len(urls)):
path_sub = os.path.join(dir_pages,str(i)+'.html')
try:
content_sub = urllib2.urlopen(urls[i]).read()
except:
print "Error:Can't fetch",path_sub
try:
fout = open(path_sub,'w')
fout.write(content_sub)
fout.close()
print 'Done:',path_sub
except:
print 'Error:write',urls[i]
master_page = master_page.replace(urls[i].strip(),path_sub)
try:
fout = open(master_page_name,'w')
fout.write(master_page)
fout.close()
except:
print 'Error:write',master_page_name
time_cost = time.time() - start
print 'Time cost:',time_cost,'s'