#!/usr/bin/python
"""Recursively mirror an autoindex-enabled web site.

The target site serves directory listings (autoindex), so each page
lists its contents as <A HREF="..."> anchors.  Hrefs ending in '/' are
subdirectories; everything else is a file.  Files are downloaded with
urllib.request.urlretrieve into a directory tree mirrored under the
current working directory; subdirectories are created locally and
crawled recursively.
"""
import os
import re
import urllib.request

BASE = 'http://www.xxxxxx.net'

dirs = ['js', 'img', 'pay', 'css']
urls = [BASE + '/' + d for d in dirs]

# Compiled once and hoisted out of the crawl loop.  Non-greedy capture so
# a single line containing several anchors yields one match per anchor
# (the original greedy ".*" would have merged them).
_HREF_RE = re.compile(r'<A HREF="(.*?)">', re.IGNORECASE)


def parse_links(html):
    """Split the hrefs found in *html* into (files, subdirs).

    A href ending in '/' is treated as a directory, anything else as a
    file.  Autoindex pages list the parent directory as the first
    directory link, so the first subdir entry is dropped (when present)
    to avoid crawling back up the tree.

    Returns a tuple of two lists of href strings: (files, subdirs).
    """
    files = []
    subdirs = []
    for href in _HREF_RE.findall(html):
        if href.endswith('/'):
            subdirs.append(href)
        else:
            files.append(href)
    # Drop the parent-directory link; guard against an empty listing
    # (the original unconditional remove() crashed on empty pages).
    if subdirs:
        del subdirs[0]
    return files, subdirs


def parse(baseurl):
    """Crawl one autoindex page at *baseurl*.

    Downloads every file linked from the page into the mirrored path
    under the current working directory, then recurses into each
    subdirectory, creating the local directory first.
    """
    with urllib.request.urlopen(baseurl) as handle:
        html = handle.read().decode('utf-8', errors='replace')
    files, subdirs = parse_links(html)
    cwd = os.getcwd()

    for href in files:
        fileurl = BASE + href
        # Autoindex hrefs are absolute paths ('/js/a.js'), so plain
        # concatenation onto cwd yields the mirrored local path.
        target = cwd + href
        print(target)
        urllib.request.urlretrieve(fileurl, target)

    for href in subdirs:
        target = cwd + href
        try:
            os.mkdir(target)
        except OSError:
            print("dir exist!!")
        dirurl = BASE + href
        print(dirurl)
        parse(dirurl)


if __name__ == "__main__":
    for url in urls:
        parse(url)
知识点:
1.这个站点启用了autoindex，进入目录后会自动列出其中的内容；将列出的链接爬取出来，并分类为文件和目录两类：
对于文件,直接抓取。
对于目录,得到路径后对其调用函数递归抓取。
2.下载文件,可以使用urllib模块的urlretrieve
3.还可以使用urlopen->read->write to file