The crawler's main job is to follow links down to the n-th level of pages and download those pages as well.

Main program (sketched in code right after this outline):
    start the crawler
    create a queue
    loop, breaking out when the queue is empty:
        pop a URL off the queue
        download the page and find the next layer of links
        add those links to the queue
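Here is a minimal sketch of that loop under one assumption: download_and_extract_links() is only a hypothetical stand-in for the download-and-parse step, which the full program below actually implements with urlretrieve and HTMLParser.

# Minimal sketch of the queue-driven crawl described above.
# download_and_extract_links() is a placeholder, not part of the real program.
def download_and_extract_links(url):
    return []   # placeholder: the real version saves the page and returns its anchors

def crawl(start_url, max_depth=2):
    queue = [(start_url, 0)]                  # queue of (url, depth) pairs
    seen = set()
    while queue:                              # break out when the queue is empty
        url, depth = queue.pop(0)             # a URL leaves the queue
        if url in seen or depth > max_depth:
            continue
        seen.add(url)
        for link in download_and_extract_links(url):      # download page, find links
            if link not in seen:
                queue.append((link, depth + 1))            # add the next layer to the queue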
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
import os, sys

syspath = sys.argv[0]


class retri(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # map the URL to a local file path, creating directories as needed
        parsedurl = urlparse(url, 'http:', 0)
        if parsedurl[2] == '':
            path = parsedurl[1] + '/index.htm'
        else:
            path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                  # no file extension, use the default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)              # local directory to save into
        if sep != '/':                    # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):               # create the archive dir if necessary
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # download the URL to the local file; return an error string on failure
        try:
            retval = urlretrieve(self.url, self.file)
            return retval
        except IOError:
            retval = ('*** error: invalid url "%s"' % self.url)
            return retval

    def parse_and_getlink(self):
        # parse the saved page and return the list of anchors found in it
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class crawler(object):
    count = 0                             # number of pages downloaded

    def __init__(self, url):
        self.q = [url]                    # queue of URLs still to fetch
        self.seen = []                    # URLs already processed
        self.dom = urlparse(url)[1]       # stay inside this domain

    def get_page(self, url):
        r = retri(url)
        retval = r.download()
        if retval[0] == '*':              # download failed, skip parsing
            print retval, '.. skipping parse'
            return
        crawler.count += 1
        print '(', crawler.count, ')'
        print 'url:', url
        print 'file:', retval[0]
        self.seen.append(url)

        links = r.parse_and_getlink()
        for eachlink in links:
            # make relative links absolute
            if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink
            if find(lower(eachlink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '... new, added to q'
                    else:
                        print '... discarded, already in q'
            else:
                print '... discarded, already processed'

    def go(self):
        # keep fetching until the queue is empty
        while self.q:
            url = self.q.pop()
            self.get_page(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('enter starting url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
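To try the program above, one would save it as, say, crawl.py (the filename is just an assumption) and pass a seed URL on the command line, for example:

    python crawl.py http://www.example.com

If no URL is given, it prompts for one interactively. Note that this listing is Python 2 only: htmllib, urlparse, cStringIO, urllib.urlretrieve, raw_input and the string-module functions replace/find/lower were removed or relocated in Python 3 (their rough equivalents live in html.parser, urllib.parse, io, urllib.request and the built-in input).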