The crawler's main job is to follow links down to the n-th level of pages and download those pages as well.

Main program (sketched in code right after this outline):
    start the crawler
    create a queue
    loop, breaking out when the queue is empty:
        pop a URL off the queue
        download the page and find the next layer of links
        add those links to the queue
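Here is a minimal sketch of that loop under one assumption: download_and_extract_links() is only a hypothetical stand-in for the download-and-parse step, which the full program below actually implements with urlretrieve and HTMLParser.

# Minimal sketch of the queue-driven crawl described above.
# download_and_extract_links() is a placeholder, not part of the real program.
def download_and_extract_links(url):
    return []   # placeholder: the real version saves the page and returns its anchors

def crawl(start_url, max_depth=2):
    queue = [(start_url, 0)]                  # queue of (url, depth) pairs
    seen = set()
    while queue:                              # break out when the queue is empty
        url, depth = queue.pop(0)             # a URL leaves the queue
        if url in seen or depth > max_depth:
            continue
        seen.add(url)
        for link in download_and_extract_links(url):      # download page, find links
            if link not in seen:
                queue.append((link, depth + 1))            # add the next layer to the queue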
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
import os, sys

syspath = sys.argv[0]


class retri(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # map the URL to a local file path, creating directories as needed
        parsedurl = urlparse(url, 'http:', 0)
        if parsedurl[2] == '':
            path = parsedurl[1] + '/index.htm'
        else:
            path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                  # no file extension, use the default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)              # local directory to save into
        if sep != '/':                    # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):               # create the archive dir if necessary
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # download the URL to the local file; return an error string on failure
        try:
            retval = urlretrieve(self.url, self.file)
            return retval
        except IOError:
            retval = ('*** error: invalid url "%s"' % self.url)
            return retval

    def parse_and_getlink(self):
        # parse the saved page and return the list of anchors found in it
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class crawler(object):
    count = 0                             # number of pages downloaded

    def __init__(self, url):
        self.q = [url]                    # queue of URLs still to fetch
        self.seen = []                    # URLs already processed
        self.dom = urlparse(url)[1]       # stay inside this domain

    def get_page(self, url):
        r = retri(url)
        retval = r.download()
        if retval[0] == '*':              # download failed, skip parsing
            print retval, '.. skipping parse'
            return
        crawler.count += 1
        print '(', crawler.count, ')'
        print 'url:', url
        print 'file:', retval[0]
        self.seen.append(url)

        links = r.parse_and_getlink()
        for eachlink in links:
            # make relative links absolute
            if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink
            if find(lower(eachlink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '... new, added to q'
                    else:
                        print '... discarded, already in q'
            else:
                print '... discarded, already processed'

    def go(self):
        # keep fetching until the queue is empty
        while self.q:
            url = self.q.pop()
            self.get_page(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('enter starting url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
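To try the program above, one would save it as, say, crawl.py (the filename is just an assumption) and pass a seed URL on the command line, for example:

    python crawl.py http://www.example.com

If no URL is given, it prompts for one interactively. Note that this listing is Python 2 only: htmllib, urlparse, cStringIO, urllib.urlretrieve, raw_input and the string-module functions replace/find/lower were removed or relocated in Python 3 (their rough equivalents live in html.parser, urllib.parse, io, urllib.request and the built-in input).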