• Python crawler


    The crawler downloads a page and then follows the links it finds, level by level, down to the nth level.
    Main program outline:
    start the crawler
    seed a queue with the starting URL
    loop: exit when the queue is empty
    pop a URL off the queue
    download the page and extract the next level of links
    append new links to the queue
    (note: the listing below uses list.pop(), which takes the newest link first, so the traversal is actually depth-first)
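
    The full Python 2 listing appears further below. As a quick illustration of the same queue-driven loop on a modern runtime, here is a minimal Python 3 sketch using only the standard library; the LinkParser helper, the page cap, and the example URL are placeholders of mine, not part of the original program:

    # Python 3 sketch (assumed stdlib-only port, not the original code)
    from collections import deque
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen

    class LinkParser(HTMLParser):
        # collect every href from <a> tags, much like htmllib's anchorlist
        def __init__(self):
            super().__init__()
            self.links = []
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.links.append(value)

    def crawl(start_url, max_pages=10):        # max_pages: arbitrary safety cap
        domain = urlparse(start_url).netloc
        q, seen = deque([start_url]), set()
        while q and len(seen) < max_pages:
            url = q.popleft()                  # FIFO, so this version is breadth-first
            if url in seen:
                continue
            seen.add(url)
            try:
                html = urlopen(url).read().decode('utf-8', 'replace')
            except OSError as err:             # urllib's URLError subclasses OSError
                print('error:', url, err)
                continue
            parser = LinkParser()
            parser.feed(html)
            for link in parser.links:
                link = urljoin(url, link)      # make relative links absolute
                if urlparse(link).netloc == domain and link not in seen:
                    q.append(link)

    # usage: crawl('http://www.example.com')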

    # Python 2 only: htmllib, urlparse, cStringIO, formatter and the
    # string-module functions below were all removed in Python 3
    from sys import argv
    from os import makedirs, unlink, sep
    from os.path import dirname, exists, isdir, splitext
    from string import replace, find, lower
    from htmllib import HTMLParser
    from urllib import urlretrieve
    from urlparse import urlparse, urljoin
    from formatter import DumbWriter, AbstractFormatter
    from cStringIO import StringIO
    import os, sys

    syspath = sys.argv[0]    # path of this script (unused below)
    
    
    class retri(object):
        # downloads one page and saves it under a path derived from its URL
        def __init__(self, url):
            self.url = url
            self.file = self.filename(url)
            
        def filename(self, url, deffile='index.htm'):
            # map the URL to a local file path, creating directories as needed
            parsedurl = urlparse(url, 'http:', 0)
            if parsedurl[2] == '':
                path = parsedurl[1] + '/index.htm'
            else:
                path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':               # no file extension: treat as a directory
                if path[-1] == '/':
                    path += deffile
                else:
                    path += '/' + deffile
            ldir = dirname(path)
            if sep != '/':                 # use the OS path separator on Windows
                ldir = replace(ldir, '/', sep)
            if not isdir(ldir):
                if exists(ldir):
                    unlink(ldir)           # a plain file is in the way; remove it
                makedirs(ldir)
            return path
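        # example (hypothetical URL): filename('http://www.example.com/docs/')
        # returns 'www.example.com/docs/index.htm'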
        
        
        def download(self):
            # fetch the page; on failure return an error string whose leading
            # '*' is what crawler.get_page() checks for
            try:
                retval = urlretrieve(self.url, self.file)
                return retval
            except IOError:
                retval = '*** error: invalid url "%s"' % self.url
                return retval
                
        def parse_and_getlink(self):
            # htmllib's HTMLParser records every anchor href in .anchorlist
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        
        
        
    class crawler(object):
        count = 0                        # total pages downloaded

        def __init__(self, url):
            self.q = [url]               # links waiting to be fetched
            self.seen = []               # links already processed
            self.dom = urlparse(url)[1]  # restrict the crawl to this domain
        
        def get_page(self, url):
            r = retri(url)
            retval = r.download()
            if retval[0] == '*':         # download() returned an error string
                print retval, '.. skipping parse'
                return
            crawler.count += 1
            print '\n(', crawler.count, ')'
            print 'url:', url
            print 'file:', retval[0]
            self.seen.append(url)
            
            links = r.parse_and_getlink()
            for eachlink in links:
                # make relative links absolute
                if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                    eachlink = urljoin(url, eachlink)
                print '* ', eachlink

                if find(lower(eachlink), 'mailto:') != -1:
                    print '... discarded, mailto link'
                    continue

                if eachlink not in self.seen:
                    if find(eachlink, self.dom) == -1:
                        print '... discarded, not in domain'
                    else:
                        if eachlink not in self.q:
                            self.q.append(eachlink)
                            print '... new, added to q'
                        else:
                            print '... discarded, already in q'
                else:
                    print '... discarded, already processed'
            
            
            
        def go(self):
            # list.pop() removes the newest link first, so the crawl is depth-first
            while self.q:
                url = self.q.pop()
                self.get_page(url)
                
                
    def main():
        if len(argv) > 1:
            url = argv[1]
        else:
            try:
                url = raw_input('enter starting url: ')
            except (KeyboardInterrupt, EOFError):
                url = ''
        if not url:
            return
        robot = crawler(url)
        robot.go()

    if __name__ == '__main__':
        main()
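
    To run it, pass the starting URL on the command line or type it at the prompt, e.g. (the file name crawl.py and the URL here are placeholders):

        python crawl.py http://www.example.com

    For each page the crawler prints a numbered block with the URL and the local file it was saved to, mirrors the page under a directory named after the host, and continues until every reachable same-domain link has been visited.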
            
            
        
        
  • Original post: https://www.cnblogs.com/frog2008/p/6845306.html