• A Python web crawler


    The crawler downloads each page it visits and also follows the links it
    finds, level by level, down to the nth level of pages.

    Main program outline (see the sketch below):
        start the crawler
        create a queue seeded with the start URL
        loop, exiting when the queue is empty:
            pop a URL off the queue
            download the page and extract the next level of links
            append the new links to the queue
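
    The same loop as a minimal Python 3 sketch (the full listing below is
    Python 2). fetch_links is a placeholder for "download the page and
    return its anchors", not a function from the original post:

        from collections import deque

        def crawl(start_url, fetch_links, max_pages=100):
            queue = deque([start_url])      # URLs waiting to be visited
            seen = set()                    # URLs already processed
            while queue and len(seen) < max_pages:
                url = queue.popleft()       # FIFO pop -> breadth-first
                if url in seen:
                    continue
                seen.add(url)
                for link in fetch_links(url):
                    if link not in seen:
                        queue.append(link)
            return seen

    Note that the listing below pops from the tail of a plain list, so it
    actually crawls depth-first; popleft() above gives breadth-first order.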

    # Python 2 only: htmllib, formatter, cStringIO, and the separate
    # urllib/urlparse modules were all removed in Python 3.
    from sys import argv
    from os import makedirs, unlink, sep
    from os.path import dirname, exists, isdir, splitext
    from string import replace, find, lower
    from htmllib import HTMLParser
    from urllib import urlretrieve
    from urlparse import urlparse, urljoin
    from formatter import DumbWriter, AbstractFormatter
    from cStringIO import StringIO
    
    
    class retri(object):
        """Download a URL and save it under a matching local path."""
        def __init__(self, url):
            self.url = url
            self.file = self.filename(url)
        def filename(self, url, deffile='index.htm'):
            # Map the URL to a local file path, creating directories as needed.
            parsedurl = urlparse(url, 'http', 0)
            if parsedurl[2] == '':                  # no path component
                path = parsedurl[1] + '/' + deffile
            else:
                path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':                        # no file extension
                if path[-1] == '/':
                    path += deffile
                else:
                    path += '/' + deffile
            ldir = dirname(path)                    # local directory
            if sep != '/':                          # non-POSIX path separator
                ldir = replace(ldir, '/', sep)
            if not isdir(ldir):
                if exists(ldir):                    # a plain file is in the way
                    unlink(ldir)
                makedirs(ldir)
            return path
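        # Hypothetical examples (not from the original post):
        #   filename('http://example.com/a/b.html') -> 'example.com/a/b.html'
        #   filename('http://example.com/a/')       -> 'example.com/a/index.htm'
        # Side effect: creates the local directory 'example.com/a' if missing.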
        
        
        def download(self):
            try:
                # Save self.url to self.file; returns (filename, headers).
                retval = urlretrieve(self.url, self.file)
                return retval
            except IOError:
                # A leading '*' marks the result as an error for the caller.
                retval = ('*** error: invalid url "%s"' % self.url)
                return retval
                
        def parse_and_getlink(self):
            self.parser=
            (AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        
        
        
    class crawler(object):
        count = 0                           # pages downloaded so far
        def __init__(self, url):
            self.q = [url]                  # queue of URLs still to fetch
            self.seen = []                  # URLs already processed
            self.dom = urlparse(url)[1]     # stay inside this domain
        
        def get_page(self, url):
            r = retri(url)
            retval = r.download()
            if retval[0] == '*':            # download() returned an error
                print retval, '... skipping parse'
                return
            crawler.count += 1
            print '\n(', crawler.count, ')'
            print 'url:', url
            print 'file:', retval[0]
            self.seen.append(url)
            
            links = r.parse_and_getlink()
            for eachlink in links:
                # Resolve relative links against the current page.
                if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                    eachlink = urljoin(url, eachlink)
                print '*', eachlink

                if find(lower(eachlink), 'mailto:') != -1:
                    print '... discarded, mailto link'
                    continue

                if eachlink not in self.seen:
                    if find(eachlink, self.dom) == -1:
                        print '... discarded, not in domain'
                    else:
                        if eachlink not in self.q:
                            self.q.append(eachlink)
                            print '... new, added to q'
                        else:
                            print '... discarded, already in q'
                else:
                    print '... discarded, already processed'
            
            
            
        def go(self):
            while self.q:
                # pop() takes from the tail, so the crawl is depth-first;
                # use self.q.pop(0) for breadth-first order.
                url = self.q.pop()
                self.get_page(url)
                
                
    def main():
        if len(argv) > 1:
            url = argv[1]
        else:
            try:
                url = raw_input('enter starting url: ')
            except (KeyboardInterrupt, EOFError):
                url = ''
        if not url:
            return
        robot = crawler(url)
        robot.go()

    if __name__ == '__main__':
        main()
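
    Run it as, e.g., python crawl.py http://www.example.com (hypothetical
    URL). The script above is Python 2 only: htmllib, formatter, and
    cStringIO are gone in Python 3. Below is a minimal sketch of the same
    anchor extraction with the Python 3 standard library; LinkParser is an
    assumed name, not part of the original post:

        from html.parser import HTMLParser
        from urllib.parse import urljoin
        from urllib.request import urlopen

        class LinkParser(HTMLParser):
            """Collect href values, mimicking htmllib's anchorlist."""
            def __init__(self):
                super().__init__()
                self.anchorlist = []

            def handle_starttag(self, tag, attrs):
                if tag == 'a':
                    for name, value in attrs:
                        if name == 'href' and value:
                            self.anchorlist.append(value)

        def get_links(url):
            html = urlopen(url).read().decode('utf-8', errors='replace')
            parser = LinkParser()
            parser.feed(html)
            # Resolve relative hrefs against the page URL, as the
            # Python 2 crawler does with urljoin.
            return [urljoin(url, link) for link in parser.anchorlist]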
            
            
        
        
• Original article: https://www.cnblogs.com/frog2008/p/6845306.html