• 下载cnblogs所有博客内容


    写了一个python脚本,简单粗暴
    import urllib,os,sys
    
    def getUrlContent(url):
        """Fetch *url* and return the raw response body.

        Uses the Python 2 ``urllib.urlopen`` API. The response object is
        closed even when reading the body raises, so a failed download does
        not leak the underlying connection.
        """
        fp = urllib.urlopen(url)
        try:
            return fp.read()
        finally:
            fp.close()
        
    # Reads the blog post URLs (and titles) found on one listing page.
    class findBlog:
        """Step through the posts of one blog listing page.

        The page HTML is scanned with plain substring searches; each call to
        ``get`` returns the next post and remembers where it stopped.
        """

        def __init__(self,cont):
            self.p = 0          # offset in ``cont`` where the next search starts
            self.cont = cont    # full HTML text of the listing page

        def get(self):
            """Return ``[url, title]`` of the next post, or ``None`` when done.

            ``None`` is also returned when a post marker is found but one of
            the following markers (title anchor, href, closing tag) is
            missing; the search position is left unchanged in that case.
            """
            html = self.cont

            # str.find returns -1 on a miss; offset 0 is a valid hit, so every
            # check below compares against 0 with "<" rather than "<=".
            post = html.find('<div class="post">', self.p)
            if post < 0:
                return None
            anchor = html.find('<h2><a id="homepage1_HomePageDays', post)
            if anchor < 0:
                return None
            href = html.find('href="', anchor)
            if href < 0:
                return None
            url_end = html.find('">', href)
            if url_end < 0:
                return None
            title_end = html.find('</a>', url_end)
            if title_end < 0:
                return None

            url = html[href + len('href="'):url_end]
            title = html[url_end + len('">'):title_end]
            # Resume after this post's title on the next call.
            self.p = title_end
            return [url, title]
    
    def logFile(fname, cont):
        """Write *cont* to *fname*, replacing any previous content.

        If *fname* already exists a notice is printed to stdout and the file
        is overwritten anyway. The file is closed even if the write fails.
        """
        if os.path.isfile(fname):
            # Single-argument print(...) emits the same text as the Python 2
            # statement form ``print fname, 'esist!'``.
            print('%s esist!' % (fname,))
        with open(fname, 'w') as fp:
            fp.write(cont)
        
    def appendFile(fname, cont):
        """Append *cont* to *fname*, creating the file if it does not exist.

        The file is closed even if the write fails.
        """
        with open(fname, 'a') as fp:
            fp.write(cont)
        
    def MyCmd(x):
        """Echo the shell command *x* to stdout, then run it with os.system.

        The exit status of the command is discarded.
        """
        print(x)
        os.system(x)
        
    # Reads the content of every blog post across all listing pages.
    class blogReader:
        """Download the blog's posts, newest first, until a known post is met.

        ``cfg.txt`` holds the URL of the newest post seen by the previous run.
        Each run walks the listing pages from page 1, saves every post it
        meets, and stops at the post recorded in ``cfg.txt`` or at the first
        page that contains no posts.
        """

        def __init__(self):
            self.is_latest_written = 0   # 1 once this run has stored its newest URL in cfg.txt
            self.latest_url = ''         # newest post URL recorded by the previous run
            self.last_page_posts = 0     # posts found by the most recent readPage call

            # Load the stop marker left by the previous run, if any.
            fname = 'cfg.txt'
            if os.path.isfile(fname):
                with open(fname, 'r') as fp:
                    self.latest_url = fp.readline().strip()

            print('latest_url %s' % (self.latest_url,))

        def readPage(self,pid):
            """Download the posts listed on page *pid*.

            Every post up to (not including) the one recorded in ``cfg.txt``
            is saved to its own ``.htm`` file and linked from ``index2.htm``.
            The number of posts seen is stored in ``self.last_page_posts``.

            Returns 1 if the previously recorded post was reached, else 0.
            """
            is_latest = 0
            self.last_page_posts = 0
            cont = getUrlContent('http://www.cnblogs.com/cutepig/default.html?page=%d&OnlyTitle=1'%pid)
            fb = findBlog(cont)

            # "with" closes log.txt even when a download below raises.
            with open('log.txt', 'a') as fpLog:
                fpLog.write('--------page %s\n' % (pid,))
                print('--------page %s' % (pid,))
                while 1:
                    ret = fb.get()
                    if ret is None:
                        break
                    self.last_page_posts += 1
                    url, title = ret
                    fpLog.write('%s\n' % (ret,))

                    # The very first post of the run is the newest one; record
                    # it as the stop marker for the next run.
                    if not self.is_latest_written:
                        logFile('cfg.txt', url)
                        self.is_latest_written = 1

                    # Printing the raw list would show escaped bytes on
                    # Python 2, so the decoded title is printed instead.
                    print(title.decode('utf-8'))

                    if url == self.latest_url:
                        is_latest = 1
                        break

                    # Build a flat file name by dropping URL punctuation.
                    blogFname = url
                    for ch in ';&?:/':
                        blogFname = blogFname.replace(ch, '')
                    blogFname = blogFname + '.htm'

                    logFile(blogFname, getUrlContent(url))
                    appendFile('index2.htm', '<a href=%s>%s</a><br>\n'%(blogFname, title))

            # Prepend the new links to the existing index (Windows "copy" syntax).
            # NOTE(review): index2.htm is only ever appended to and is never
            # cleared here, so entries from earlier pages/runs are prepended
            # again on each merge - confirm whether that is intended.
            MyCmd('copy /y index2.htm+index.htm index.htm')
            return is_latest

        def read_all(self):
            """Read listing pages 1, 2, ... until there is nothing new left."""
            pid = 1
            while 1:
                is_latest = self.readPage(pid)
                # Stop at the previously seen post, or at a page without any
                # posts; without the second check a run whose stored URL is
                # never encountered (e.g. no cfg.txt yet) would loop forever.
                if is_latest or not self.last_page_posts:
                    break
                pid = pid + 1
            
    # Script entry point: create the reader (which loads cfg.txt if present)
    # and download every post it has not seen yet.
    br =blogReader()
    br.read_all()
    View Code
  • 相关阅读:
    01 《i》控制字体大小 v-for循环绑定类名 v-bind 结合三目运算 动态添加类
    右侧是长方形和半圆结合 光标放上去在规定时间内完成动画
    04-align-content 它对于当单行是没有效果的
    03-flex-wrap是否换行
    02-align-items的用法
    01--顶部的通告特效---仅显示一条一条滚动
    洛谷P2392 kkksc03考前临时抱佛脚(01背包/搜索)
    蓝桥杯 9大臣的旅费(树的直径)
    蓝桥杯 8买不到的数目(数论/线性DP)
    蓝桥杯 7连号区间数(暴力or并查集(?)
  • 原文地址:https://www.cnblogs.com/cutepig/p/3129534.html
Copyright © 2020-2023  润新知