• 批量下载新浪博客文章(改进版)


    嗯，算是对前段时间写的那个脚本的改进：重写了正则，支持翻页下载，还修复了一些 bug。目前还不支持多线程，打算过几天加上。

     1 #!/usr/bin/python
     2 #-*- coding:utf-8 -*-
     3 #****************************
     4 
     5 #author:tmyyss
     6 #version:0.2
     7 
     8 #****************************
     9 
    10 import urllib
    11 import os
    12 import re
    13 
    def article_format(usock,basedir):
        """Extract one article from an HTML page and save it as a text file.

        The page is consumed line by line in three phases:

        1. Title: the first line that has text enclosed between ``>`` and
           ``<`` supplies the title.  The output file is ``basedir`` + title
           (path separators inside the title are replaced by ``_``) and the
           title is written as the file's first line.  If the file cannot be
           opened, the error is printed and the next matching line is tried.
        2. Lines are skipped until the start-of-body marker is seen.
        3. Lines are copied to the file until the end-of-body marker, which
           appends ``"\\nEND"`` and stops.  Lines containing ``div``,
           ``span`` or ``<p>`` are dropped; in the remaining lines a fixed
           set of full-width punctuation entities is replaced by ASCII, and
           ``<wbr>``, ``&nbsp;`` and ``<br>`` tags are removed.

        Args:
            usock: Iterable yielding the page's lines (for example the object
                returned by ``urlopen``).  Byte lines are decoded as UTF-8
                when running on Python 3.
            basedir: Prefix prepended verbatim to the file name, so it should
                end with a path separator.

        Returns:
            None.  The output file is always closed, even when the page ends
            before the end-of-body marker.
        """
        # Literal substitutions applied, in this order, to every body line.
        replacements = (
            ('&#65292;', ','),
            ('&#65306;', ':'),
            ('&#65281;', '!'),
            ('&#65288;', '('),
            ('&#65289;', ')'),
            ('&#8943;', '...'),
            ('&#65311;', '?'),
            ('&#65307;', ';'),
            ('<wbr>', ''),
            ('&nbsp;', ''),
        )
        skipped_markup = ('div', 'span', '<p>')

        fobj = None        # output file; None while still looking for the title
        in_body = False    # True once the start-of-body marker has been seen
        try:
            for line in usock:
                # Python 3 sockets yield bytes, but the patterns below are text.
                if bytes is not str and isinstance(line, bytes):
                    line = line.decode('utf-8', 'replace')

                if fobj is None:
                    found = re.search(r'(?<=>).+(?=<)', line)
                    if not found:
                        continue
                    title = found.group(0)
                    # The title comes from the downloaded page: keep it from
                    # pointing into another directory.
                    safe_title = title
                    for sep in (os.sep, os.altsep):
                        if sep:
                            safe_title = safe_title.replace(sep, '_')
                    filename = basedir + safe_title
                    print(filename)
                    try:
                        fobj = open(filename, 'w+')
                    except IOError as e:
                        print("Open %s error:%s" % (filename, e))
                        continue
                    fobj.write(title + '\n')
                elif not in_body:
                    if re.search(r'(<.+?正文开始.+?>)', line):
                        in_body = True
                elif re.search(r'(<.+?正文结束.+?)', line):
                    fobj.write('\nEND')
                    break
                elif not any(tag in line for tag in skipped_markup):
                    for old, new in replacements:
                        line = line.replace(old, new)
                    # Matches <br>, <br/> and <br />.
                    line = re.sub(r'<br\s*/?>', '', line)
                    fobj.write(line)
        finally:
            if fobj is not None:
                fobj.close()
    62 
    def parser_page(pageurl):
        """Collect article URLs from a list page and the pages it links to.

        The article URLs of ``pageurl`` itself come first.  The page is then
        scanned for pagination links (``href`` text followed by the "jump to"
        label), and the article URLs of every linked page not visited yet are
        appended in the order the links appear.

        Args:
            pageurl: URL of the first article-list page.

        Returns:
            List of article URLs gathered via ``get_url`` from all visited
            pages.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        total_url = list(get_url(pageurl))

        usock = urlopen(pageurl)
        try:
            context = usock.read()
        finally:
            usock.close()
        # Python 3 returns bytes, but the patterns below are text.
        if bytes is not str and isinstance(context, bytes):
            context = context.decode('utf-8', 'replace')

        visited = set([pageurl])
        for link in re.findall(r'href.+?跳转', context):
            targets = re.findall(r'http.+?html', link)
            if not targets:
                # Pagination entry without an absolute .html link.
                continue
            target = targets[0]
            if target in visited:
                continue
            visited.add(target)
            # Fetch each linked page inside the loop so none is lost.
            total_url.extend(get_url(target))
        return total_url
    76         
    77         
    def get_url(pageurl):
        """Return the article URLs linked from one article-list page.

        The page is downloaded and searched for ``<a title=... href="http...html``
        anchors; for each anchor the first ``http...html`` substring is
        returned.

        Args:
            pageurl: URL of the page to download.

        Returns:
            List of URL strings in page order; empty when nothing matches.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        usock = urlopen(pageurl)
        try:
            context = usock.read()
        finally:
            usock.close()
        # Python 3 returns bytes, but the patterns below are text.
        if bytes is not str and isinstance(context, bytes):
            context = context.decode('utf-8', 'replace')

        raw_url_list = re.findall(r'(<a\s+title.+?href="http.+?html)', context)
        # Every raw match contains 'http...html', so the inner search succeeds.
        return [re.search('(http.+?html)', raw).group(1) for raw in raw_url_list]
    87 
    88 
    if __name__=='__main__':
        # Script entry point: download every article reachable from the
        # article-list page below and save each one under basedir.
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        basedir='/home/tmyyss/article/'
        if not os.path.exists(basedir):
            os.makedirs(basedir)
        url_list=parser_page("http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html")
        for url in url_list:
            try:
                article_usock=urlopen(url)
            except IOError as e:
                # One unreachable article must not abort the whole batch.
                print("Open %s error:%s"%(url,e))
                continue
            try:
                article_format(article_usock,basedir)
            finally:
                article_usock.close()
  • 相关阅读:
    计算户数
    日期时间时区函数(Power Query 之 M 语言)
    调用用友u8凭证控件
    用友u8应收,应付,收款,付款导入工具
    用友二次开发之自动调拔
    用友二次开发之发货单辅助工具
    用友t+报表之CS开发
    导出jar包
    论不断变化的词和句
    别说欧式中文!
  • 原文地址:https://www.cnblogs.com/tmyyss/p/4192358.html
Copyright © 2020-2023  润新知