嗯,算是对前段时间写的那个的改进,重写了正则,同时支持翻页下载,还修复了一些bug。但还不支持多线程,打算过几天加上
1 #!/usr/bin/python 2 #-*- coding:utf-8 -*- 3 #**************************** 4 5 #author:tmyyss 6 #version:0.2 7 8 #**************************** 9 10 import urllib 11 import os 12 import re 13 14 def article_format(usock,basedir): 15 title_flag=True 16 context_start_flag=True 17 context_end_flag=True 18 for line in usock: 19 if title_flag: 20 title=re.findall(r'(?<=>).+(?=<)',line) 21 if title: 22 title=title[0] 23 filename=basedir+title 24 print filename 25 try: 26 fobj=open(filename,'w+') 27 fobj.write(title+' ') 28 title_flag=False 29 except IOError,e: 30 print "Open %s error:%s"%(filename,e) 31 else: 32 pass 33 elif context_start_flag: 34 results1=re.findall(r'(<.+?正文开始.+?>)',line) 35 if results1: 36 context_start_flag=False 37 elif context_end_flag: 38 results2=re.findall(r'(<.+?正文结束.+?)',line) 39 if results2: 40 context_end_flag=False 41 fobj.write(' END') 42 fobj.close() 43 break 44 else: 45 if 'div' in line or 'span' in line or '<p>' in line: 46 pass 47 else: 48 line=re.sub(',',',',line) 49 line=re.sub(':',':',line) 50 line=re.sub('!','!',line) 51 line=re.sub('(','(',line) 52 line=re.sub(')',')',line) 53 line=re.sub('⋯','...',line) 54 line=re.sub('?','?',line) 55 line=re.sub(';',';',line) 56 line=re.sub(r'<wbr>','',line) 57 line=re.sub(r' ','',line) 58 line=re.sub(r'<brs+?/>','',line) 59 fobj.write(line) 60 else: 61 print "*****************************************************************" 62 63 def parser_page(pageurl): 64 total_url=[] 65 current_page=get_url(pageurl) 66 total_url.extend(current_page) 67 usock=urllib.urlopen(pageurl) 68 context=usock.read() 69 otherpage=re.findall(r'href.+?跳转',context) 70 for page in otherpage: 71 page=re.findall(r'http.+?html',page) 72 pageurl=page[0] 73 urllist=get_url(pageurl) 74 total_url.extend(urllist) 75 return total_url 76 77 78 def get_url(pageurl): 79 urllist=[] 80 usock=urllib.urlopen(pageurl) 81 context=usock.read() 82 raw_url_list=re.findall(r'(<as+title.+?href="http.+?html)',context) 83 for url in raw_url_list: 
84 url=re.findall('(http.+?html)',url)[0] 85 urllist.append(url) 86 return urllist 87 88 89 if __name__=='__main__': 90 basedir='/home/tmyyss/article/' 91 if not os.path.exists(basedir): 92 os.makedirs(basedir) 93 url_list=parser_page("http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html") 94 for url in url_list: 95 article_usock=urllib.urlopen(url) 96 article_format(article_usock,basedir)