功能:
1.列举一个目录下的文件
2.利用BeautifulSoup简单解析正文内容,然后保存
待完善:
1.多线程支持
2.适配器支持(针对雷锋网和36氪两个网站的网页)
""" parser for parsing html file from leiphone.com and 36kr.com contact xiaoyang """
#
# @author: xiaoyang
# @contact: hityixiaoyang@gmail.com
# @version:
# @describ: parse a html file from leiphone.com
# @log:
# 1.2012-11-22 create
# 2.2012-11-23 add FileCollect and ParseTask class
#
# NOTE: the module docstring above is printed verbatim by errPrint(), so it is
# part of the program's output; keep it free of stray '%' characters because
# errPrint() %-formats it with globals().
import sys
import urllib2
import codecs
import os

from bs4 import BeautifulSoup

# global definitions
OUT_FILE_PREFIX = "out"
OUT_CNT = 0

# debug switches: choose which demo the script entry point runs
FileCollectDBG = False
ParseTaskDbg = True


def errPrint(code, msg=''):
    """Print the module docstring and an optional message to stderr, then exit.

    Args:
        code: process exit status handed to ``sys.exit``.
        msg: optional extra text (or exception object) printed after the
            module docstring; skipped when falsy.

    Raises:
        SystemExit: always, carrying ``code``.
    """
    # The docstring is %-formatted with globals() so it may reference
    # module-level names via "%(NAME)s" placeholders.
    sys.stderr.write("%s\n" % (__doc__ % globals(),))
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)


# for LeiPhone.com
def SaveResLP(doc, filename):
    """Write ``doc`` to ``filename``, replacing any existing content.

    The "!LOCK!" / "!UNLOCK!" markers are printed around the write;
    presumably they are placeholders for a real lock once multi-threading
    is added -- TODO confirm.

    Args:
        doc: text to store.
        filename: path of the output file.

    Returns:
        True when the file was written.

    Raises:
        SystemExit: via ``errPrint`` when the file cannot be opened/written.
    """
    print("!LOCK!")
    try:
        # ``with`` closes the file on every path and, unlike an explicit
        # fp.close() in ``finally``, is safe when open() itself fails.
        with open(filename, "w") as fp:
            fp.write(doc)
    except IOError as errStr:
        errPrint(1, errStr)
    finally:
        print("!UNLOCK!")
    return True


# for 36kr.com
def SaveRes36K(doc, filename):
    """Placeholder saver for 36kr.com pages: prints the markers, writes nothing.

    Args:
        doc: unused.
        filename: unused.

    Returns:
        True, always.
    """
    print("!LOCK!")
    print("!UNLOCK!")
    return True


class FileCollect:
    """Collect the directory names and file paths found below a root directory."""

    def __init__(self, root):
        """Remember ``root``; the lists stay empty until ``init`` is called."""
        self.root = root
        self.dlist = []  # directory names (not full paths) seen by os.walk
        self.flist = []  # file paths, each joined with its containing directory

    def init(self):
        """Walk ``self.root`` and append what is found to ``dlist`` / ``flist``.

        Results are appended, so calling this twice records every entry twice.

        Returns:
            True, always.
        """
        for root, dirs, files in os.walk(self.root):
            self.dlist.extend(dirs)
            # os.path.join inserts the separator that plain ``root + afile``
            # dropped (e.g. "/opt/project" + "a.html").
            self.flist.extend(os.path.join(root, afile) for afile in files)
        return True


class ParseTask:
    """Extract the ``id="content_main"`` element of an HTML file and save it."""

    def __init__(self, savedFileName):
        """Set up a task whose extracted content is written to ``savedFileName``."""
        self.soup = None
        # Threshold compared against doneCnt in parse(); nothing in this file
        # changes it from 0, so every parse() call saves its result.
        self.savedCnt = 0
        self.doneCnt = 0
        self.savedFileName = savedFileName

    def parse(self, readFileName):
        """Parse ``readFileName`` and save its first ``content_main`` element.

        Args:
            readFileName: path of the HTML file to read.

        Raises:
            SystemExit: via ``errPrint`` when the file cannot be read or when
                it contains no element with ``id="content_main"``.
        """
        try:
            with open(readFileName, "r") as fp:
                self.soup = BeautifulSoup(fp.read())
        except IOError as errStr:
            errPrint(1, errStr)

        content = self.soup.find_all(id="content_main")
        if not content:
            # Report the problem in the file's usual way instead of letting
            # content[0] fail with a bare IndexError.
            errPrint(1, 'no element with id="content_main" in %s' % (readFileName,))

        self.doneCnt += 1
        if self.doneCnt >= self.savedCnt:
            SaveResLP(str(content[0]), self.savedFileName)
            self.doneCnt = 0


# Debug driver: only runs when the file is executed as a script, so importing
# the module no longer triggers file I/O or sys.exit.
if __name__ == "__main__":
    if FileCollectDBG:
        fc = FileCollect("/opt/project/")
        fc.init()
        print("dlist:\r\n%s" % (fc.dlist,))
        print("flist:\r\n%s" % (fc.flist,))
    elif ParseTaskDbg:
        newTask = ParseTask("out.html")
        newTask.parse("1119-vv-dolby.html")
        print("saved OK!\r\n")