• python入门(3)净化雷锋网网页内容


    本程序用于提取网页正文,将内容聚合到一个文件中。使用了多线程、锁、正则表达式、Beautiful Soup开源组件。

    抓下来的6300多个网页处理了大约五分钟。用了8个线程。

    代码如下:

      1 #!/usr/bin/python
      2 """
      3 parser
      4     for parsing html file from leiphone.com and 36kr.com
      5 author
      6     xiaoyang
      7 contact
      8     hityixiaoyang@gmail.com
      9 version
     10     
     11 describe
     12     parse a html file from leiphone.com
     13 log
     14    1.2012-11-22 create
     15    2.2012-11-23 add FileCollect and ParseTask class
     16    3.2012-11-23 add MultiThreads support
     17 """
     18 
     19 import sys
     20 import os
     21 from bs4 import BeautifulSoup
     22 import Queue
     23 import threading
     24 import re
     25 
# Shared settings/state for the worker threads; do not change these
# variables directly.
OUT_CNT_LOCK = threading.Lock()  # guards OUT_CNT inside newOutName()
PRINT_LOCK = threading.Lock()  # serializes output in errPrint()/dbgPrint()
OUT_FILE_PREFIX = "out"  # output names are built as <prefix><counter>.html
WORKER_NUM = 8  # number of TaskThread workers started by main()
OUT_CNT = 0  # counter incremented by newOutName() for every name handed out
MAX_ITEM_CNT = 100  # threshold used by TaskThread.run() to switch output files
PRINT_DBG = True  # NOTE(review): not referenced by the code visible here

# Debug flags.
# NOTE(review): neither flag is referenced by the code visible here.
FileCollectDBG = False
ParseTaskDbg = False
     38 
     39 # error print and exit ,thread safety 
     40 def errPrint(ifExit=True, msg='_'):
     41     global PRINT_LOCK
     42     try:
     43         if PRINT_LOCK.acquire(10):
     44                 print >> sys.stderr,msg
     45                 if ifExit:
     46                     sys.exit()
     47     finally:
     48         PRINT_LOCK.release()
     49 
     50 # dbg print 
     51 def dbgPrint(msg):
     52     global PRINT_LOCK
     53     if PRINT_LOCK.acquire(10):
     54         print msg
     55     PRINT_LOCK.release()
     56 
     57 import inspect
     58 def lineno():
     59     """Returns the current line number in our program."""
     60     line=inspect.currentframe().f_back.f_lineno
     61     return str(line) 
     62     
     63 # for LeiPhone.com
     64 def SaveResLP(doc, filename, mode="a"):
     65     fp = None
     66     try:
     67         fp = open(filename, mode)
     68         fp.write(doc)
     69     except IOError as errStr:
     70         dbgPrint("lines:"+lineno())
     71         errPrint(True,errStr)
     72     finally:
     73         fp.close()
     74     return True
     75 
     76 # foe 36kr.com
     77 def SaveRes36K(doc, filename):
     78     return True
     79 
     80 class FileCollect:
     81     def __init__(self, root):
     82         if root[len(root)-1] != '\\':
     83             root+="\\"
     84         self.root = root
     85         self.dlist = []
     86         self.fqueue = Queue.Queue(0) 
     87     def init(self):
     88         for root, dirs, files in os.walk(self.root):
     89             self.dlist += dirs
     90             for afile in files: 
     91                 # if file ends with '.html',add it
     92                 if re.search('.html$',afile) is not None:
     93                     self.fqueue.put(root + afile)
     94         return True
     95 
     96 class ParseTask:
     97     def __init__(self, savedFileName=None):
     98         self.soup = None
     99         self.savedFileName = savedFileName
    100     def parse(self, readFileName):
    101         fp = None
    102         content = None
    103         try:
    104             fp = open(readFileName, "r")
    105             if fp is not None:
    106                 self.soup = BeautifulSoup(fp.read())
    107             else:
    108                 msg = "fopen" + readFileName + "failed"
    109                 errPrint(True,msg)
    110             content = self.soup.find("article")
    111             if content is not None:
    112                 #self.soup = BeautifulSoup(str(content))
    113                 # remove other tags
    114                 tag=content.find("p").find("a")
    115                 if not tag:
    116                     return False
    117                 tag.clear()
    118                 
    119                 tag=content.find("footer")
    120                 if not tag:
    121                     return False
    122                 tag.clear()
    123 
    124                 tag=content.find(class_="alipayzone")
    125                 if not tag:
    126                     return False
    127                 tag.clear()
    128                 
    129                 tag=content.find(class_="authorpigtwo")
    130                 if not tag:
    131                     return False
    132                 tag.clear()
    133                 
    134                 tag=content.find(id="jiathis_style_32x32")
    135                 if not tag:
    136                     return False
    137                 tag.clear()
    138                 
    139                 tag=content.find(class_="wumii-hook")
    140                 if not tag:
    141                     return False
    142                 tag.clear()
    143                 
    144                 tag=content.find("center")
    145                 if not tag:
    146                     return False
    147                 tag.clear()
    148                 
    149                 tags=content.find_all(rel="bookmark")
    150                 for tag in tags:
    151                     tag.clear()
    152                 SaveResLP(str(content), self.savedFileName)
    153             else:
    154                 return False
    155             # file handled done 
    156             return True
    157         except IOError as errStr:
    158             errPrint(True,errStr)
    159         except Exception as errStr:
    160             dbgPrint("lines:"+lineno())
    161             errPrint(True,errStr)
    162             #errPrint(True,errStr)
    163         finally:
    164             if fp is not None:
    165                 fp.close()
    166             
    167 # get out filename,thread safety
    168 def newOutName():
    169     global OUT_CNT_LOCK
    170     # block here until get the lock
    171     if(OUT_CNT_LOCK.acquire(10)):
    172         # get the lock
    173         global OUT_CNT
    174         OUT_CNT+=1
    175         filename = str(OUT_FILE_PREFIX) + str(OUT_CNT) + str(".html")
    176         OUT_CNT_LOCK.release()
    177         return filename
    178         
    179         
    180 class TaskThread(threading.Thread):
    181     def __init__(self,tid, tname, queue):
    182         threading.Thread.__init__(self, name=tname)
    183         self.tid=tid
    184         self.queue = queue
    185         self.parserTask = None
    186         self.stop = False
    187         self.savedCnt = 0
    188     def run(self):
    189         outName = newOutName()
    190         self.parserTask = ParseTask()
    191         while not self.stop:
    192             try:
    193                 # if no obj exist,throw exception
    194                 inName = self.queue.get_nowait()
    195                 dbgPrint("handle:" + inName)
    196                 self.parserTask.savedFileName = outName
    197                 if self.parserTask.parse(inName):
    198                     self.savedCnt+=1
    199                     if self.savedCnt > MAX_ITEM_CNT:
    200                         # create new saved file
    201                         outName = newOutName()
    202                         self.savedCnt = 0
    203                 else:
    204                     # parsed failed
    205                     continue
    206             except Queue.Empty:
    207                 self.stop = True
    208                 if self.savedCnt!=0:
    209                     msg = "ethread [" + self.name + "] out:'" + outName + "' with " + str(self.savedCnt) + " items success" 
    210                     errPrint(False,msg)
    211                 else:
    212                     msg = "ethread [" + self.name + "] exit with " + str(self.savedCnt) + " items" 
    213                     errPrint(False,msg)
    214                 return 
    215             except Exception as ex:
    216                 errPrint(True, "lines:"+lineno()+","+ex)
    217                 return 
    218                
    219 # main
    220 def main():
    221     taskThreads={}
    222     #fc = FileCollect("E:\project\python\Parser\page")
    223     fc = FileCollect("F:\myweb\leiphone\web")
    224     print "Start add files..."
    225     fc.init()
    226     print "Added files count:%d" % fc.fqueue.qsize()
    227     print("Starting threads ...")
    228     try:
    229         for tid in range(0,WORKER_NUM):
    230             tobj=TaskThread(tid,"thread-"+str(tid),fc.fqueue)
    231             taskThreads[tid]=tobj
    232             tobj.start()
    233         for tid in range(0,WORKER_NUM):
    234             taskThreads[tid].join()
    235     except Exception as ex:
    236         errPrint(True, ex)    
    237     print('All threads have terminated.')
    238 
    239 if __name__ == '__main__':
    240     main()
    241     afile="03-31-dan-talk-omgpop.html"
    242     if re.search('.html$',afile) is not None:
    243         print "matched!"
    244     else:
    245         print "mismatched!"
    246     if re.search('.jpg$',afile) is not None:
    247         print "matched2!"

    网页原内容:

    净化效果:

  • 相关阅读:
    Java启动工程时,加载固定数据到Map中(不用每次访问数据库)
    Java删除文件夹和其子文件、文件的拷贝和剪切
    EasyExcel导入工具(SpringMVC下使用)
    web工程启动时,在一个类中延迟加载Bean,因为该Bean类可能还没被JVM加载
    ECharts访问后台,JSON格式返回数据实例
    Java 实现缓存,一个线程存,一个线程取
    IE浏览器使用VLC实时显示视频(海康、大华)
    Windows和Linux下 Java开发ping工具类
    Quartz定时器+Spring + @Autowired注入 空指针异常
    ubuntu 安装mysql
  • 原文地址:https://www.cnblogs.com/yixiaoyang/p/2786171.html
Copyright © 2020-2023  润新知