• lxml.html删除节点树和tag对


    # encoding: utf-8
    import StringIO

    from apihelper import info, info_save
    from lxml import etree, html
    from lxml.html.clean import Cleaner


    # Sample HTML document used as the parser input further down.  It mixes
    # <script>/<link>/<style> elements in <head> with, inside <body>, inline
    # event handlers, javascript: links, an <object>, an <iframe>, a password
    # form, a <blink>, an <image> and a final <div id='nav'>.
    strhtml = '''
      <html>
       <head>
         <script type="text/javascript" src="evil-site"></script>
         <link rel="alternate" type="text/rss" src="evil-rss">
         <style>
           body {background-image: url(javascript:do_evil)};
           div {color: expression(evil)};
         </style>
       </head>
       <body onload="evil_function()">
         <!-- I am interpreted for EVIL! -->
         <a href="javascript:evil_function()">a link</a>
         <a href="#" onclick="evil_function()">another link</a>
         <p onclick="evil_function()">a paragraph</p>
         <div style="display: none">secret EVIL!</div>
         <object> of EVIL! </object>
         <iframe src="evil-site"></iframe>
         <form action="evil-site">
           Password: <input type="password" name="password">
         </form>
         <blink>annoying EVIL!</blink>
         <a href="evil-site">spam spam SPAM!</a>
         <image src="evil!">
         <div id='nav' class='nav'>this is nav</div>
       </body>
      </html>'''

    # strhtml = '''<html><head></head>
    # <body><div>aaa</div></body>
    # </html>'''

    # Module-level lxml debug switch, set exactly as in the original snippet.
    # NOTE(review): the precise effect of etree.DEBUG is not visible here --
    # confirm against the lxml documentation before relying on it.
    etree.DEBUG = 1

    # Single-argument print(...) produces the same output under Python 2 and
    # Python 3, unlike the Python-2-only print statement.
    print(etree.LIBXML_VERSION)

    # HTML parser with the input encoding forced to UTF-8.
    utf8_parser = html.HTMLParser(encoding='utf8')
    doc = html.fromstring(strhtml, parser=utf8_parser)

    # lxml.html offers two ways to remove an element:
    #   drop_tag()  - removes only the tag pair; its text and children are
    #                 merged into the parent
    #   drop_tree() - removes the element together with its children and text
    # Alternatives that target just the <object> element:
    # doc.find('.//body/object').drop_tag()
    # doc.find('.//body/object').drop_tree()
    doc.find('.//body').drop_tree()

    # Uncomment one of these to inspect the resulting document:
    # print(info_save(doc))
    # print(html.tostring(doc))

  • 相关阅读:
    Eclipse 插件开发 —— 深入理解查找(Search)功能及其扩展点
    Spring Auto Scanning Components
    SSH架构简单总结
    eclipse进行开发
    jasper ireport create a report with parameters without sql query
    VARCHAR2转换为CLOB碰到ORA-22858错误
    cannot find w3wp.exe in VS
    10 things you should know about NoSQL databases
    Notifications Nagios
    Serializable
  • 原文地址:https://www.cnblogs.com/zhang-pengcheng/p/4291496.html
Copyright © 2020-2023  润新知