from lxml import etree

# Sample markup fed to the HTML parser. Note that the final <li> has no
# closing tag; the snippet is passed to the parser exactly as written.
text = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html">third item</a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a> '
    '</ul> </div> '
)

# Parse the snippet into an element tree.
html = etree.HTML(text)

# Serialize the tree with pretty-printing (tostring() yields bytes here),
# then decode those bytes into a str for display.
raw_bytes = etree.tostring(html, pretty_print=True)
result = raw_bytes.decode("utf-8")

print(result)
from lxml import etree

# Sample markup: a list of five links, the third wrapped in a <span>.
# The stray ">" that followed "</span>" has been removed so that it no
# longer ends up as literal text inside the third link.
text = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html">'
    '<span class="bold">third item</span></a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a></li> '
    '</ul> </div> '
)

# Parse the markup into an element tree that can be queried with XPath.
html = etree.HTML(text)

# Serialize the tree back to markup; with a byte encoding the result is bytes.
result = etree.tostring(html, encoding="utf-8")

print(type(html))  # <class 'lxml.etree._Element'>
print(type(result))  # <class 'bytes'>

# Decode the serialized bytes so the markup prints as readable text.
print(result.decode("utf-8"))