使用lxml:
import lxml.etree
import lxml.html

# Load the saved page from disk.
# NOTE(review): `data` is not used below -- the hard-coded sample `html_str`
# is what actually gets parsed; confirm which input is intended.
with open('/tmp/hzh/a.html', 'r') as file:
    data = file.read()

# Sample markup that is converted to plain text.
html_str = '<p>hzh。</p> <p> l1</p>'
root = lxml.html.fromstring(html_str)

# Optionally drop nodes that browsers do not render (comments, scripts, the
# document head); append further tag names to remove more.
# with_tail=False keeps the text that follows a removed node. The default
# (with_tail=True) deletes that trailing text together with the node, which
# silently loses visible content such as the "b" in "<p>a<!--c-->b</p>".
lxml.etree.strip_elements(
    root, lxml.etree.Comment, "script", "head", with_tail=False
)

# Serialize only the text content of the remaining tree.
result_str = lxml.html.tostring(root, method="text", encoding='unicode')
print(result_str)