python解析xml文档
1,DOM(基于对象) 主要思想:从根节点开始按照标签名逐层查找
from xml.dom import minidom
# Parse the XML file into a DOM document.
DomTree = minidom.parse('path')
# Root element of the document.
annotation = DomTree.documentElement
# Look up elements by tag name. Each call returns a list-like NodeList;
# note that getElementsByTagName searches all descendants, not only the
# direct children of the element it is called on.
folder = annotation.getElementsByTagName("folder")
filename = annotation.getElementsByTagName("filename")
source = annotation.getElementsByTagName("source")
size = annotation.getElementsByTagName("size")
segmented = annotation.getElementsByTagName("segmented")
obj = annotation.getElementsByTagName("object")
# Search for <part> elements inside the first <object> of the list.
part = obj[0].getElementsByTagName("part")
for i in part:
    # childNodes[0] of <name> is its Text node (not a str); read .data to
    # get the string, as done for the coordinates below.
    name = i.getElementsByTagName("name")[0].childNodes[0]
    bndbox = i.getElementsByTagName("bndbox")[0]
    # About childNodes, given markup such as
    #     <a>
    #     <x></x>
    #     </a>
    # a.childNodes[0] is the non-tag text between <a> and <x> (including the
    # newline or any other characters), and a.childNodes[1] is the <x>
    # element itself.
    # element.getAttribute("id") returns the value of an attribute.
    xmin = bndbox.getElementsByTagName("xmin")[0].childNodes[0].data
    ymin = bndbox.getElementsByTagName("ymin")[0].childNodes[0].data
    xmax = bndbox.getElementsByTagName("xmax")[0].childNodes[0].data
    ymax = bndbox.getElementsByTagName("ymax")[0].childNodes[0].data
    print(name, bndbox, xmin, ymin, xmax, ymax,
          type(bndbox.getElementsByTagName("xmin")[0]))
注:
getElementsByTagName() ----> 返回 NodeList,其中每一项都是 Element 对象
Element.childNodes ------> 返回 NodeList,其中每一项是 Node 对象(可能是 Text 文本节点,也可能是 Element 节点)
2,SAX (基于事件):比dom更多的控制,更有效率,但需要的工作较多
import xml.sax
class AnnotationHandler(xml.sax.ContentHandler):
    """Skeleton SAX content handler.

    The three callbacks are placeholders that do nothing; fill them in to
    react to parsing events.
    """

    def __init__(self):
        # The original spelled this ``___init__`` (three leading
        # underscores), so it was never invoked and the attributes below
        # were never created.
        super().__init__()
        self.firstAttr = ""
        self.secAttr = ""

    def startElement(self, tag, attribute):
        """Called by the parser when an opening tag is encountered."""

    def characters(self, content):
        """Called by the parser with character data found inside an element."""

    def endElement(self, tag):
        """Called by the parser when a closing tag is encountered."""
# Create a SAX parser and switch namespace processing off.
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
# Register the handler that receives the parsing events, then parse the file.
handler = AnnotationHandler()
parser.setContentHandler(handler)
parser.parse("path")
详见 https://www.cnblogs.com/hongfei/p/python-xml-sax.html
3,ElementTree 将xml转化为tree
import xml.etree.ElementTree as et
tree = et.parse('path')
root = tree.getroot()  # root element of the parsed tree
print(root.tag, root.attrib)  # tag name and attribute dict of the root
# Iterating an element yields its direct children; root[n] indexes them too.
for i in root:
    print(i.tag, i.text)
# i.text is the non-tag text that appears inside <i> before its first child
# element. Examples:
#
#   <i>xx</i>          i.text == "xx"
#
#   <i>
#   <x></x>            i.text == "\n" (the line break after <i>)
#   </i>
#
#   <i><x></x>         i.text is None (no text precedes <x>)
#   </i>
xml文件
<annotation>
<folder>VOC2012</folder>
<filename>2007_000027.jpg</filename>
<source>
<database>The VOC2007 Database</database>
<annotation>PASCAL VOC2007</annotation>
<image>flickr</image>
</source>
<size>
<width>486</width>
<height>500</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>person</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>174</xmin>
<ymin>101</ymin>
<xmax>349</xmax>
<ymax>351</ymax>
</bndbox>
<part>
<name>head</name>
<bndbox>12
<xmin>169</xmin>
<ymin>104</ymin>
<xmax>209</xmax>
<ymax>146</ymax>
</bndbox>
</part>
<part>
<name>hand</name>
<bndbox>
<xmin>278 asd</xmin>
<ymin>210</ymin>
<xmax>297</xmax>
<ymax>233</ymax>
</bndbox>
</part>
<part>
<name>foot</name>
<bndbox>
<xmin>273</xmin>
<ymin>333</ymin>
<xmax>297</xmax>
<ymax>354</ymax>
</bndbox>
</part>
<part>
<name>foot</name>
<bndbox>
<xmin>319</xmin>
<ymin>307</ymin>
<xmax>340</xmax>
<ymax>326</ymax>
</bndbox>
</part>
</object>
</annotation>