# Walk-through of common lxml XPath queries on small HTML snippets.
#
# Each section parses a snippet with etree.HTML() and prints the result of
# one or more xpath() calls. A sample of the printed output is recorded in
# the comment block at the end of the script.
from lxml import etree

# NOTE: the attribute on the <a> tags is spelled "herf" in this sample markup.
# The queries below use the same spelling, so they match; do not "fix" one
# side without the other.
text = '''
<div>
<ul>
<li class = "item-0"><a herf = "link1.html">first item</a></li>
<li class = "item-1"><a herf = "link2.html">second item</a></li>
<li class = "item-inactive"><a herf = "link3.html">third item</a></li>
<li class = "item-1"><a herf = "link4.html">fourth item</a></li>
<li class = "item-0"><a herf = "link5.html">fifth item</a></li>
</ul>
</div>
'''
html = etree.HTML(text)

# Serialise the parsed tree back to HTML text. The value is computed for
# illustration only and is not printed.
result = etree.tostring(html)

# Select every node in the document.
code_all = html.xpath("//*")
# Select every <li> node.
code_li = html.xpath("//li")
# Select the <a> children of every <li> node.
code_a = html.xpath("//li/a")
# Starting from a known child node, step up to its parent and read the
# parent's class attribute.
code_p = html.xpath("//a[@herf = 'link4.html']/../@class")
print(code_p)
print(code_li)
print("///")
print(code_all)
print("///")
print(code_a)

# Attribute matching: <li> nodes whose class is exactly "item-0".
attribute = html.xpath("//li[@class = 'item-0']")
print(attribute)

# Text retrieval. The <li> nodes in this snippet hold their text inside the
# nested <a>, not directly, so this query prints an empty list.
# (Stored under its own name so the HTML source in `text` is not overwritten.)
li_text = html.xpath("//li/text()")
print(li_text)

# Attribute retrieval: the "herf" value of every <a> under an <li>.
attribute_get = html.xpath("//li/a/@herf")
print(attribute_get)

# Matching an attribute that holds several values: class is "li li-fist",
# so an equality test on 'li' would not match; contains() does.
text1 = """
<li class = "li li-fist"><a href = "link.html">first item</a></li>
"""
html1 = etree.HTML(text1)
attribute_number = html1.xpath("//li[contains(@class,'li')]/a/text()")
print(attribute_number)

# Matching on several attributes at once, combined with `and`.
# NOTE: "calss" is the spelling used in both the markup and the query.
text2 = """
<li calss = "li li-first" name = "name"><a href = "link.html">first item</a></li>
"""
html2 = etree.HTML(text2)
attribute_text2 = html2.xpath("//li[contains(@calss,'li') and @name = 'name']/a/text()")
print(attribute_text2)

# Selecting by position.
# Sometimes a query matches several nodes but only one of them is wanted.
# In that case, pass an index (or a positional function) in square brackets
# to pick the node at a specific position.
text3 = '''
<div>
<ul>
<li class = "item-0"><a herf = "link1.html">first item</a></li>
<li class = "item-1"><a herf = "link2.html">second item</a></li>
<li class = "item-inactive"><a herf = "link3.html">third item</a></li>
<li class = "item-1"><a herf = "link4.html">fourth item</a></li>
<li class = "item-0"><a herf = "link5.html">fifth item</a></li>
</ul>
</div>
'''
html3 = etree.HTML(text3)
# First <li> node (XPath positions start at 1).
result = html3.xpath("//li[1]/a/text()")
print(result)
# Last <li> node.
result = html3.xpath("//li[last()]/a/text()")
print(result)
# <li> nodes whose position is less than 3.
result = html3.xpath("//li[position() < 3]/a/text()")
print(result)

# Selecting along node axes.
# All ancestors of the first <li>; the trailing * matches any node name.
result = html3.xpath("//li[1]/ancestor::*")
print(result)
# Only the <div> ancestor.
result = html3.xpath("//li[1]/ancestor::div")
print(result)
# All attributes of the first <li>.
result = html3.xpath("//li[1]/attribute::*")
print(result)

# Sample output from one run (element addresses differ between runs):
#
# ['item-1']
# [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489c948>,
#  <Element li at 0x7f72f489c9c8>, <Element li at 0x7f72f489ca08>,
#  <Element li at 0x7f72f489ca88>]
# ///
# [<Element html at 0x7f72f489c808>, <Element body at 0x7f72f489c788>,
#  <Element div at 0x7f72f489c748>, <Element ul at 0x7f72f489c848>,
#  <Element li at 0x7f72f489c888>, <Element a at 0x7f72f489c908>,
#  <Element li at 0x7f72f489c948>, <Element a at 0x7f72f489c988>,
#  <Element li at 0x7f72f489c9c8>, <Element a at 0x7f72f489c8c8>,
#  <Element li at 0x7f72f489ca08>, <Element a at 0x7f72f489ca48>,
#  <Element li at 0x7f72f489ca88>, <Element a at 0x7f72f489cac8>]
# ///
# [<Element a at 0x7f72f489c908>, <Element a at 0x7f72f489c988>,
#  <Element a at 0x7f72f489c8c8>, <Element a at 0x7f72f489ca48>,
#  <Element a at 0x7f72f489cac8>]
# [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489ca88>]
# []
# ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
# ['first item']
# ['first item']
# ['first item']
# ['fifth item']
# ['first item', 'second item']
# [<Element html at 0x7f72f489cdc8>, <Element body at 0x7f72f489cec8>,
#  <Element div at 0x7f72f489cf48>, <Element ul at 0x7f72f489cf08>]
# [<Element div at 0x7f72f489cf48>]
# ['item-0']