from lxml import etree

# Parse the local file ./test.html with lxml's HTML parser. `html` is the
# parsed tree that every XPath example below is written against.
html = etree.parse('./test.html', etree.HTMLParser())

# ---------------------------------------------------------------------------
# XPath reference examples (intentionally left commented out).
#
# The "Recorded output" lines were captured by the original author against
# their own test.html; element addresses (0x...) differ on every run, and the
# exact whitespace inside text nodes depends on that file.
# ---------------------------------------------------------------------------

# Select every <li> node; the result is returned as a list.
# result = html.xpath('//li')
# print(result)
# print(result[0])
# Recorded output:
#   [<Element li at 0x119b71b88>, <Element li at 0x119b71bc8>,
#    <Element li at 0x119b71c08>, <Element li at 0x119b71c48>,
#    <Element li at 0x119b71c88>]
#   <Element li at 0x119b71b88>

# Select all <li> elements in the document whose class attribute equals
# "item-0"; the result is a list of elements.
# result = html.xpath('//li[@class="item-0"]')
# print(result)
# Recorded output:
#   [<Element li at 0x1162f0d08>, <Element li at 0x1162f0d48>]

# Select the text directly inside the <li> elements whose class attribute
# equals "item-0".
# result = html.xpath('//li[@class="item-0"]/text()')
# print(result)
# Recorded output:
#   [' ']

# Select the text of the <a> elements inside the <li> elements whose class
# attribute equals "item-0".
# result = html.xpath('//li[@class="item-0"]/a/text()')
# print(result)
# Recorded output:
#   ['first item', 'fifth item']

# Select every text node at any depth below the <li> elements whose class
# attribute equals "item-0".
# result = html.xpath('//li[@class="item-0"]//text()')
# print(result)
# Recorded output:
#   ['first item', 'fifth item', ' ']

# Select the href attribute of every <a> that is a child of an <li>.
# result = html.xpath('//li/a/@href')
# print(result)
# Recorded output:
#   ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']

# Select the class attribute of the parent of the <a> whose href is
# "link4.html", using the ".." shorthand.
# result = html.xpath('//a[@href="link4.html"]/../@class')
# print(result)
# Recorded output:
#   ['item-1']

# Same query as above, written with the explicit parent:: axis.
# result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
# print(result)
# Recorded output:
#   ['item-1']

# Select the class attribute of every <li>.
# result = html.xpath('//li/@class')
# print(result)
# Recorded output:
#   ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']

# Select every <a> that is a direct child of an <li>.
# result = html.xpath('//li/a')
# print(result)
# Recorded output:
#   [<Element a at 0x113e35c88>, <Element a at 0x113e35cc8>,
#    <Element a at 0x113e35d08>, <Element a at 0x113e35d48>,
#    <Element a at 0x113e35d88>]

# Select every <a> at any depth below a <ul>.
# result = html.xpath('//ul//a')
# print(result)
# Recorded output:
#   [<Element a at 0x117874c88>, <Element a at 0x117874cc8>,
#    <Element a at 0x117874d08>, <Element a at 0x117874d48>,
#    <Element a at 0x117874d88>]

# Select every <a> that is a direct child of a <ul>.
# result = html.xpath('//ul/a')
# print(result)
# Recorded output:
#   []