# XPath is a language for searching content inside an XML document.
# The same XPath syntax is also used to query HTML documents.
from lxml import etree

# Sample document used by the queries below.
xml = """
<book>
    <id>1</id>
    <name>野花遍地⾹</name>
    <price>1.23</price>
    <nick>臭⾖腐</nick>
    <author>
        <nick id="10086">周⼤强</nick>
        <nick id="10010">周芷若</nick>
        <nick class="joy">周杰伦</nick>
        <nick class="jolin">蔡依林</nick>
        <div>
            <nick>惹了</nick>
        </div>
    </author>
    <partner>
        <nick id="ppc">胖胖陈</nick>
        <nick id="ppbc">胖胖不陈</nick>
    </partner>
</book>
"""

# etree.XML builds the tree from an in-memory string
# (etree.parse is the counterpart that loads a file).
tree = etree.XML(xml)

# XPath notes:
#   "/"  separates levels; the leading "/" starts at the root node.
#   "//" selects descendants at any depth.
#   "*"  is a wildcard that matches any element.
# Other queries tried against this document:
#   tree.xpath("/book/name")
#   tree.xpath("/book/name/text()")
result = tree.xpath("/book/*//nick/text()")
print(result)
# XPath is a language for searching content inside an XML document.
# The same XPath syntax is also used to query HTML documents.
from lxml import etree

# Sample document from the in-memory XML experiment (not queried below).
xml = """
<book>
    <id>1</id>
    <name>野花遍地⾹</name>
    <price>1.23</price>
    <nick>臭⾖腐</nick>
    <author>
        <nick id="10086">周⼤强</nick>
        <nick id="10010">周芷若</nick>
        <nick class="joy">周杰伦</nick>
        <nick class="jolin">蔡依林</nick>
        <div>
            <nick>惹了</nick>
        </div>
    </author>
    <partner>
        <nick id="ppc">胖胖陈</nick>
        <nick id="ppbc">胖胖不陈</nick>
    </partner>
</book>
"""

# Earlier experiment on the string above, kept for reference:
#   tree = etree.XML(xml)
#   result = tree.xpath("/book/name")            # "/" separates levels; the first "/" is the root
#   result = tree.xpath("/book/name/text()")
#   result = tree.xpath("/book/*//nick/text()")  # "//" = descendants, "*" = any element
#   print(result)

# etree.parse loads a document from a file; HTMLParser is used because
# the input is HTML rather than XML.
html_parser = etree.HTMLParser()
tree = etree.parse("b.html", parser=html_parser)

# Attribute predicate example:
#   result = tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")
ol_list = tree.xpath("/html/body/ul/li")

for li in ol_list:
    # Keep searching relative to the current <li> ("./" means "from this node").
    # Use "./a/text()" instead to get the link text rather than the href.
    res = li.xpath("./a/@href")
    print(res)

# "[1]" selects the first <div> under <body> (XPath positions start at 1).
res1 = tree.xpath("/html/body/div[1]/text()")
print(res1)
以上是 XPath 的基本语法。就目前学到的三种爬虫解析方式来说,XPath 应该是最简单的,正则表达式是最万能的。XPath 的主要优势在于:对于页面上的元素,可以直接复制它的 XPath 路径来获取内容,不需要自己去分析源代码。