from lxml import etree

# One shared HTML parser (UTF-8 input) used for both documents.
parser = etree.HTMLParser(encoding="utf-8")
html = etree.parse("test.html", parser=parser)
# html2 is only referenced by reference example 4 below.
html2 = etree.parse("lagou.html", parser=parser)

# Reference notes: xpath() returns a list, very often holding just one element.
#
# 1. Extract every tr element:
#    trs=html.xpath("//tr")
#    for tr in trs:
#        print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
#
# 2. Extract a single tr by position (XPath indices are 1-based, so tr[1] is
#    the first tr under its parent):
#    tr=html.xpath("//tr[1]")[0]
#    print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
#
# 3. Extract the table whose border attribute equals '2px':
#    border=html.xpath("//table[@border='2px']")[0]
#    print(etree.tostring(border,encoding="utf-8").decode("utf-8"))
#
# 4. Collect the href value of every a element:
#    aList=html2.xpath("//a/@href")
#    for a in aList:
#        print(a)

# 5. Collect the per-row information.
# Maps each output key to the XPath evaluated relative to a row. A leading
# ".//" or "./" searches from the current element, whereas a bare "//" would
# search the whole document again. "./td//text()" would instead gather the
# text of every cell in the row at once.
FIELD_XPATHS = {
    "herf": ".//a/@href",
    "text": "./td[1]//text()",  # text nodes inside the row's first td
    "language": "./td[2]//text()",
    "price": "./td[3]//text()",
    "num": "./td[4]//text()",
    "name": "./td[5]//text()",
}

# Every tr except the first one under its parent.
trs = html.xpath("//tr[position()>1]")
positions = []
for tr in trs:
    position = {key: tr.xpath(expr) for key, expr in FIELD_XPATHS.items()}
    # Keep the row only when at least one field matched something; a row in
    # which every lookup returned an empty list is dropped.
    if any(position.values()):
        positions.append(position)
print(positions)