爬虫08-xpath语法练习

from  lxml import  etree
# One HTML parser, configured for UTF-8 input, is shared by both documents.
parser = etree.HTMLParser(encoding="utf-8")

# Parse the two local practice pages in order: test.html first, then lagou.html.
html, html2 = (
    etree.parse(path, parser=parser) for path in ("test.html", "lagou.html")
)
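# html.xpath() always returns a list; for element paths the items are Element objects, so index it (e.g. [0]) to get a single tag.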

# 1.提取所有tr标签
# trs=html.xpath("//tr")
# for tr in trs:
#     print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))

# 2. Extract the second tr tag (XPath indices are 1-based, so the second one is tr[2])
# tr=html.xpath("//tr[2]")[0]
# print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))

#3.提取table下border为1px的标签
# border=html.xpath("//table[@border='2px']")[0]
# print(etree.tostring(border,encoding="utf-8").decode("utf-8"))

#4.获取a标签下href的值
# aList=html2.xpath("//a/@href")
# for a in aList:
#     print(a)

#5.获取所有的标签内信息
# 5. Collect the per-row information from the table.
#
# Output key -> XPath evaluated relative to one <tr>.  A path starting with
# "./" or ".//" searches beneath the current row only; a bare "//" would
# search the whole document again.
# NOTE: the key "herf" is a misspelling of "href"; it is kept unchanged
# because it is part of the printed output.
field_xpaths = {
    "herf": ".//a/@href",
    "text": "./td[1]//text()",  # all text nodes inside the row's first td
    "language": "./td[2]//text()",
    "price": "./td[3]//text()",
    "num": "./td[4]//text()",
    "name": "./td[5]//text()",
}
# Alternative: tr.xpath("./td//text()") returns the text of every td in the
# row in one call.

# position()>1 skips every tr that is the first tr child of its parent.
trs = html.xpath("//tr[position()>1]")
positions = []
for tr in trs:
    # Each query returns a (possibly empty) list of strings.
    position = {key: tr.xpath(path) for key, path in field_xpaths.items()}
    # Keep the row only if at least one query matched something.
    if any(position.values()):
        positions.append(position)

print(positions)

相关阅读:
LINK : fatal error LNK1123: 转换到 COFF 期间失败: 文件无效或损坏
RTSP可用网络流
Linux访问Github缓慢
Ubu18.0-NVIDIA显卡驱动重装
FFMPEG第一次学习
QT-守护程序
QT-局域网探测工具(简易版)--Ping
QT-notepad++仿写
Ubuntu 解压文件
Ubuntu -换源

原文地址：https://www.cnblogs.com/wcyMiracle/p/12464907.html