1. Introduction
XPath is a language for locating information in XML and HTML documents; it can be used to traverse the elements and attributes of both.
Installing an XPath tool
Chrome extension: XPath Helper
In Chrome, open the menu in the top-right corner: More tools → Extensions → Chrome Web Store → install XPath Helper (accessing the Web Store requires a proxy/VPN from mainland China).
2. Syntax in detail
#1. Selecting nodes
'''
/    At the start of an expression, selects from the root node; otherwise selects a direct child of a node (children only, one generation down).
         /html        finds 1 result
         /div         finds 0 results, because the root has only one child node, html
         /html/body   finds 1 result
//   Selects descendant nodes at any depth.
         //head/script
         //div
.    Selects the current node.
..   Selects the parent of the current node.
@    Selects an attribute.
         //div[@id]   selects all div elements that have an id attribute, e.g.
         <div id="sidebar" class="sidebar" data-lg-tj-track-code="index_navigation" data-lg-tj-track-type="1">
'''

#2. Predicates
'''
A predicate locates a specific node, or a node containing a specific value, and is written inside square brackets.
//body/div[1]                the first div under body
//body/div[last()]           the last div under body
//body/div[position()<3]     elements under body whose position is less than 3
//div[@id]                   div elements that have an id attribute, e.g.
<div id="sidebar" class="sidebar" data-lg-tj-track-code="index_navigation" data-lg-tj-track-type="1">
//input[@id="serverTime"]    input elements with id="serverTime"

Fuzzy matching
//div[contains(@class,'f1')]    divs whose class attribute contains 'f1'

Wildcard *
//body/*                all elements under body
//div[@*]               div elements that have at least one attribute
//div[@id='footer']//div        all div elements under the div with id='footer'
//div[@class='job_bt']
//dd[@class='job-advantage']

Operators
//div[@class='job_detail' and @id='job_tent']
//book/title | //book/price     selects all title and price elements of book elements
(you can also search Baidu for "XPath syntax" for the full reference)
.//a/text()                     the text of all a tags under the current tag
//tr[position()>1 and position()<11]    positions greater than 1 and less than 11
'''

#Points worth noting
'''
1. / vs //: / selects children, // selects descendants; // is used far more often.
2. contains: when an attribute holds several values, use the contains function, e.g. //div[contains(@class,'lg')]
3. Indices in predicates start at 1, not 0.
'''
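As a quick check of the expressions above, here is a minimal sketch on a made-up HTML snippet (it uses lxml, which section 3 below installs):

from lxml import etree

# a made-up snippet to try the selectors on
doc = etree.HTML("""
<html><body>
  <div id="footer">
    <div class="f1 lg">A</div>
    <div class="f2">B</div>
  </div>
</body></html>
""")
print(doc.xpath("//div[@id='footer']//div"))             # the two inner div elements
print(doc.xpath("//div[contains(@class,'f1')]/text()"))  # ['A']
print(doc.xpath("//body/div[1]/@id"))                    # ['footer']
print(doc.xpath("//div[@*]"))                            # every div that has at least one attribute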
3. To use XPath in Python, import the lxml library.
lxml is implemented in C; after a plain pip3 install lxml the IDE may show some inspection warnings, but they do not affect usage.
Still, a programmer's code-cleanliness instinct means squiggly lines bother me, so I download the lxml whl file from https://www.lfd.uci.edu/~gohlke/pythonlibs/ and install it with pip (pick the file that matches your Python version).
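After installing, a quick import test confirms the library works (a minimal sanity check, nothing more):

from lxml import etree

print(etree.LXML_VERSION)                    # e.g. (4, 2, 5, 0)
root = etree.HTML('<p>hello</p>')            # parse a trivial document
print(etree.tostring(root).decode('utf-8'))  # <html><body><p>hello</p></body></html>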
4. Using lxml and XPath together
# -*-coding:utf8 -*-
from lxml import etree

#1. Get all tr tags
#2. Get the 2nd tr tag
#3. Get all tags whose class equals "even"
#4. Get the href attribute of all a tags
#5. Get all job postings (plain text)

parser=etree.HTMLParser(encoding='utf-8')
html=etree.parse('tencent.html',parser=parser)

#1. Get all tr tags
#the xpath function returns a list
# trs=html.xpath('//tr')
# print(trs)
# for tr in trs:
#     print(etree.tostring(tr,encoding='utf-8').decode('utf-8'))

#2. Get the 2nd tr tag
# trs=html.xpath('//tr[2]')[0]
#Looking up the 2nd tr directly like this actually finds the second tr under every table.
#To be more precise, find the table tag first, then the second tr under that table:
# trs=html.xpath('//table[@class="tablelist"]//tr[2]')[0]
# print(etree.tostring(trs,encoding='utf-8').decode('utf-8'))

#3. Get all tags whose class equals "even"
# trs=html.xpath("//tr[@class='even']")
# for tr in trs:
#     print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

#4. Get the href attribute of all a tags
# a_list=html.xpath('//a/@href')
# for a in a_list:
#     print(a)

#5. Get all job postings (plain text)
trs=html.xpath('//tr[position()>1 and position()<11]')
positions=[]
for tr in trs:
    #with a leading //, the search always starts from the whole document and ignores the current tr
    # href=tr.xpath('//a')
    #with a leading ., the search is limited to the current tag
    href=tr.xpath('.//a/@href')[0]
    fullurl='http://hr.tencent.com/'+href
    #the title text is not a direct child of td[1], so use ./td[1]//text()
    title=tr.xpath('./td[1]//text()')[0]
    category=tr.xpath('./td[2]/text()')[0]
    nums=tr.xpath('./td[3]/text()')[0]
    address=tr.xpath('./td[4]/text()')[0]
    pubtime=tr.xpath('./td[5]/text()')[0]
    position={
        'url':fullurl,
        'title':title,
        'category':category,
        'nums':nums,
        'address':address,
        'pubtime':pubtime
    }
    positions.append(position)
# print(positions)

#6. Plain text can also be extracted with string()
# print(html.xpath("string(//tr[1])"))
# trs=html.xpath('//tr')
# for tr in trs:
#     print(tr.xpath("string(.)").strip())
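One point the last example glosses over: .//text() returns a list with one entry per text node, while string(.) concatenates them into a single string. A small sketch with made-up markup:

from lxml import etree

div = etree.HTML('<div><a href="#">Python</a> developer</div>').xpath('//div')[0]
print(div.xpath('.//text()'))   # ['Python', ' developer'] -- one entry per text node
print(div.xpath('string(.)'))   # Python developer -- everything joined into one string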
5. Hands-on example: a Douban movies scraper
# -*-coding:utf8 -*-
#1. Fetch the target page from the website
#2. Extract data from the fetched page according to certain rules
import requests
from lxml import etree

#1. Fetch the target page from the website
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Referer':'https://www.douban.com/',
}
url='https://movie.douban.com/'
response=requests.get(url,headers=headers)
text=response.text
html=etree.HTML(text)
ul=html.xpath("//ul[@class='ui-slide-content']")[0]
# print(etree.tostring(ul,encoding='utf-8').decode('utf-8'))
lis=ul.xpath('./li[@data-title]')
movies=[]
for li in lis:
    title=li.xpath('@data-title')[0]
    score=li.xpath('@data-rate')[0]
    duration=li.xpath('@data-duration')[0]
    region=li.xpath('@data-region')[0]
    director=li.xpath('@data-director')[0]
    actors=li.xpath('@data-actors')[0]
    thumbnail=li.xpath('.//img/@src')[0]
    movie={
        'title':title,
        'score':score,
        'duration':duration,
        'region':region,
        'director':director,
        'actors':actors,
        'thumbnail':thumbnail
    }
    movies.append(movie)
print(movies)
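Since all the metadata here lives in data-* attributes, lxml's Element.get() is an equivalent, slightly terser alternative to li.xpath('@data-title')[0]. A small sketch with made-up markup:

from lxml import etree

li = etree.HTML('<ul><li data-title="Example" data-rate="8.5"></li></ul>').xpath('//li')[0]
print(li.get('data-title'))   # Example -- same value as li.xpath('@data-title')[0]
print(li.get('data-rate'))    # 8.5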
6. Hands-on example: a dytt8.net (Movie Heaven) scraper
# -*-coding:utf8 -*-
import requests
from lxml import etree

# url='https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
# response=requests.get(url,headers=headers)
# On dytt8 pages requests guesses the encoding wrong, so response.text comes out garbled
# print(response.text)
# text=response.content.decode('gbk')

BaseDomain = 'https://www.dytt8.net'

def get_detail_url(url):
    response = requests.get(url, headers=headers)
    # print(response.encoding)  # the default decoding is ISO-8859-1
    # text=response.content.decode('gbk')
    # Decoding with gbk hit a snag: page 5 contains special characters that fail to decode.
    # Presumably lxml's default decoding disagrees with gbk; passing response.text directly
    # works here because only ASCII URLs are being extracted, so no explicit decoding is needed.
    html = etree.HTML(response.text)
    detail_urls = html.xpath('//table[@class="tbspan"]//a/@href')
    detail_urls = list(map(lambda url: BaseDomain + url, detail_urls))
    return detail_urls

def parse_detail_page(url):
    response = requests.get(url, headers=headers)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//font[@color='#07519a' and position()=1]/text()")[0]
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    screenshot = imgs[1]
    infos = zoomE.xpath(".//text()")

    movie = {
        'title': title,
        'cover': cover,
        'screenshot': screenshot
    }

    def parse_info(info, rule):
        return info.replace(rule, '').strip()

    for index, info in enumerate(infos):
        if info.startswith('◎年 代'):
            movie['year'] = parse_info(info, '◎年 代')
        elif info.startswith('◎产 地'):
            movie['country'] = parse_info(info, '◎产 地')
        elif info.startswith('◎类 别'):
            movie['category'] = parse_info(info, '◎类 别')
        elif info.startswith('◎豆瓣评分'):
            movie['douban_rating'] = parse_info(info, '◎豆瓣评分')
        elif info.startswith('◎片 长'):
            movie['duration'] = parse_info(info, '◎片 长')
        elif info.startswith('◎导 演'):
            movie['director'] = parse_info(info, '◎导 演')
        elif info.startswith('◎主 演'):
            info = parse_info(info, '◎主 演')
            actors = [info]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎简 介'):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith('◎简 介 '):
            info = ''
            for x in range(index + 1, len(infos)):
                if infos[x].startswith('【下载地址】'):
                    break
                info = info + infos[x].strip()
            movie['profile'] = info

    download_url = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")[0]
    movie['download_url'] = download_url
    return movie

def spider():
    # url = ['https://www.dytt8.net/html/gndy/dyzz/list_23_%s.html' % i for i in range(1, 8)]
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for x in range(1, 8):
        url = base_url.format(x)
        detail_urls = get_detail_url(url)
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            movies.append(movie)
    print(movies)

if __name__ == '__main__':
    spider()
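The encoding trouble noted in the comments can also be handled without decoding response.content by hand: either tell requests the real encoding before reading .text, or decode with errors='ignore' to skip the undecodable page-5 characters. A minimal sketch (User-Agent shortened):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://www.dytt8.net/', headers=headers)

# option 1: override requests' ISO-8859-1 guess, then read .text as usual
response.encoding = 'gbk'
text = response.text

# option 2: decode manually, dropping bytes that gbk cannot decode
text = response.content.decode('gbk', errors='ignore')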
7. Hands-on example: a Tencent recruitment scraper
# -*-coding:utf8 -*-
import requests
from lxml import etree

base_url = 'https://hr.tencent.com/position.php?tid=87&start={}0#a'
base_domain = 'https://hr.tencent.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
positions=[]

def parse_url(url):
    detail_urls=[]
    response=requests.get(url,headers=headers)
    html=etree.HTML(response.text)
    trs=html.xpath("//table[@class='tablelist']//tr[position()>1 and position()<12]")
    for tr in trs:
        href=tr.xpath('.//a/@href')[0]
        url=base_domain+href
        detail_urls.append(url)
    return detail_urls

def parse_detail_page(url):
    response=requests.get(url,headers=headers)
    html=etree.HTML(response.text)
    zoomE=html.xpath('//table[@class="tablelist textl"]')[0]
    title=zoomE.xpath('.//tr[1]/td/text()')[0]
    city=zoomE.xpath('.//tr[2]/td[1]/text()')[0]
    category=zoomE.xpath('.//tr[2]/td[2]/text()')[0]
    nums=zoomE.xpath('.//tr[2]/td[3]/text()')[0]
    duty=zoomE.xpath('.//tr[3]//ul//text()')
    dutys=''
    for i in duty:
        dutys=dutys+i.strip()
    require=zoomE.xpath('.//tr[4]//ul//text()')
    requires=''
    for i in require:
        requires=requires+i.strip()
    position={
        'title':title,
        'city':city,
        'category':category,
        'nums':nums,
        'dutys':dutys,
        'requires':requires
    }
    return position

if __name__ == '__main__':
    for i in range(1,10):
        url=base_url.format(i)
        detail_urls=parse_url(url)
        for detail_url in detail_urls:
            position=parse_detail_page(detail_url)
            positions.append(position)
            print(position)
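Printing each position is fine for checking the scraper; if you want to keep the results, a minimal sketch writing them to a JSON file (the file name is my own choice):

import json

with open('positions.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese text readable in the output file
    json.dump(positions, f, ensure_ascii=False, indent=2)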