回顾 bs4
- 实例化bs对象,将页面源码数据加载到该对象中
- 定位标签:find('name',class_='xxx') find_all() select()
- 获取标签中的文本内容:string、text、get_text();获取标签的属性值:a['href']
xpath
环境安装: pip install lxml
原理解析:
获取页面的源码数据
实例化etree对象,并将页面源码数据加载到该对象中
调用该对象xpath方法进行指定标签的定位
注意:xpath必须结合着xpath的表达式进行标签定位和内容捕获
/html/head/title
//head/title
//title
通过xpath进行获取数据
# Project goal: parse second-hand housing listings from 58.com and save
# each listing's title and price text to 58.csv.
import requests
from lxml import etree

url = 'https://bj.58.com/shahe/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d30000c-0047-e4e6-f587-683307ca570e&ClickID=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

# A timeout keeps a stalled server from hanging the script forever.
page_text = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(page_text)

# xpath() always returns a list, so single values are taken by index.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# The context manager closes the file even if parsing an item raises.
with open('58.csv', 'w', encoding='utf-8') as fp:
    for li in li_list:
        title_nodes = li.xpath('./div[2]/h2/a/text()')
        if not title_nodes:
            # Skip list items without a title link instead of crashing
            # with IndexError on the empty result list.
            continue
        title = title_nodes[0]
        # The price is spread over several text nodes; join them together.
        price = ''.join(li.xpath('./div[3]//text()'))
        # One record per line so that the records stay separable.
        fp.write(title + ":" + price + '\n')

print('over')
利用xpath处理中文乱码
# ctrl+shift+x
# Parse image data from http://pic.netbian.com/4kmeinv/ and download every
# thumbnail on the page into ./imgs, named after the image title.
import os
# Import the submodule explicitly: a bare "import urllib" does not make
# urllib.request available on its own.
import urllib.request

import requests
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

# A timeout keeps a stalled server from hanging the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# response.encoding = 'utf-8'

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('./imgs', exist_ok=True)

page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')

for li in li_list:
    name_nodes = li.xpath('./a/b/text()')
    src_nodes = li.xpath('./a/img/@src')
    if not name_nodes or not src_nodes:
        # Skip list items that lack a title or an image instead of
        # crashing with IndexError on the empty result list.
        continue
    # Fix garbled Chinese text: turn the wrongly decoded string back into
    # its raw bytes via iso-8859-1, then decode those bytes as gbk.
    img_name = name_nodes[0].encode('iso-8859-1').decode('gbk')
    # The src attribute is site-relative, so prepend the host.
    img_url = 'http://pic.netbian.com' + src_nodes[0]
    img_path = './imgs/' + img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_path, '下载成功!')

print('over!!!')
#通过encode('iso-8859-1').decode('gbk')转码
#或在读取response.text之前,将response.encoding设置为页面的实际编码(本例为'gbk';utf-8页面则使用response.encoding = 'utf-8')
xpath解析经base64加密的图片地址(先用xpath提取密文,再用base64解码)
# [Key example] Download the image data from jandan.net: http://jandan.net/ooxx
# Data encryption (an anti-scraping mechanism): the image addresses are read
# from span.img-hash elements and must be base64-decoded before use.
import base64
# Import the submodule explicitly: a bare "import urllib" does not make
# urllib.request available on its own.
import urllib.request

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://jandan.net/ooxx'

# A timeout keeps a stalled server from hanging the script forever.
page_text = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(page_text)
img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')

for img_hash in img_hash_list:
    if not img_hash.strip():
        # Nothing to decode for a blank text node.
        continue
    # The decoded value has no scheme (presumably a protocol-relative
    # "//host/path" address), so "http:" is prepended.
    img_url = 'http:' + base64.b64decode(img_hash).decode()
    # Use the last path segment as the local file name.
    img_name = img_url.split('/')[-1]
    urllib.request.urlretrieve(url=img_url, filename=img_name)
xpath两次解析(列表页 → 详情页)获取下载链接,并处理分页
# Scrape the free resume templates from sc.chinaz.com: walk the first three
# list pages, open each template's detail page, pick one download mirror at
# random and save the archive as "name.rar" in the working directory.
import random

import requests
from lxml import etree

headers = {
    # Close the connection as soon as the request succeeds, so resources in
    # the connection pool are released promptly.
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free_%d.html'

for page in range(1, 4):
    # Pagination: the first page has no numeric suffix, later pages follow
    # the free_N.html pattern.
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page

    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(url=new_url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')

    for div in div_list:
        detail_urls = div.xpath('./a/@href')
        names = div.xpath('./a/img/@alt')
        if not detail_urls or not names:
            # Skip entries without a link or a name instead of crashing
            # with IndexError on the empty result list.
            continue
        detail_url = detail_urls[0]
        name = names[0]

        detail_page = requests.get(url=detail_url, headers=headers, timeout=10).text
        # A separate name keeps the list-page tree from being overwritten.
        detail_tree = etree.HTML(detail_page)
        download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        if not download_list:
            # random.choice raises IndexError on an empty sequence.
            continue
        download_url = random.choice(download_list)

        data = requests.get(url=download_url, headers=headers, timeout=10).content
        fileName = name + '.rar'
        with open(fileName, 'wb') as fp:
            fp.write(data)
        print(fileName, '下载成功')

# Note kept from the original: XPath of a single download link on the detail
# page (presumably copied from the browser's developer tools):
# //*[@id="down"]/div[2]/ul/li[6]/a
xpath 利用 | 实现并集获取数据
# Print the name of every city listed on the aqistudy history-data page.
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    # Close the connection as soon as the request succeeds, so resources in
    # the connection pool are released promptly.
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

response = requests.get(url=url, headers=headers)
tree = etree.HTML(response.text)

# The XPath "|" operator unions the node sets of both expressions, so the
# li elements from the two different nesting levels come back in one list.
city_items = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
for item in city_items:
    print(item.xpath('./a/text()')[0])
proxies 代理设置
# Send a request through a proxy IP. Proxy sources mentioned in the notes:
# www.goubanjia.com, Kuaidaili, Xici proxy.
# The proxy type (the key of the proxies dict) must match the scheme of the
# requested url.
# NOTE(review): requests and headers are not defined in this snippet; they
# presumably come from an earlier snippet - confirm before running it alone.
url = 'https://www.baidu.com/s?wd=ip'
proxy_map = {'https': '61.7.170.240:8080'}

response = requests.get(url=url, headers=headers, proxies=proxy_map)
page_text = response.text

# Save the result page to ./ip.html.
with open('./ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
反爬机制:
robots
UA
数据加密
懒加载
代理ip