1、环境安装
pip install lxml
2、解析原理
- 使用通用爬虫爬取网页数据
- 实例化etree对象,且将页面数据加载到该对象中
- 使用xpath函数结合xpath表达式进行标签定位和指定数据提取
3、实战案例
- 项目需求:解析房天下新房的相关数据
import requests
import os
from lxml import etree
import json
import csv
if __name__ == '__main__':
    # Scrape the new-home listing page, follow every listing to its detail
    # pages, and append the collected records to a CSV-formatted text file.
    url = 'https://huizhou.newhouse.fang.com/house/s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # Seconds to wait for any single HTTP response; without a timeout a
    # stalled server would block the script forever.
    timeout = 10
    # XPath for the average-price text; it is the same for every detail page,
    # so it is built once outside the loop.
    price_xpath = (
        '//div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/h3/text()'
        ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/span/text()'
        ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/text()'
    )
    # exist_ok makes directory creation safe even if it already exists.
    os.makedirs('./fangtianxiaLibs', exist_ok=True)

    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Set the encoding used to decode the response body by hand.
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id = "newhouse_loupai_list"]/ul/li')

    # Scraped records, one dict with the keys title / desc / price each.
    datas = []
    # Title of the most recently parsed detail page; it names the output file.
    title = None
    for li in li_list:
        try:
            # [0] raises IndexError for list items without a detail link;
            # the handler below then skips that item.
            detail_url = 'https:' + li.xpath('.//div[@class="nlcd_name"]/a/@href')[0]
            detail_text = requests.get(url=detail_url, headers=headers, timeout=timeout).text
            # The second-level page URL is derived by rewriting the suffix.
            detail_url_new = detail_url.replace('.htm', '/housedetail.htm')
            # Separate name so the listing tree above is not shadowed.
            detail_tree = etree.HTML(detail_text)
            # Title and (average) price come from the first detail page.
            title = detail_tree.xpath('//div[@class="information"]//div[@class="tit"]/h1/strong/text()')[0]
            price = "".join(detail_tree.xpath(price_xpath)).strip(' ')
            # Request the second-level page for the surroundings description.
            detail_text_new = requests.get(url=detail_url_new, headers=headers, timeout=timeout).text
            tree_new = etree.HTML(detail_text_new)
            tree_list = tree_new.xpath('//div[@id="Configuration"]')
            # A record is stored per "Configuration" section, so a listing
            # without that section contributes nothing instead of reusing a
            # description left over from an earlier listing.
            for index in tree_list:
                zhoubian = "".join(index.xpath('./h3/text()')).strip(' ')
                jiaotong = "".join(index.xpath('./ul[@class="sheshi_zb"]/li/span/text()|./ul[@class="sheshi_zb"]/li[@class="jiaotong_color"]/text()')).strip(' ')
                qita = "".join(index.xpath('./ul[@class="sheshi_zb"]/li/span/text()|./ul[@class="sheshi_zb"]/li/text()')).strip(' ')
                desc = zhoubian + ":" + jiaotong + ':' + qita + ' '
                datas.append({
                    'title': title,
                    'desc': desc,
                    'price': price
                })
        except Exception as msg:
            # Best-effort crawl: any failure (network error, missing node)
            # skips only this listing, but the reason is reported rather
            # than silently discarded.
            print('报错原因:{}'.format(msg))

    print(datas)
    # Nothing scraped means there is no title to name the file after and no
    # rows to store, so the write is skipped entirely.
    if datas:
        fileName = './fangtianxiaLibs/' + title + '.txt'
        title_header = ['title', 'desc', 'price']
        # The file is opened in append mode, so the header row is written
        # only when the file is new or still empty.
        write_header = not os.path.exists(fileName) or os.path.getsize(fileName) == 0
        # newline='' lets the csv module control line endings itself.
        with open(fileName, 'a', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, title_header)
            if write_header:
                writer.writeheader()
            writer.writerows(datas)
- 项目需求:解析图片数据:http://pic.netbian.com/4kmeinv/
import requests
from lxml import etree
# Print the URL and name of every image listed on the gallery page.
url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# The timeout keeps a stalled server from blocking the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Show the encoding requests inferred for the page.
print(response.encoding)
# Decode the body as GBK before reading response.text. Re-encoding each name
# through ISO-8859-1 afterwards only works when requests happened to guess
# ISO-8859-1, and raises UnicodeEncodeError for any other guess.
response.encoding = 'gbk'
page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    src_list = li.xpath('./a/img/@src')
    alt_list = li.xpath('./a/img/@alt')
    # Skip list items that carry no image instead of failing on [0].
    if not src_list or not alt_list:
        continue
    img_url = 'http://pic.netbian.com' + src_list[0]
    img_name = alt_list[0]
    print(img_url, img_name)
- 项目需求:解析出所有城市名称https://www.aqistudy.cn/historydata/
import requests
from lxml import etree
# Print the name and history-data URL of every city linked on the page.
url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# The timeout keeps a stalled server from blocking the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Show the encoding requests inferred for the page.
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
# 'ul//li' selects every descendant <li>, direct children included, so a
# separate 'ul/li' branch in a union would add nothing.
li_list = tree.xpath('//div[@class="bottom"]/ul//li')
for li in li_list:
    name_list = li.xpath('./a/text()')
    href_list = li.xpath('./a/@href')
    # Skip list items without a link instead of failing on [0].
    if not name_list or not href_list:
        continue
    city_name = name_list[0]
    # The href is appended to the listing URL to form the city page URL.
    city_url = url + href_list[0]
    print(city_name, city_url)
- 项目需求:下载站点简历模板中的图片数据:https://sc.chinaz.com/
import requests
from lxml import etree
import os
# Create the download folder; exist_ok avoids the gap between checking for
# the folder and creating it.
os.makedirs('./jianliLibs', exist_ok=True)
# First level of the site: the landing page that links to the resume section.
url = 'https://sc.chinaz.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
# The timeout keeps a stalled server from blocking the script forever.
response_text = requests.get(url=url, headers=headers, timeout=10).text
# Parsed landing page; page_index reads the section link from this tree.
tree = etree.HTML(response_text)
# Resolve the resume-template listing URL and crawl its pages.
def page_index(latest):
    """Download the resume images found on listing pages 1 through ``latest``.

    Reads the module-level ``tree`` (parsed landing page) and ``headers``.
    Every image is saved under ``./jianliLibs/`` and a confirmation line is
    printed per file.

    Args:
        latest: Number of listing pages to crawl, counted from page 1.
            Values below 1 crawl nothing.

    Returns:
        None.

    Raises:
        Any exception from the HTTP requests, the node lookups or the file
        writes propagates to the caller.
    """
    if latest < 1:
        return

    # The section link does not depend on the page number, so it is looked
    # up once rather than on every iteration.
    section_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="nos no3"]/a/@href')[3]

    # latest is a page count, so the final page must be included.
    for index in range(1, latest + 1):
        if index == 1:
            muban_url = section_url
        else:
            muban_url = section_url + 'index_{}.html'.format(index)

        # Second level: one page of the resume listing.
        response = requests.get(muban_url, headers=headers, timeout=10)
        # Set the encoding used to decode the response body by hand.
        response.encoding = 'utf-8'
        jianli_tree = etree.HTML(response.text)
        # Links to the individual resume pages.
        jianli_url_list = jianli_tree.xpath('//div[@class="main_list jl_main"]//a/@href')

        for jianli_url in jianli_url_list:
            jianli_url = "https:" + jianli_url
            # Third level: the page of a single resume.
            jianli_detail = requests.get(jianli_url, headers=headers, timeout=10).text
            detail_tree = etree.HTML(jianli_detail)
            img_src_list = detail_tree.xpath('//div[@class="show_warp jl_warp clearfix"]//img/@src')

            for img_src in img_src_list:
                img_src = 'https:' + img_src
                img_src_content = requests.get(img_src, headers=headers, timeout=10).content
                # The file name is the second-to-last URL path segment.
                # NOTE(review): if that segment is shared by several images
                # they overwrite each other - confirm against real URLs.
                imgName = img_src.split('/')[-2]
                imgPath = './jianliLibs/' + imgName + '.jpg'
                # Persist the raw image bytes.
                with open(imgPath, 'wb') as fp:
                    fp.write(img_src_content)
                print('简历:'+imgName, '下载成功!!!')
if __name__ == '__main__':
    # Ask for the page count until it parses as an integer. A 'break' inside
    # 'finally' would end the loop after a single attempt and discard any
    # in-flight exception, Ctrl-C included, so only ValueError is handled.
    while True:
        try:
            values = int(input('请输入站点页分页数:'))
        except ValueError as msg:
            print('输入错误,错误信息为{}'.format(msg))
        else:
            break
    try:
        page_index(values)
    except Exception as msg:
        # Top-level boundary of the script: report a failed crawl as such
        # instead of labelling it an input error.
        print('下载失败,错误信息为{}'.format(msg))