1. Data Parsing
1. What data parsing is for
It enables us to build focused crawlers, i.e. crawlers that extract only the data we care about.
2. Ways to implement data parsing
- regular expressions
- bs4
- xpath
- pyquery
3. The general principle of data parsing
Question: where does the data that a focused crawler scrapes live?
It is stored inside the relevant tags and in the attributes of those tags, so parsing always comes down to two steps:
1. Locate the tag
2. Take its text or its attribute values (a minimal sketch of these two steps follows below)
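A minimal sketch of the locate-then-extract idea, using bs4 (covered later in this section) and a made-up HTML snippet; the tag names and attribute values here are purely illustrative:

from bs4 import BeautifulSoup

# A made-up page snippet, just for illustration
html = '<div class="pic"><a href="/detail/1">detail page<img src="/imgs/1.jpg" alt="demo"></a></div>'
soup = BeautifulSoup(html, 'lxml')

# Step 1: locate the tags
a = soup.find('a')
img = soup.find('img')

# Step 2: take the text or an attribute value
print(a.text)      # detail page
print(img['src'])  # /imgs/1.jpg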
The difference between the requests module and the urllib module
# Downloading an image

# with the requests module
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = "http://www.xiaohuar.com/d/file/20190814/48f590249bcbc6a82aec5bf2a63ae54f.jpg"
img_data = requests.get(url, headers=headers).content  # bytes data
with open("tupiao.jpg", "wb") as fp:
    fp.write(img_data)

# with the urllib module
from urllib import request
url = "http://pic25.nipic.com/20121112/9252150_150552938000_2.jpg"
request.urlretrieve(url, filename="./tu.jpg")  # urllib's urlretrieve cannot spoof the User-Agent
2. Implementation methods of data parsing
1. Regular expressions
Regex reference notes: https://www.cnblogs.com/zangyue/p/12044575.html
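The crawler below pulls image addresses out of the page source with re.findall and the re.S flag. As a quick, self-contained illustration of why re.S (DOTALL mode) is needed when the pattern has to span line breaks, consider this sketch (the HTML snippet is made up):

import re

html = '<div class="thumb">\n<img src="//pic.qiushibaike.com/a.jpg" alt="demo">\n</div>'
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

# Without re.S, "." does not match newlines, so nothing is found
print(re.findall(ex, html))        # []
# With re.S, ".*?" can cross line breaks and the src is captured
print(re.findall(ex, html, re.S))  # ['//pic.qiushibaike.com/a.jpg']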
# Scrape all the images from pages 1-3 of the Qiushibaike picture section
# First crawl the page source of the first 3 pages, then parse the image addresses out of it
import re
import os
import requests
from urllib import request

disname = "./imgLibs"
# Create the target directory if it does not exist yet
if not os.path.exists(disname):
    os.mkdir(disname)
url = 'https://www.qiushibaike.com/pic/page/%d/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
for page in range(1, 4):
    new_url = format(url % page)
    # Page source of the current page
    page_text = requests.get(url=new_url, headers=headers).text
    # Focused crawling on top of the general crawl: parse the image addresses out of the page source
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        src = "https:" + src  # the scraped src is protocol-relative (//...)
        img_name = src.split('/')[-1]
        img_path = disname + "/" + img_name
        request.urlretrieve(src, filename=img_path)
        print(img_name, "downloaded successfully")
2. bs4 parsing
1. How bs4 parsing works
- Instantiate a BeautifulSoup object and load the page source to be parsed into it
- Call the BeautifulSoup object's methods and attributes to locate tags and extract data
2. Environment setup
pip3 install bs4
pip3 install lxml
3. Instantiating BeautifulSoup
# BeautifulSoup(fp, 'lxml'): loads the contents of a local html file into the instantiated BeautifulSoup object
# BeautifulSoup(page_text, 'lxml'): loads page source fetched from the internet into the instantiated BeautifulSoup object
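A short sketch of the two instantiation modes described above; './test.html' and the url are placeholders, not files or pages from this document:

import requests
from bs4 import BeautifulSoup

# 1) from a local html file
with open('./test.html', 'r', encoding='utf-8') as fp:
    local_soup = BeautifulSoup(fp, 'lxml')

# 2) from page source fetched over the network
page_text = requests.get('https://www.example.com/').text
net_soup = BeautifulSoup(page_text, 'lxml')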
4. Tag-locating operations
# Locating tags
Tag locating: soup.tagName                                       # locates the first occurrence of tagName
Attribute locating: soup.find("tagName", attrName="value")
Attribute locating: soup.find_all("tagName", attrName="value")   # returns a list
Selector locating: soup.select("CSS selector")
    Hierarchy selectors: > means one level, a space means multiple levels
# Getting text
.string: gets only the direct (immediate) text content
.text: gets all the text content
# Getting attributes
tagName["attrName"]

Example of the tag-locating operations:

from bs4 import BeautifulSoup

fp = open("./text.html", "r", encoding="utf-8")
soup = BeautifulSoup(fp, "lxml")
# Tag locating: the first div that appears
print(soup.div)
# Attribute locating: soup.find('tagName', attrName='value')
print(soup.find("div", class_="song"))
print(soup.find("a", id="feng"))
# Attribute locating: soup.find_all('tagName', attrName='value')
print(soup.find_all("div", class_="song"))
# Selector locating: soup.select("CSS selector")
print(soup.select("#feng"))
# > means one level
soup.select('.tang > ul > li')
# A space means multiple levels
soup.select('.tang li')

# Getting text
# .string: only the direct text content
a_tag = soup.select("#feng")[0]
print(a_tag.string)
# .text: all the text content
div = soup.div
print(div.text)
div = soup.find("div", class_="song")
print(div.text)

# Getting attributes
a_tag = soup.select("#feng")[0]
print(a_tag["href"])
A bs4-based crawling example

# Requirement: scrape the whole of Romance of the Three Kingdoms (chapter titles + chapter content)
# http://www.shicimingju.com/book/sanguoyanyi.html
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
fp = open("./sanguo.txt", "w", encoding="utf-8")
main_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=main_url, headers=headers).text
# Parse out the chapter titles and the urls of the chapter detail pages
soup = BeautifulSoup(page_text, "lxml")
a_list = soup.select(".book-mulu > ul > li > a")  # returns a list of a tags
for a in a_list:
    title = a.string
    data_url = "http://www.shicimingju.com" + a["href"]
    data_page_text = requests.get(url=data_url, headers=headers).text
    # Parse the chapter content from the detail page
    soup = BeautifulSoup(data_page_text, "lxml")
    content = soup.find('div', class_="chapter_content").text
    fp.write(title + ":" + content + "\n")
    print(title, "downloaded successfully")
fp.close()
3. xpath parsing
1. How xpath parsing works
1. Instantiate an etree object and load the page source to be parsed into it
2. Call the etree object's xpath method with different forms of xpath expressions to locate tags and extract data
2. Environment setup
pip3 install lxml
3. Instantiating the etree object
etree.parse("test.html")   # loads a local html file
etree.HTML(page_text)      # loads page source fetched from the internet
4. xpath expressions
# xpath expressions: the return value of the xpath method is always a list
- A leading /: the expression locates tags level by level starting from the root tag
- A leading //: the expression can locate tags starting from any position
- A non-leading /: one level
- A non-leading //: crosses multiple levels
- Attribute locating: //tagName[@attrName="value"]
- Index locating: //tagName[index]   # indices start at 1
# Getting text
/text(): the direct text content
//text(): all the text content
# Getting attributes
/@attrName
Example of xpath expressions

from lxml import etree

tree = etree.parse('./test.html')
# A leading /: the expression must locate tags level by level starting from the root tag
tree.xpath('/html/head/title')
# A leading //: the expression can locate tags starting from any position
tree.xpath('//title')
tree.xpath('//p')
# A non-leading /: one level
tree.xpath('/html/body/div/p')
# A non-leading //: crosses multiple levels
tree.xpath('/html/body//p')
# Attribute locating: //tagName[@attrName="value"]
tree.xpath('//div[@class="song"]')
# Index locating: //tagName[index], indices start at 1
tree.xpath('//li[7]')
# Getting text
# /text(): the direct text content
tree.xpath('//a[@id="feng"]/text()')[0]
# //text(): all the text content
tree.xpath('//div[@class="song"]//text()')
# Getting attributes
tree.xpath('//a[@id="feng"]/@href')
xpath crawler examples

# Requirement: scrape the joke content and author names from Qiushibaike
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.qiushibaike.com/text/'
page_text = requests.get(url, headers=headers).text

# Parse the content
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="content-left"]/div')
for div in div_list:
    # ./ makes the expression relative to div, i.e. local parsing
    author = div.xpath('./div[1]/a[2]/h2/text()')[0]
    content = div.xpath('./a[1]/div/span/text()')
    content = ''.join(content)
    print(author, content)
# Scrape the 4K image gallery on pic.netbian.com (image name + image data)
import os
import requests
from lxml import etree

dirname = "./meinvlibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)
url = "http://pic.netbian.com/4kmeinv/index_%d.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
for page in range(1, 3):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = format(url % page)
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        img_src = "http://pic.netbian.com" + a.xpath("./img/@src")[0]
        img_name = a.xpath("./b/text()")[0]
        img_name = img_name.encode('iso-8859-1').decode('gbk')  # fix garbled characters in the name
        img_data = requests.get(url=img_src, headers=headers).content
        imgPath = dirname + '/' + img_name + '.jpg'
        with open(imgPath, "wb") as fp:
            fp.write(img_data)
        print(img_name, 'downloaded successfully!!!')
# Scrape the city names from aqistudy.cn
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
page_text = requests.get('https://www.aqistudy.cn/historydata/', headers=headers).text
tree = etree.HTML(page_text)
# The | operator joins two expressions, making the xpath more general:
# it matches both city lists on the page in a single query
cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(cities)
# Scrape and save the free resume templates from http://sc.chinaz.com/jianli/free.html
"""
Crawl the free resume templates from here:
    page 1:      'http://sc.chinaz.com/jianli/free.html'
    other pages: f'http://sc.chinaz.com/jianli/free_{i}.html'
"""
import os
import requests
from lxml import etree

dirName = "./resume"
if not os.path.exists(dirName):
    os.mkdir(dirName)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Connection': 'close'
}
for i in range(1, 11):
    if i == 1:
        new_url = "http://sc.chinaz.com/jianli/free.html"
    else:
        new_url = f"http://sc.chinaz.com/jianli/free_{i}.html"
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@id="container"]//a[@class="title_wl"]')
    for a in a_list:
        title = a.xpath('./text()')[0]
        title = title.encode('iso-8859-1').decode('utf8')  # fix garbled characters in the title
        print(title)
        detail_path = a.xpath('./@href')[0]
        page_detail = requests.get(url=detail_path, headers=headers).text
        detail_tree = etree.HTML(page_detail)
        download_url = detail_tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]
        filename = dirName + '/' + title + '.jpg'
        img_data = requests.get(url=download_url, headers=headers).content
        with open(filename, "wb") as fp:
            fp.write(img_data)
When scraping a large amount of data, the following error may appear:
HTTPConnectionPool error
Causes:
    1. Sending high-frequency requests in a short time gets the IP banned
    2. The connection resources in the http connection pool are exhausted
Solutions:
    1. Use proxies
    2. Add Connection: "close" to the headers
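A minimal sketch of what those two workarounds look like with requests; the proxy address and the url are placeholders, substitute real values (e.g. from a proxy pool) in practice:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Connection': 'close'  # do not keep the connection alive, so pooled connections are released
}
proxies = {
    'https': 'https://127.0.0.1:8888'  # hypothetical proxy ip:port
}
page_text = requests.get('https://www.example.com/', headers=headers, proxies=proxies).text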
To be continued