1.解决中文乱码的问题
(1)先确认页面数据是否为动态加载
(2)获取源码数据
彼岸图网:
第一页地址:http://pic.netbian.com/4kmeinv/
第二页:http://pic.netbian.com/4kmeinv/index_2.html
第三页:http://pic.netbian.com/4kmeinv/index_3.html
#第一步:我们写的下面的代码有bug,返回的中文有乱码的问题
# Step 1: print the title and thumbnail src of every image on the requested
# listing pages of pic.netbian.com.
# NOTE: this step deliberately leaves the response decoding untouched, so the
# Chinese titles come out garbled; that is the problem the later steps tackle.
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Generic URL template for page 2 onwards (do not reassign the template itself;
# build a fresh URL from it on every iteration).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# end_page is treated as inclusive, hence the +1.
for page in range(start_page, end_page + 1):
    # The first page lives at the bare directory URL, without an index_N suffix.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text

    # Parse the title (alt) and the src attribute of each image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)
#第二步:修改——手动指定 response.encoding='utf-8',下面的结果会有变化,但中文仍然是乱码,问题还没有解决
# Step 2: same listing scraper, but force the response to be decoded as UTF-8.
# NOTE: the output changes compared with step 1 but the titles are still wrong;
# presumably the page is not UTF-8 encoded (the next step decodes it as GBK).
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Generic URL template for page 2 onwards (do not reassign the template itself;
# build a fresh URL from it on every iteration).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# end_page is treated as inclusive, hence the +1.
for page in range(start_page, end_page + 1):
    # The first page lives at the bare directory URL, without an index_N suffix.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    # Override whatever encoding requests picked before reading .text.
    response.encoding = 'utf-8'
    page_text = response.text

    # Parse the title (alt) and the src attribute of each image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)
#第三步:我们进一步升级——不再修改 response.encoding,而是把乱码的标题先按 iso-8859-1 编码还原成字节,再按 gbk 解码
# Step 3: same listing scraper, but repair each garbled title individually
# instead of overriding the encoding of the whole response.
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Generic URL template for page 2 onwards (do not reassign the template itself;
# build a fresh URL from it on every iteration).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# end_page is treated as inclusive, hence the +1.
for page in range(start_page, end_page + 1):
    # The first page lives at the bare directory URL, without an index_N suffix.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    # response.encoding is intentionally left as requests chose it; the text is
    # fixed per string below.
    response = requests.get(url=new_url, headers=headers)
    page_text = response.text

    # Parse the title (alt) and the src attribute of each image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        # Re-encode with ISO-8859-1 to get the raw bytes back, then decode them
        # as GBK. NOTE(review): this assumes requests decoded the page as
        # ISO-8859-1 and that the page itself is GBK -- confirm against the site.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_src = li.xpath('./a/img/@src')[0]
        print(img_name, img_src)
#第四步:进一步升级——拼接出完整的图片地址,并把图片下载保存到本地的 ./meinvs 文件夹
# Step 4: download every thumbnail on the requested listing pages of
# pic.netbian.com into ./meinvs, naming each file after the image title.
import os
import re
from urllib import request

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Create the output folder; exist_ok avoids failing when it is already there.
os.makedirs('./meinvs', exist_ok=True)

# Generic URL template for page 2 onwards (do not reassign the template itself;
# build a fresh URL from it on every iteration).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# Mind the page numbering: end_page is inclusive, hence the +1.
for page in range(start_page, end_page + 1):
    # The first page lives at the bare directory URL, without an index_N suffix.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    # response.encoding is intentionally left as requests chose it; the title
    # is fixed per string below.
    response = requests.get(url=new_url, headers=headers)
    page_text = response.text

    # Parse the title (alt) and the src attribute of each image.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        # Re-encode with ISO-8859-1 to get the raw bytes back, then decode them
        # as GBK. NOTE(review): this assumes requests decoded the page as
        # ISO-8859-1 and that the page itself is GBK -- confirm against the site.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # The title is scraped text: replace path separators and characters
        # that are not allowed in file names so it cannot break the save path
        # or point outside the output folder.
        img_name = re.sub(r'[\\/:*?"<>|]', '_', img_name) + '.jpg'
        # The src attribute is site-relative, so prefix the host.
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = os.path.join('./meinvs', img_name)
        request.urlretrieve(img_src, img_path)
        print(img_name, '下载成功!!!')
2.XPATH的另一种用法
爬取全国城市名称
url = 'https://www.aqistudy.cn/historydata/'
# Fetch the air-quality history index page and select city names with one
# XPath query that combines two location paths using the `|` operator.
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

response = requests.get(url=url, headers=headers)
tree = etree.HTML(response.text)

# Each half of the query can also be run on its own:
#   hot cities -> //div[@class="bottom"]/ul/li/a/text()
#   all cities -> //div[@class="bottom"]/ul/div[2]/li/a/text()
# Joining them with `|` (the XPath union operator, not a bitwise OR) returns
# the text nodes matched by either path in a single call.
union_query = '//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()'

# Left as a bare expression, as in the original: presumably this ran as a
# notebook cell, where the value of the last expression is displayed.
tree.xpath(union_query)
站长素材里边的"简历模板"
下载地址可以换着用,解析的时候用每个地址.