# -*- coding: utf-8 -*-
# @Time : 2020/9/16 21:56
# @Author : aqiong
# @Site :
# @File : Xpat图片解析.py
# @Software: PyCharm
import requests
from lxml import etree
import os
import time
import random
# Pool of browser user-agent strings; one is picked at random per request so
# that successive requests do not all present the same client signature.
user_agent_list = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    # The closing parenthesis was missing from this entry, which made the
    # header value malformed whenever it was selected.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
]
def _random_headers():
    """Return request headers carrying a randomly chosen user agent."""
    return {
        'user-agent': random.choice(user_agent_list),
        # NOTE(review): 'Content-Type' and 'Server' are sent unchanged from
        # the original script; they look like copied response headers and are
        # probably unnecessary on a GET request -- confirm before removing.
        'Content-Type': 'text / html',
        'Server': 'yunjiasu-nginx',
    }


def getImg(url, dir, max_retries=5, timeout=10):
    """Download every image listed on a gallery page.

    The page at ``url`` is fetched, each ``<li>`` under
    ``div.slist > ul`` is inspected for an ``<img src>``, and the image is
    handed to ``dowloadImg`` to be stored in ``dir``.

    Network failures while fetching the page (for example the remote host
    forcibly closing the connection) are retried up to ``max_retries``
    times. Retrying happens in a loop rather than by recursion, so a long
    outage can no longer end in ``RecursionError``, and only network errors
    are retried -- programming errors and Ctrl-C are not swallowed.

    Args:
        url: Address of the listing page to scrape.
        dir: Directory the images are written to.
        max_retries: Maximum number of attempts to fetch the listing page.
        timeout: Per-request timeout in seconds.

    Returns:
        None. If the page cannot be fetched after ``max_retries`` attempts
        a message is printed and nothing is downloaded.
    """
    page_text = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url=url, headers=_random_headers(), timeout=timeout)
            # Treat HTTP error statuses as failures instead of parsing an error page.
            response.raise_for_status()
            page_text = response.text
            break
        except requests.RequestException as err:
            print('Fetching %s failed (attempt %d/%d): %s' % (url, attempt, max_retries, err))
            if attempt < max_retries:
                time.sleep(1)  # brief pause so retries do not hammer the server

    if page_text is None:
        print('Giving up on %s after %d attempts' % (url, max_retries))
        return

    tree = etree.HTML(text=page_text)
    if tree is None:
        # Defensive: nothing parseable came back (e.g. an empty body).
        return

    for item in tree.xpath('//div[@class="slist"]/ul/li'):
        sources = item.xpath('./a/img/@src')
        if not sources:
            # Some list items may not contain an image; skip them rather
            # than failing the whole page on an IndexError.
            continue
        img_url = 'http://pic.netbian.com' + sources[0]
        img_name = img_url.split('/')[-1]
        print(img_url)
        # A fresh random user agent is used for every image request.
        dowloadImg(img_url, _random_headers(), img_name, dir)
# Download a single image and save it to disk.
def dowloadImg(img_url, img_headers, img_name, dir, max_retries=5, timeout=10):
    """Download one image and write it into ``dir`` under ``img_name``.

    Network errors are retried up to ``max_retries`` times in a loop.
    Retrying by unbounded recursion on *any* exception could end in
    ``RecursionError`` and would retry forever on non-transient problems
    such as a missing output directory. Here only network errors are
    retried; file-system errors propagate to the caller.

    Args:
        img_url: Absolute URL of the image.
        img_headers: HTTP headers to send with the request.
        img_name: File name to store the image under.
        dir: Destination directory; it is created if it does not exist.
        max_retries: Maximum number of download attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        True if the image was saved, False if every attempt failed.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    if dir:
        os.makedirs(dir, exist_ok=True)
    target_path = os.path.join(dir, img_name)

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url=img_url, headers=img_headers, timeout=timeout)
            # Do not store an HTML error page under an image file name.
            response.raise_for_status()
        except requests.RequestException as err:
            print('Downloading %s failed (attempt %d/%d): %s' % (img_url, attempt, max_retries, err))
            if attempt < max_retries:
                time.sleep(1)  # brief pause before trying again
            continue

        # Binary mode: the payload is raw bytes, so no text encoding applies.
        with open(target_path, 'wb') as fp:
            fp.write(response.content)
        print('%s保存成功' % img_name)
        return True

    print('Giving up on %s after %d attempts' % (img_url, max_retries))
    return False
if __name__ == '__main__':
    # Script entry point: scrape the first page of the gallery listing and
    # store its images in a local "pic" folder.
    start_url = 'http://pic.netbian.com/4kmeinv/'
    output_dir = './pic'
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(output_dir, exist_ok=True)
    getImg(start_url, output_dir)