不废话,直接上代码:
# -*- coding: utf-8 -*-
# @Author: Wang Hongbin
# @Email: wanghongbin@ngoos.org
# @Date: 2018-03-16 14:19:27
# @Last Modified by: Wang Hongbin
# @Last Modified time: 2018-03-28 16:26:07
import requests
import re
import os
import time #时间模块
# Date prefix for saved file names (format "YYYY-MM-DD_"); evaluated once,
# when the module is loaded.
local = time.strftime("%Y-%m-%d_")

# Site root; the image paths scraped from the page are relative to it.
baseUrl = "https://cn.bing.com"

# Browser-like User-Agent header values (desktop Chrome on Windows 10).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.186 Safari/537.36'
    ),
}
def getImgUrl(url):
    """Return the site-relative path of the wallpaper image found on a page.

    Downloads ``url`` and returns the first substring of the response text
    of the form ``/az/hprichbg/rb/<anything>.jpg``.

    Args:
        url: Address of the page to scan.

    Returns:
        The matched path, starting with ``/az/hprichbg/rb/``. It is relative,
        so the caller has to prepend the site root before downloading it.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no matching image path.
    """
    # The dot is escaped so that only a literal ".jpg" terminates the match;
    # unescaped, "." would also accept e.g. "xjpg".
    pattern = r"(/az/hprichbg/rb/.*?\.jpg)"

    # Send the module-level browser User-Agent and bound the wait so a stalled
    # connection cannot hang the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    matches = re.findall(pattern, response.text, re.S)
    if not matches:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError(
            "no '/az/hprichbg/rb/*.jpg' image path found in %s" % url
        )
    return matches[0]
def getFilePath(filePath='C:/Users/Administrator/Pictures/MyDesktop/'):
    """Make sure the image directory exists and return its path.

    Args:
        filePath: Directory in which pictures are stored. Defaults to the
            Windows desktop-pictures folder this script was written for; on
            another machine pass a different directory, for example a dated
            folder under a web root on a Linux server.

    Returns:
        ``filePath`` unchanged, after the directory (and any missing parent
        directories) has been created.

    Raises:
        OSError: If the directory cannot be created, e.g. for lack of
            permission or because the path exists and is not a directory.
    """
    # makedirs creates missing parents too, and exist_ok=True avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(filePath, exist_ok=True)
    return filePath
def getImgName(url):
    """Build the local file name for an image path.

    The text between ``/az/hprichbg/rb/`` and the first underscore after it
    is taken as the picture name. The result is the module-level date prefix
    ``local``, followed by that name, followed by ``.jpg``.

    Args:
        url: Image path containing ``/az/hprichbg/rb/<name>_``.

    Returns:
        The file name as a string.

    Raises:
        IndexError: If ``url`` does not contain the expected pattern.
    """
    name_pattern = re.compile(r"/az/hprichbg/rb/(.*?)_", re.S)
    # Only the first occurrence is used.
    stem = name_pattern.findall(url)[0]
    return ''.join((local, stem, '.jpg'))
def downloadByPic(url):
    """Download the wallpaper referenced by the page at ``url`` and save it.

    The image path is scraped from the page with ``getImgUrl``, the local
    file name is derived from it with ``getImgName``, and the image bytes
    are written into the directory returned by ``getFilePath``.

    Args:
        url: Address of the page that references the image.

    Returns:
        None.

    Raises:
        requests.RequestException: If the image request fails, times out, or
            returns an HTTP error status.
        IndexError: If no image path or image name can be extracted.
        OSError: If the target directory or file cannot be written.
    """
    imgUrl = getImgUrl(url)
    imgName = getImgName(imgUrl)
    filePath = getFilePath()
    # os.path.join also works if the directory has no trailing separator.
    fileName = os.path.join(filePath, imgName)

    # The scraped path is relative, so prefix it with the site root.
    picUrl = baseUrl + imgUrl
    response = requests.get(picUrl, headers=headers, timeout=30)
    # Fail before touching the disk, so an HTTP error page is never saved
    # under a ".jpg" name.
    response.raise_for_status()

    # The context manager closes the file even if the write raises.
    with open(fileName, 'wb') as f:
        f.write(response.content)
# reg3 = r'<div class="hplaCata"><div class="hplatt">(.*)</div><div class="hplats">(.*)</div><div id="hplaSnippet">(.*)</div><div class="hplaPvd">(.*)</div>'
# Script entry: scan the site's home page for the current wallpaper, save it,
# then report completion.
downloadByPic(url=baseUrl)
print('is ok!')
爬取结果
下图是七月份至今爬取的图片。因为脚本是在 Windows 上执行的,电脑不开机的时候不会执行;代码放在 Linux 上执行也没问题,使用 crontab 设置一个定时任务即可。