Web Crawler
Crawls the whole site; it can fetch over 1,000 images.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import requests


def save_pic(img_filename, img_data, n):
    # 'wb' opens the file for writing in binary mode
    with open(img_filename, 'wb') as f:
        f.write(img_data)
    print('Image %s saved' % n)


def main():
    for page in range(45):
        url = f'http://www.xiaohuar.com/list-1-{page + 1}.html'
        print(url)
        response = requests.get(url)
        response.encoding = "gb2312"
        data = response.text
        # Pull every image URL and its alt text from the listing page
        results = re.findall(r'" src="(.*?)" /></a>', data)
        names = re.findall(r'" alt="(.*?)" src="', data)
        if not names:
            print('Nothing to crawl on this page!')
            continue
        for n in range(len(names)):
            try:
                if results[n].startswith('http') and results[n].endswith('jpg'):
                    # Absolute URL: fetch the image directly
                    img_response = requests.get(results[n])
                    img_filename = 'd:\\444\\' + names[n] + '.jpg'
                    # Save the image content
                    save_pic(img_filename, img_response.content, n)
                elif results[n].endswith('jpg'):
                    # Relative URL: prepend the site root first
                    img_url = 'http://www.xiaohuar.com/' + results[n]
                    print(img_url)
                    img_response = requests.get(img_url)
                    img_filename = 'd:\\55\\' + names[n] + '.jpg'
                    # Save the image content
                    save_pic(img_filename, img_response.content, n)
                else:
                    print('Nothing to crawl for this entry!')
            except Exception as e:
                print('Something went wrong here: %s' % e)


if __name__ == '__main__':
    input('Press Enter to continue')
    main()
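
One fragile spot worth calling out: the save paths were hard-coded as raw strings ending in a backslash (a Python syntax error), the target folders must already exist, and the alt text scraped from the page can contain characters Windows forbids in filenames. Below is a minimal sketch of a safer save helper, assuming the same requests-based download; sanitize_filename, save_image, the download_dir default, and the 10-second timeout are my own illustration, not part of the original script.

import os
import re
import requests


def sanitize_filename(name):
    # Replace the characters Windows forbids in filenames: \ / : * ? " < > |
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or 'unnamed'


def save_image(img_url, name, download_dir='d:\\444'):
    # Create the target directory on first use instead of assuming it exists
    os.makedirs(download_dir, exist_ok=True)
    img_filename = os.path.join(download_dir, sanitize_filename(name) + '.jpg')
    # A timeout keeps one stalled download from hanging the whole crawl
    img_data = requests.get(img_url, timeout=10).content
    with open(img_filename, 'wb') as f:
        f.write(img_data)
    return img_filename

Using os.path.join also sidesteps the trailing-backslash problem entirely, since the path separator is never written by hand.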