Batch-downloading some images for a classmate from another department, with simple filtering
import json
import os

import requests


def get_html(url, param):
    # Fetch one page of Baidu image-search results and return the raw JSON text.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    response = requests.get(url, params=param, headers=headers)
    response.encoding = response.apparent_encoding
    # Return the JSON payload as a str
    return response.text


def parse_page(html):
    # Parse one page of results and keep only images whose source-page title
    # contains the target disease name (simple keyword filtering).
    data = json.loads(html, strict=False)
    objs = data['data'][:-1]              # the last element is an empty placeholder
    urls = []
    for obj in objs:
        url = obj.get('middleURL')        # some entries carry no middleURL
        key = obj.get('fromPageTitleEnc', '')
        if not url:
            continue
        # if '纹枯病' in key:             # sheath blight
        if '全蚀病' in key:               # take-all
            urls.append(url)
        # if '叶锈病' in key:             # leaf rust
        #     urls.append(url)
        # elif '条锈病' in key:           # stripe rust
        #     urls.append(url)
        else:
            print('Filtered out by title: ' + key)
    print(len(urls))
    return urls, len(urls)


def run(keyword, path):
    url = "https://image.baidu.com/search/acjson"
    # e.g. https://image.baidu.com/search/acjson?ipn=rj&tn=resultjson_com&word=小麦纹枯病矢量图大图&pn=30
    i = 0
    sum_pic = 0
    for j in range(30, 1800, 30):         # pn is the result offset, 30 results per page
        params = {
            "ipn": "rj",
            "tn": "resultjson_com",
            "word": keyword,
            "pn": str(j)
        }
        html = get_html(url, params)
        lists, num_pic = parse_page(html)
        sum_pic += num_pic

        for item in lists:
            try:
                img_data = requests.get(item, timeout=10).content
                with open(path + "/" + str(i) + ".jpg", "wb") as f:
                    f.write(img_data)
                i += 1
            except requests.exceptions.RequestException:
                # covers connection errors as well as the 10 s timeout
                print('can not download')
                continue
    print('Images kept in total:', sum_pic)


def make_dir(keyword):
    # Create images/<keyword>/ if it does not exist yet and return the path.
    path = "images/" + keyword
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print(path + ' already exists')
    return path


def main():
    # keyword = '小麦纹枯病矢量图大图'
    keyword = '小麦全蚀病'                # wheat take-all
    path = make_dir(keyword)
    run(keyword, path)


if __name__ == '__main__':
    main()
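For a quick sanity check of the request that get_html and run issue, here is a minimal sketch, not part of the script above, assuming only the same requests library and the params layout from run. It prints the complete query URL that requests assembles from the params dict; the result should have the same shape as the commented example URL inside run.

import requests

# Hypothetical check: build the same GET request as run() and inspect the final URL.
params = {"ipn": "rj", "tn": "resultjson_com", "word": "小麦全蚀病", "pn": "30"}
req = requests.Request("GET", "https://image.baidu.com/search/acjson", params=params)
print(req.prepare().url)   # prints the assembled URL with the keyword percent-encoded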