• 爬虫_百度图片下载


    帮别的院的同学批量下载点图片,并进行简单筛选

     1 import requests
     2 import re
     3 import os
     4 from lxml import etree
     5 import json
     6 
     7 
     8 
     9 def get_html(url, param):
    10     headers = {
    11         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    12     }
    13     response = requests.get(url, params=param, headers=headers)
    14     response.encoding = response.apparent_encoding
    15     # 返回json数据,str格式
    16     json_str = response.text
    17     response.encoding = 'utf-8'
    18     return json_str
    19 
    20 
    21 def parse_page(html):
    22     data = json.loads(html, strict=False)
    23     objs = data['data'][:-1]
    24     urls = []
    25     for obj in objs:
    26         url = obj['middleURL']
    27         key = obj['fromPageTitleEnc']
    28         # if '纹枯病' in key:
    29         if '全蚀病' in key:
    30             urls.append(url)
    31         # if '叶锈病' in key:
    32         #     urls.append(url)
    33         # elif '条锈病' in key:
    34         #     urls.append(url)
    35         else:
    36             print('该标题被筛选掉: '+key)
    37     print(len(urls))
    38     a = len(urls)
    39     return urls, a
    40     # return url_list
    41 
    42 
    43 def run(keyword, path):
    44     url = "https://image.baidu.com/search/acjson"
    45     # https://image.baidu.com/search/acjson?ipn=rj&tn=resultjson_com&word=小麦纹枯病矢量图大图&pn=30
    46     i = 0
    47     sum_pic = 0
    48     for j in range(30, 1800, 30):
    49         params = {
    50                     "ipn": "rj", 
    51                     "tn": "resultjson_com", 
    52                     "word": keyword, 
    53                     "pn": str(j)
    54                 }
    55         html = get_html(url, params)
    56         lists, num_pic = parse_page(html)
    57         sum_pic += num_pic
    58   
    59         for item in lists:
    60             try:
    61                 img_data = requests.get(item, timeout=10).content
    62                 with open(path + "/" + str(i) + ".jpg", "wb") as f:
    63                     f.write(img_data)
    64                     f.close()
    65                 i = i+1
    66             except requests.exceptions.ConnectionError:
    67                 print('can not download')
    68                 continue
    69 
    70 def make_dir(keyword):
    71     path = "images/"
    72     path = path+keyword
    73     is_exists = os.path.exists(path)
    74     if not is_exists:
    75         os.makedirs(path)
    76         return path
    77     else:
    78         print(path + '目录已存在')
    79         return path
    80 
    81 
    82 def main():
    83     # keyword = '小麦纹枯病矢量图大图'
    84     keyword = '小麦全蚀病'
    85     path = make_dir(keyword)
    86     run(keyword, path)
    87 
    88 
    89 
    90 if __name__ == '__main__':
    91     main()
  • 相关阅读:
    2011/6/24 数据库分析
    项目代码总结
    背景透明 by sofish
    ie6 reflow bug
    ID与CLASS的使用技巧
    CSS浮动属性Float详解 by 帕兰
    javascript闭包 by 李松峰
    详解CSS选择器、优先级与匹配原理
    垂直对齐:verticalalign属性 by ddcatlee
    行高lineheight,以及基线、顶线、中线和底线,还有内容区域、行内框和行框 by 豆豆猫的窝
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9614019.html
Copyright © 2020-2023  润新知