• Source code for scraping animal images



    import requests
    from bs4 import BeautifulSoup
    import os, shutil
    from threading import Thread, Lock
    import time
    from datetime import datetime
    
    count_lock = Lock()  # guards the shared image counter across download threads
    
    def fun_makedir():
        """
        Create a time-stamped download folder and switch into it
        """
        file_path = os.getcwd() + '/down/' + time.strftime('%Y%m%d%H%M%S', time.localtime())
        if os.path.exists(file_path):
            # Wipe a stale folder with the same timestamp before recreating it
            shutil.rmtree(file_path)
        os.makedirs(file_path)
        os.chdir(file_path)
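    
    # Variant sketch (an addition, not in the original post): os.makedirs with
    # exist_ok=True collapses the exists/else branch, though unlike the rmtree
    # version it keeps an existing folder's contents. Returning the path
    # instead of calling os.chdir() leaves the working directory untouched.
    # The helper name fun_makedir_alt is hypothetical.
    def fun_makedir_alt():
        file_path = os.path.join(os.getcwd(), 'down',
                                 time.strftime('%Y%m%d%H%M%S', time.localtime()))
        os.makedirs(file_path, exist_ok=True)
        return file_path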
    
    
    def getmsg(url):
        """
        获取图片缩在页面的链接
        :param url:
        :return:返回一个图片列表:含有图片名称,图片所在页面的链接
        """
        pictrues = []
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        pictrue_list = soup.find('div', id='container').findAll('div', class_='box picblock col3')
        # print(len(pictrue_list))  # was 80 instead of the expected 40, because findAll('div', ...) did not specify class_='box picblock col3' at first
        for pictrue in pictrue_list:
            pictrue = pictrue.find('a')
            pictrue_name = pictrue['alt']
            pictrue_url = pictrue['href']
            pictrues.append([pictrue_name, pictrue_url])
            # print("{:<20s}{:<60s}".format(pictrue_name,pictrue_url))
        return pictrues
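    
    # Usage sketch for getmsg (an addition, not in the original post): print
    # the first few name/detail-page pairs, assuming the site is reachable and
    # the markup still matches. Kept commented out so importing the module has
    # no side effects.
    # for name, page_url in getmsg("http://sc.chinaz.com/tupian/dongwutupian.html")[:5]:
    #     print("{:<20s}{:<60s}".format(name, page_url))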
    
    
    def save_pic(pic_name, pic_url):
        """
        Download a single image
        :param pic_name:
        :param pic_url:
        :return:
        """
        global count
        with count_lock:  # count = count + 1 is not atomic, so guard the shared counter
            count = count + 1
            filename = str(count) + ' ' + pic_name + '.jpg'
        pic = requests.get(pic_url)
        with open(filename, 'wb') as f:
            # f.write(pic) would be wrong here; the raw bytes live in pic.content
            f.write(pic.content)
        print("Image {} downloaded successfully".format(filename))
    
    def down_pictrue(pictrue_name, pictrue_url):
        """
        获取图片链接,下载图片
        :param pictrue_name:
        :param pictrue_url:
        :return:
        """
        try:
            down_res = requests.get(pictrue_url)
            down_res.encoding = 'utf-8'
            down_soup = BeautifulSoup(down_res.text, 'html.parser')
            down_link = down_soup.find('div', class_='imga').find('a')['href']
            pictrue_url = down_link
            # print(pictrue_name,pictrue_url)
    
            save_pic(pictrue_name, pictrue_url)
        except Exception:
            print("No link found for {}".format(pictrue_name))
    
    # Main function
    def main():
        start_time = datetime.now()
        pictrues = []
        global count  # used to count the downloaded images
        count = 0
    
        for i in range(1, 11):
            if i == 1:
                url = "http://sc.chinaz.com/tupian/dongwutupian.html"
            else:
                url = "http://sc.chinaz.com/tupian/dongwutupian_{}.html".format(i)
            print("collecting message from {}".format(url))
    
            pictrues = getmsg(url)
            threads = []
            for item in pictrues:
            # print(item[0], item[1])
            # Create a thread whose worker is down_pictrue; args=(item[0], item[1]) passes the image name and its page URL
                t = Thread(target=down_pictrue, args=(item[0], item[1]))
                threads.append(t)
            for t in threads:
                t.start()
            for t in threads:
                t.join()
    
        run_time = (datetime.now() - start_time).total_seconds()
        print("
    一共下载{}张图片,共用时{}秒".format(count,run_time, end='	'))
    
    # Program entry point
    if __name__ == '__main__':
        # Create the folder where the images will be saved
        fun_makedir()
        # Run the main function
        main()
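
An alternative worth sketching (an addition, not part of the original post): the manual per-page thread lists in main() can be replaced with concurrent.futures.ThreadPoolExecutor, which bounds how many downloads run at once instead of spawning one thread per image. The pool size of 8 and the name main_with_pool are assumptions.

    from concurrent.futures import ThreadPoolExecutor
    
    def main_with_pool(max_workers=8):
        """Variant of main() that reuses one bounded thread pool for all pages."""
        global count
        count = 0
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for i in range(1, 11):
                if i == 1:
                    url = "http://sc.chinaz.com/tupian/dongwutupian.html"
                else:
                    url = "http://sc.chinaz.com/tupian/dongwutupian_{}.html".format(i)
                # submit() queues each download; the with-block waits for them all
                for name, page_url in getmsg(url):
                    pool.submit(down_pictrue, name, page_url)
        print("Downloaded {} images".format(count))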
    
    
  • Original article: https://www.cnblogs.com/yuexiao/p/12788165.html