• 利用斗图啦网站API批量下载表情图片


    decorator.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    
    import logging
    import os
    from functools import wraps
    
    # Layout of one log line. Besides the standard fields it references the
    # custom "imageurl" and "imagename" attributes, which a logging call has to
    # supply through its ``extra`` argument.
    FORMAT = ' '.join((
        '%(asctime)-15s',
        '%(filename)s',
        '%(message)s',
        '%(imageurl)s',
        '%(imagename)s',
    ))
    # Send INFO-and-above records of the root logger to a log file in the
    # current working directory.
    logging.basicConfig(
        filename="biaoqingDownloader.log",
        level=logging.INFO,
        format=FORMAT,
        datefmt="[%Y-%m-%d %H:%M:%S]",
    )
    # Template holding the custom record attributes, blank until a download
    # fills them in.
    my_log_extra = dict(imageurl="", imagename="")
    
    #downloader logger
    def downloader_logger(func) :
        """Decorate a download function so every completed call is logged.

        The decorated function is expected to take the image file name as its
        first positional argument and the image URL as its second one. After
        the function returns, one INFO record is written to the root logger
        with both values attached as the ``imagename`` and ``imageurl`` record
        attributes.

        Args:
            func: the download callable to wrap.

        Returns:
            A wrapper with the same signature that returns whatever ``func``
            returns. Exceptions raised by ``func`` propagate unchanged and no
            record is written for the failed call.
        """
        @wraps(func)
        def wrapper(*args, **kwargs) :
            # Keep the result so decorating a function does not turn its
            # return value into None.
            result = func(*args, **kwargs)
            # ``args`` is a tuple, so a short call would raise IndexError (not
            # KeyError). The download has already succeeded at this point, so
            # fall back to blank placeholders instead of failing the call.
            image_name = args[0] if len(args) > 0 else ""
            image_url = args[1] if len(args) > 1 else ""
            # A fresh dict per call: no shared module state is mutated, which
            # keeps concurrent downloads from overwriting each other's values.
            log_extra = {"imagename": image_name, "imageurl": image_url}
            logging.info("biaoqingbaoDownloader downloaded image:", extra=log_extra)
            return result
        return wrapper
    
    if __name__ == '__main__':
        # Smoke test: decorate a dummy function and call it once so that a
        # record passes through the configured logger.
        def _logging_probe(filename,imageurl) :
            print('logging test')

        _logging_probe = downloader_logger(_logging_probe)
        _logging_probe('test.png','www.baidu.com')

        # Reaching this point means nothing raised, so empty the log file
        # again to drop the record produced by the probe.
        with open('./biaoqingDownloader.log','w') as log_file :
            log_file.truncate()
    

    downloader.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*
    # @Link    : https://github.com/coderchaser
    
    
    import os
    import time
    import argparse
    import requests
    import random
    import json
    import threading
    from decorator import downloader_logger
    
    # Search endpoint templates keyed by API code.
    # Code 0 is the doutula.com search API (https://www.doutula.com/apidoc).
    API_URL_DICT={0:"https://www.doutula.com/api/search?keyword={keyword}&mime={image_type}&page={page}"}
    # Pool of browser User-Agent headers; one is picked at random per request.
    # Adjacent string literals are concatenated verbatim, so each continued
    # literal must end with an explicit space before the next token.
    USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
                   'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
                   ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
                    'Chrome/19.0.1084.46 Safari/536.5'),
                   ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) '
                    'Chrome/19.0.1084.46 Safari/536.5'), )
    
    class Downloader(object) :
        """Download images matching one keyword through a search API.

        Image URLs are first collected page by page from the endpoint selected
        by ``api_code`` and then fetched sequentially into ``filepath``.
        """

        def __init__(self,number,keyword,image_type,filepath,verbose,api_code=0) :
            """Store the download settings; no network access happens here.

            Args:
                number: how many images to download.
                keyword: search keyword for the images.
                image_type: 0 for all kinds, 1 for GIF, 2 for static images.
                filepath: directory in which the images are stored.
                verbose: when true, print every image URL before fetching it.
                api_code: key into API_URL_DICT selecting the site API.
                    Currently only 0 is defined.
            """
            #TODO: self.__image_url_list defined as queue instead of list to support multi thread downloading
            # collected image urls
            self.__image_url_list = []
            self.__number = number
            self.__api_code = api_code
            self.__keyword = keyword
            self.__image_type = image_type
            self.__filepath = filepath
            self.__verbose = verbose

        def __get_image_url(self) :
            """Collect image urls from the search API, at most 50 pages.

            Paging stops as soon as enough urls were gathered, the API reports
            that no further page exists, or a response cannot be interpreted.
            A failed request is reported and the next page is tried.
            """
            for page in range(1,51) :
                api_url=API_URL_DICT[self.__api_code].format(keyword=self.__keyword,image_type=self.__image_type,page=page)
                # Bound before the try block so the cleanup below never
                # touches an unbound name when the request itself fails.
                rq=None
                try:
                    rq=requests.get(api_url,headers={'User-Agent':random.choice(USER_AGENTS)},timeout=5)
                    response_dict=rq.json()
                    self.__image_url_list.extend(entry['image_url'] for entry in response_dict['data']['list'])
                    #TODO: Can i use multi threads here? This is a kind of IO-intenseive work ?
                    if len(self.__image_url_list) >= self.__number:
                        break
                    if response_dict['data']['more'] != 1:
                        break
                except (ValueError,KeyError,TypeError) as e:
                    # Body is not JSON or lacks the expected data/list/more
                    # structure; further pages are unlikely to differ.
                    print(e)
                    break
                except requests.RequestException as e:
                    # Connection problems and timeouts: report, try next page.
                    print(e)
                finally :
                    if rq is not None:
                        rq.close()

        def __download(self):
            """Fetch the collected images into the target directory.

            Files are named ``<keyword><index>.<extension>``. A failure on one
            image is printed and does not stop the remaining downloads.
            """
            self.__get_image_url()
            print('Now downloading images from https://www.doutula.com ...')
            if not os.path.exists(self.__filepath):
                os.makedirs(self.__filepath)
            # The search may yield fewer urls than requested, so iterate over
            # what is actually available instead of indexing up to the
            # requested count.
            wanted=max(self.__number,0)
            image_urls=self.__image_url_list[:wanted]
            if len(image_urls) < wanted:
                print("Only {0} image(s) found for {1}.".format(len(image_urls),self.__keyword))
            for i,image_url in enumerate(image_urls) :
                if self.__verbose :
                    print("Dwonload images: {}".format(image_url))
                extension='.'+image_url.split('.')[-1]
                filename=os.path.join(self.__filepath,'{0}{1}'.format(self.__keyword,i)+extension)
                try:
                    # The timeout keeps one stalled server from hanging the run.
                    download_rq=requests.get(image_url,timeout=10)
                    # Do not store an HTTP error page under an image name.
                    download_rq.raise_for_status()
                    with open(filename,'wb') as f :
                        f.write(download_rq.content)
                except (requests.RequestException,OSError) as e:
                    print(e)
                # pause between two downloads
                time.sleep(1)
            print("Images about {} have been downloaded.".format(self.__keyword))

        def run(self) :
            """Collect the image urls and download the images."""
            self.__download()
    
        # @downloader_logger
        # def image_download(self,filename,image_url):
        #     download_rq=requests.get(image_url)
        #     with open(filename,'wb') as f :
        #                 f.write(download_rq.content)
    
    
    def get_parser() :
        """Build the command-line parser of the downloader.

        Returns:
            An argparse.ArgumentParser accepting any number of positional
            keywords plus the -t/--type, -n/--num, -c/--clear, -d/--dir and
            -v/--verbose options.
        """
        cli = argparse.ArgumentParser(
            description="download interesting emoj images from www.doutula.com via command line")
        cli.add_argument('keywords',metavar='KEYWORD',type=str,nargs='*',
            help='the keywords to be searched')

        # (flags, settings) pairs, registered in this order.
        option_specs = (
            (('-t','--type'),
             dict(type=int,default=0,choices=range(0,3),
                  help='choose image type to be downloaded. 0 represents all, 1 represents GIF , 2 represents static image')),
            (('-n','--num'),
             dict(type=int,default=50,
                  help='number of images to be downloaded')),
            (('-c','--clear'),
             dict(action='store_true',
                  help='enable clear the log file')),
            (('-d','--dir'),
             dict(type=str,
                  help='where to store the images, default is ./tmp/keyword/')),
            (('-v','--verbose'),
             dict(action='store_true',
                  help='enable show the whole downloading info')),
        )
        for flags, settings in option_specs :
            cli.add_argument(*flags,**settings)

        return cli
    
    def download(**kwargs):
        """Run one Downloader per keyword.

        Reads the 'keywords', 'dir', 'num', 'type' and 'verbose' entries of
        ``kwargs``. Images of a keyword go to ``<dir>/<keyword>`` when 'dir'
        is set and to ``./tmp/<keyword>`` otherwise.
        """
        for keyword in kwargs['keywords'] :
            # Looked up per keyword, so no entry is touched when there is
            # nothing to download.
            base_dir=kwargs['dir']
            dirpath=(base_dir+"/"+keyword) if base_dir else ('./tmp/'+keyword)
            print('Making dir:',dirpath)
            Downloader(kwargs['num'],keyword,kwargs['type'],dirpath,kwargs['verbose']).run()
    
    def command_line_runner():
        """Parse the command line and start the requested downloads.

        With --clear the log file is emptied first. Without any keyword the
        help text is printed and nothing is downloaded.
        """
        cli=get_parser()
        options=vars(cli.parse_args())

        if options['clear']:
            with open('./biaoqingDownloader.log','w') as log_file:
                log_file.truncate()

        if options['keywords']:
            download(**options)
        else:
            # no keyword given: show the usage information instead
            cli.print_help()
    
    
    
    if __name__ == '__main__':
    
        # Manual check of the Downloader class without the command line,
        # kept for reference:
        # downloader=Downloader(20,'金馆长',0,'./tmp',False)
        # downloader.run()
        # Script entry point: parse the command-line arguments and run.
        command_line_runner()
    

    代码已上传至Github,link
    使用方法:python downloader.py -h

    世事茫茫,光阴何其有限!
  • 相关阅读:
    redis client 2.0.0 pipeline 的list的rpop bug
    Python解释器镜像源修改
    全连接层
    测试(张量)- 实战
    数据加载
    Python之微信-微信好友头像合成
    高阶操作
    MYSQL 查询缓存
    SQL Server 查看指定表上的索引
    MYSQL 查看表上索引的 1 方法
  • 原文地址:https://www.cnblogs.com/bobliao/p/9205172.html
Copyright © 2020-2023  润新知