• python多线程采集图片


    cmd中运行

    >python untitled2.py <要采集图片的网页URL>

    import requests 
    import threading 
    from bs4 import BeautifulSoup 
    import sys 
    import os 
    
    # The script takes exactly one positional argument: the URL of the page
    # whose images should be downloaded.
    if len(sys.argv) != 2:
        print("Usage : ")
        print(" python main.py [URL]")
        # sys.exit() is the supported way to end a script; the bare exit()
        # helper is installed by the `site` module for interactive sessions
        # and is not defined when the interpreter is started with -S.
        sys.exit(1)
    # config-start
    url = sys.argv[1]  # page to scrape, taken from the command line
    threadNumber = 20  # number of download threads allowed to run at once
    # config-end
    def getContent(url, timeout=10):
        """Fetch *url* and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely.

        Returns:
            The decoded page body. If the request fails (connection error,
            timeout, invalid URL, non-2xx status) the error is printed and
            its message is returned instead, so the caller always gets a str.
        """
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            # Decode with the encoding detected from the body itself rather
            # than the one derived from the HTTP headers.
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException as e:
            print(e)
            return str(e)
    def getTitle(soup):
        """Return the page title, or ``"UnTitled"`` when none is usable.

        Args:
            soup: Parsed document exposing a ``title`` attribute whose
                ``string`` attribute holds the title text.

        Returns:
            The title text as a plain ``str``. ``"UnTitled"`` is returned when
            the document has no title element, or when the element's
            ``string`` is ``None`` or empty, so callers never receive ``None``.
        """
        title_tag = soup.title
        if title_tag is None:
            return "UnTitled"
        text = title_tag.string
        if not text:
            return "UnTitled"
        return str(text)
    def getImageLinks(soup, baseUrl=None):
        """Collect absolute http(s) URLs of every ``<img>`` in *soup*.

        Args:
            soup: Parsed document; its ``find_all("img")`` result is iterated
                and each item's ``src`` is read with ``.get("src")``.
            baseUrl: URL that relative ``src`` values are resolved against.
                Defaults to the module-level ``url`` (the page being scraped).

        Returns:
            A list of absolute image URLs in document order. Images without a
            ``src`` and sources that do not resolve to an http/https URL
            (for example ``data:`` URIs) are skipped.
        """
        from urllib.parse import urljoin, urlsplit

        if baseUrl is None:
            baseUrl = url  # page URL supplied on the command line
        result = []
        for img in soup.find_all("img"):
            src = (img.get("src") or "").strip()
            if not src:
                continue
            # urljoin handles absolute, root-relative ("/a.png"),
            # page-relative ("img/a.png") and protocol-relative ("//cdn/a.png")
            # sources; plain string concatenation gets the last three wrong.
            absolute = urljoin(baseUrl, src)
            if urlsplit(absolute).scheme in ("http", "https"):
                result.append(absolute)
        return result
    def makeDirectory(dicName):
        """Create the directory *dicName* if it does not already exist.

        Missing parent directories are created as well. Calling this for a
        directory that already exists is a no-op. Letting ``os.makedirs``
        handle the "already exists" case avoids the race between a separate
        existence check and the creation call.

        Raises:
            FileExistsError: If *dicName* exists but is not a directory.
        """
        os.makedirs(dicName, exist_ok=True)
    def downloadImage(imgUrl, savePath, timeout=30):
        """Download *imgUrl* into the directory *savePath*.

        The file name is the last ``/``-separated piece of the URL passed
        through ``formatFileName``; if that piece is empty (URL ends with a
        slash) the name ``"unnamed"`` is used.

        Args:
            imgUrl: Absolute URL of the image.
            savePath: Existing directory to write the file into.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx response, so an error page is never saved as an image.
            OSError: If the target file cannot be written.
        """
        local_filename = formatFileName(imgUrl.split('/')[-1]) or "unnamed"
        target = os.path.join(savePath, local_filename)
        # Both the HTTP response and the file are context-managed so they are
        # released even if the transfer fails midway.
        with requests.get(imgUrl, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(target, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
    # Maps every character that is reserved in Windows file names
    # (/ \ : * ? " > < |), plus the space character, to an underscore.
    _UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '/\\:*?"><| '})

    def formatFileName(fileName):
        """Return *fileName* with unsafe characters replaced by ``_``.

        Replaced characters: ``/ \\ : * ? " > < |`` and the space character.
        All other characters are left untouched.

        Args:
            fileName: Candidate file or directory name.

        Returns:
            The sanitized name, same length as the input.
        """
        return fileName.translate(_UNSAFE_FILENAME_CHARS)
    def threadFunction(imgSrc,directoryName):
        """Thread body: download the image at *imgSrc* into *directoryName*."""
        downloadImage(imgUrl=imgSrc, savePath=directoryName)
        
    class myThread(threading.Thread):
        """Thread that downloads one image when started."""

        def __init__(self, imgSrc, directoryName):
            """Remember the image URL and the directory it should be saved in."""
            super().__init__()
            self.directoryName = directoryName
            self.imgSrc = imgSrc

        def run(self):
            """Hand the stored URL and directory to ``threadFunction``."""
            threadFunction(self.imgSrc, self.directoryName)
    def getPrefix(url):
        """Return the first four ``/``-separated pieces of *url*, each followed by ``/``.

        Example: ``"http://domain/xxx.jpg"`` -> ``"http://domain/xxx.jpg/"``.
        """
        leading_parts = url.split("/")[:4]
        return "/".join(leading_parts) + "/"
    def getDomain(url):
        """Return the first three ``/``-separated pieces of *url*, each followed by ``/``.

        Example: ``"http://example.com/a/b"`` -> ``"http://example.com/"``.
        """
        pieces = url.split("/")
        head = pieces[:3]
        return "/".join(head) + "/"
    # Fetch and parse the page, then derive the image list and output folder.
    content = getContent(url)
    prefix = getPrefix(url)
    domain = getDomain(url)
    soup = BeautifulSoup(content, "html.parser")
    images = getImageLinks(soup)
    title = getTitle(soup)
    title = formatFileName(title)  # the title doubles as the folder name
    print(u"页面标题 : " , title )
    print(u"本页图片数量 :",len(images))
    print(u"正在创建文件夹以用来保存所有图片")
    makeDirectory(title)
    # One worker thread per image; they are started below under a
    # concurrency cap of threadNumber.
    threads = []
    for image in images:
        print(u"图片地址 : " + image)
        threads.append(myThread(image, title))
    limit = max(1, threadNumber)  # guard against a non-positive setting
    running = []
    for t in threads:
        t.start()
        running.append(t)
        # Throttle by blocking on the oldest worker instead of spinning on
        # threading.enumerate(), which kept a CPU core busy while waiting.
        while len(running) >= limit:
            running[0].join(0.1)
            running = [w for w in running if w.is_alive()]
    print(u"所有图片已加入下载队列 ! 正在下载...")
    # Wait explicitly for the remaining downloads to finish.
    for t in threads:
        t.join()
    

    关注公众号:Python爬虫数据分析挖掘,学习更多python知识

    耐得住寂寞,才能登得顶
    Gitee码云:https://gitee.com/lyc96/projects
  • 相关阅读:
    215. Kth Largest Element in an Array(partition逆序排序,index+1 == k)
    220. Contains Duplicate III(核心:set数组有序/桶排序)
    leetcode 772 基本计算器III(包含+-*/ 以及括号) 核心在于递归
    MTK8312 android 4.4 显示虚拟按键区源码修改
    高通android9.0 camera API1底层调用为HAL3而非HAL1
    使用yanzhenjie的Zbar Github项目时4.2版本上找不到so库的问题
    android studio CMake NDK:配置笔记
    android设置系统默认开机时间
    android开发里跳过的坑——GridView使用Glide加载图片不显示
    android系统编译打开系统蓝牙
  • 原文地址:https://www.cnblogs.com/chenlove/p/14038666.html
Copyright © 2020-2023  润新知