• Python single-threaded image downloader


    import urllib.request
    import urllib.parse
    import urllib.error
    import re
    import os
    import ssl
    
    # Ignore HTTPS certificate verification so image requests don't fail on SSL errors
    ssl._create_default_https_context = ssl._create_unverified_context
    
    path = "./images"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "referer": "https://www.mzitu.com/xinggan/"
    }
    
    
    def handler_request(url, pageIndex):
        url = url + str(pageIndex)
        # Build the request object
        request = urllib.request.Request(url=url, headers=headers)
    
        return request
    
    
    def get_images_url(content, basePath):
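        # Extract (alt text, gallery link) pairs from the listing page, then download each gallery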
        patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?', re.S)
        patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?', re.S)
        alts = patternNames.findall(content)
        hrefs = patternHrefs.findall(content)
        image_map = {}
        for i in range(len(hrefs)):
            key = alts[i][1: len(alts[i]) - 1]
            image_map[key] = hrefs[i]
    
        for item in image_map.items():
            image_category_response(item, basePath)
    
    
    def image_category_response(item, basePath):
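        # item is an (alt text, quoted gallery URL) pair: create a folder for it, then walk its numbered pages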
        alt = item[0]
        save_folder = os.path.join(basePath, alt)
        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
    
        # Strip the surrounding quotes from the captured href
        baseurl = item[1][1: len(item[1]) - 1]
        pageCount = 1000
        try:
            # Gallery pages are numbered starting from 1; stop at the first page that fails to load
            for pageIndex in range(1, pageCount + 1):
                page_url = baseurl + "/" + str(pageIndex)
                try:
                    # Build the request object
                    request = urllib.request.Request(url=page_url, headers=headers)
                    # Send the request
                    response = urllib.request.urlopen(request)
                    content = response.read().decode()
                    imgPattern = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?', re.S)
                    imgUrl = imgPattern.findall(content)
                    download_images(imgUrl[0], save_folder)
                except urllib.error.URLError:
                    # Requesting past the last page raises URLError; use it to break out of both loops
                    raise TypeError("Maximum page number: {0}".format(pageIndex - 1))
        except Exception as e:
            print(e)
    
    
    def download_images(url, save_path):
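        # Strip the surrounding quotes from the captured URL, fetch the image, and save it under its original filename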
        url = url[1: len(url) - 1]
        print(url)
        # Build the request object
        request = urllib.request.Request(url=url, headers=headers)
        # Send the request
        response = urllib.request.urlopen(request)
    
        filename = url.split('/')[-1]
        with open(os.path.join(save_path, filename), 'wb') as fb:
            fb.write(response.read())
    
    
    def parse_pages(content):
        print(content)
    
    
    def main():
        url = 'https://www.mzitu.com/xinggan/page/'
        start_page = int(input("Enter the start page number: "))
        end_page = int(input("Enter the end page number: "))
        # Create the root folder
        if not os.path.exists(path):
            os.mkdir(path)
    
        for pageIndex in range(start_page, end_page + 1):
    
            print("...........开始下载第{0}页".format(pageIndex))
            # 创建文件夹
            save_path = create_folder(pageIndex)
            # 生成request
            request = handler_request(url, pageIndex)
            # 发送请求对象,获取相应内容
            response = urllib.request.urlopen(request)
            content = response.read().decode()
            # 解析内容,提取图片并且下载
            get_images_url(content, save_path)
    
            print("...........结束下载第{0}页".format(pageIndex))
    
    
    def create_folder(pageIndex):
        save_path = os.path.join(path, str(pageIndex))
        if not os.path.exists(save_path):
            os.mkdir(save_path)
    
        return save_path.replace("\\", "/") + "/"
    
    
    if __name__ == "__main__":
        main()
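
To make the screen-scraping step easier to follow, here is a minimal, self-contained sketch that runs the same two regular expressions from get_images_url against a hand-written list-item snippet (the HTML fragment is an assumption, written to resemble the category markup the script expects, not copied from the site). It shows that both captured groups keep their surrounding quotes, which is why the script strips the first and last character before using them:

    import re

    # Hypothetical markup resembling one <li> entry on a category page (assumption)
    html = ("<li><a href='https://www.mzitu.com/12345' target='_blank'>"
            "<img src='cover.jpg' alt='Sample' width='236' /></a></li>")

    patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?', re.S)
    patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?', re.S)

    alts = patternNames.findall(html)    # ["'Sample'"] (quotes still attached)
    hrefs = patternHrefs.findall(html)   # ["'https://www.mzitu.com/12345'"]

    # Strip the quotes the same way the script does before using the values
    print(alts[0][1:-1], hrefs[0][1:-1])  # Sample https://www.mzitu.com/12345

Note that because the alt capture stops at the first space, alt text containing spaces would be truncated; the snippet uses a single-word alt so the pattern matches cleanly.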
• Original article: https://www.cnblogs.com/KruceCoder/p/12076682.html