• Python: unzipping zip archives and uploading to HDFS


    Unzipping

    Use Python's zipfile module
    Use the unzip utility that ships with Linux (a subprocess sketch follows below)
    
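    If the archives live on a Linux box, shelling out to the system unzip is an alternative to zipfile. A minimal sketch, assuming an Info-ZIP build whose -O option selects the member-name charset (common on Linux distributions, but not guaranteed everywhere); unzip_with_cli and the behavior notes are illustrative, not from the original script:

    import subprocess

    def unzip_with_cli(zip_path, target_dir, encoding="GBK"):
        # -o: overwrite without prompting; -O: member-name charset
        # (assumes the local unzip build supports -O); -d: destination
        subprocess.run(
            ["unzip", "-o", "-O", encoding, zip_path, "-d", target_dir],
            check=True,
        )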

    Example

    #!/usr/bin/env python3
    # -*- coding: UTF-8 -*-
    
    import pyhdfs
    import zipfile
    import os
    import os.path
    import pandas as pd
    
    def unzip_single_file(path_pair):
        # renamed from unzip_file: a second unzip_file below used to shadow
        # this definition; path_pair is (zip_path, target_dir)
        file, target = path_pair
        print("Unzipping %s into %s" % (file, target))
        try:
            # mode must be "r" to read an existing archive ("a" opens for append)
            with zipfile.ZipFile(file, mode="r") as f:
                f.extractall(target)  # extract everything into the target directory
        except Exception as e:
            print("Exception: %s" % e)
        # no finally/f.close() needed: the with-block closes the archive,
        # and f would be unbound here if ZipFile() itself raised
    
    def unzip_file_gbk(zip_path, target_dir):
        # unzip an archive whose member names are GBK-encoded
        print("Unzipping %s into %s" % (zip_path, target_dir))
        try:
            with zipfile.ZipFile(zip_path, mode="r") as f:
                for num, f_name in enumerate(f.namelist()):
                    # zipfile reports non-UTF-8 names decoded as cp437;
                    # re-encode to raw bytes, then decode them as GBK
                    new_f_name = f_name.encode("cp437").decode("gbk")
                    f.extract(f_name, path=target_dir)
                    os.rename(os.path.join(target_dir, f_name),
                              os.path.join(target_dir, new_f_name))
        except Exception as e:
            print("Exception: %s" % e)
    
    
    def unzip_file(src_leaf_dir, target_dir):
        # collect every *.zip directly under src_leaf_dir
        zip_file_names = [os.path.join(src_leaf_dir, file_name)
                          for file_name in os.listdir(src_leaf_dir)
                          if file_name.lower().endswith(".zip")]
        for num, zip_file in enumerate(zip_file_names):
            dataset_name = os.path.split(zip_file)[-1].split(".")[0]
            print(num, dataset_name)
            unzip_file_gbk(zip_file, target_dir)
    
    def get_dir_stats(file_dir):
        # map each directory under file_dir to the number of files it
        # contains directly (sub-directories are walked separately)
        result = {}
        for dirpath, dirnames, filenames in os.walk(file_dir):
            result[dirpath] = len(filenames)
        return result
    
    def get_satify_stats_dir(unzip_target_dir):
        # for each extracted dataset, find the sub-directory holding the
        # most files and record (dataset, directory, file count)
        result_list = []
        for num, zip_file in enumerate(os.listdir(unzip_target_dir)):
            new_dir = os.path.join(unzip_target_dir, zip_file)
            file_dir_nm = get_dir_stats(new_dir)
            # max() iterates over the dict's keys; key= is applied to each
            # one and the key with the largest value wins, i.e. this picks
            # the directory with the highest file count
            max_key = max(file_dir_nm, key=file_dir_nm.get)
            result_list.append([zip_file, max_key, file_dir_nm.get(max_key)])
        return result_list
    
    if __name__ == "__main__":
        # client = pyhdfs.HdfsClient(hosts="test", user_name="test")
        # step 1: unzip (already done, kept for reference)
        # zip_src_leaf_dir = r"D:\data\test\01"
        # unzip_target_dir = r'D:\data\test\data_unzip'
        # unzip_file(zip_src_leaf_dir, unzip_target_dir)
        unzip_dir = r'D:\data\test\data_unzip'
        src_dest = get_satify_stats_dir(unzip_dir)
        meta_file_nm = r"D:\data\test\group_result_01.txt"
        meta_df = pd.read_csv(meta_file_nm, sep="\t", encoding="utf8")
        file_res_ls = []
        for set_data in src_dest:
            # "集" is the dataset-name column in the metadata file
            satisfy_df = meta_df[meta_df["集"] == set_data[0]]
            satisfy_result = (set_data[0], set_data[1], set_data[2],
                              satisfy_df["hdfs_dir"].values[0],
                              satisfy_df["new_label"].values[0])
            print(satisfy_result)
            file_res_ls.append(satisfy_result)
        res_df = pd.DataFrame(file_res_ls,
                              columns=["data_nm", "src_dir", "cnt", "hdfs_dir", "new_label"])
        res_df.to_csv(r"D:\data\test\group_result_hdfs.txt",
                      index=False, header=True, sep="\t")
    
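    A note on the encode("cp437").decode("gbk") trick used above: the zip format stores non-UTF-8 member names as raw bytes, and Python's zipfile decodes them as cp437 whenever the archive's UTF-8 flag is unset. Re-encoding the reported name as cp437 recovers the original bytes, which can then be decoded with the real encoding. A minimal round trip (the sample string is illustrative only):

    # what the archive stores: raw GBK bytes of the member name
    raw = "数据集".encode("gbk")
    # what f.namelist() reports: those bytes mis-decoded as cp437
    as_seen = raw.decode("cp437")
    # the fix: back to the raw bytes, then decode with the real encoding
    fixed = as_seen.encode("cp437").decode("gbk")
    print(fixed)  # -> 数据集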

    HDFS file upload

     Use Hadoop's built-in command-line tool (hdfs dfs -put; a sketch follows below)
     Use pyhdfs
    
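    For the command-line route, a minimal sketch that shells out to hdfs dfs -put; it assumes a Hadoop client is installed and on PATH, and hdfs_put plus both paths are illustrative, not from the original code:

    import subprocess

    def hdfs_put(local_path, hdfs_dir):
        # -f overwrites the target if it already exists
        subprocess.run(["hdfs", "dfs", "-put", "-f", local_path, hdfs_dir],
                       check=True)

    hdfs_put("/data/test/group_result_hdfs.txt", "/user/test/data")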

    Code example

    import pyhdfs
    import os.path
    
    
    if __name__ == "__main__":
        client = pyhdfs.HdfsClient(hosts="test", user_name="test")
        file_nm = r"G:\data\test\group_result_hdfs.txt"
        with open(file=file_nm, mode='r', encoding="utf8") as f:
            # skip the header line with next()
            first_line = next(f)
            for file_num, data in enumerate(f):
                fields = data.rstrip("\r\n").split("\t")
                local_dir = fields[1]
                hdfs_dir = fields[3]
                if not client.exists(hdfs_dir):
                    print("creating directory", hdfs_dir)
                    client.mkdirs(hdfs_dir)
                if client.exists(hdfs_dir):
                    print("upload", local_dir, hdfs_dir)
                    for num, local_file in enumerate(os.listdir(local_dir)):
                        local_src_jpg = os.path.join(local_dir, local_file)
                        hdfs_src_jpg = hdfs_dir + "/" + local_file
                        client.copy_from_local(local_src_jpg, hdfs_src_jpg)
                        print(file_num, num, local_src_jpg, hdfs_src_jpg)
                else:
                    print(client.exists(hdfs_dir))
                print("done")
    

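    After the upload it is worth sanity-checking the target directory. A small sketch using pyhdfs's listdir (host, user, and path are the same placeholders as above):

    import pyhdfs

    client = pyhdfs.HdfsClient(hosts="test", user_name="test")
    hdfs_dir = "/user/test/data"  # placeholder target directory
    if client.exists(hdfs_dir):
        # listdir returns the entry names directly under hdfs_dir
        print(len(client.listdir(hdfs_dir)), "entries in", hdfs_dir)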
    References

     pyhdfs API documentation: https://pyhdfs.readthedocs.io/en/latest/pyhdfs.html
     Operating HDFS with Python: https://www.cnblogs.com/wangbin2188/p/14591230.html