#!/usr/bin/python
# coding:utf8
"""Convert every file under a directory tree to a target text encoding.

Run as a script with exactly one argument (the directory to scan). Each file
found is converted to GBK and written to ``gbk_file_data/gbk_<basename>``.

The code avoids syntax that is specific to either Python 2 or Python 3.
"""
import codecs
import logging
import os
import sys
import traceback

import chardet
from chardet.universaldetector import UniversalDetector

# Number of bytes read per iteration when streaming a file, so that large
# files are never loaded into memory in one piece.
_CHUNK_SIZE = 1000000

# Encoding assumed when chardet cannot make any guess.
_FALLBACK_ENCODING = 'utf-8'


# Directory traversal
def get_all_file_path(path, all_file_path):
    """Recursively collect the paths of all non-directory entries under ``path``.

    :param path: directory to scan
    :param all_file_path: list that receives the discovered paths; it is
        mutated in place
    :return: ``all_file_path``, or a new empty list when ``path`` is not a
        directory
    """
    if not os.path.isdir(path):
        print("%s该文件路径不存在" % (path))
        return []
    # Sorted so the result does not depend on the file system's listing order.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Recurse into sub-directories; everything else is recorded as a file.
        if os.path.isdir(filepath):
            get_all_file_path(filepath, all_file_path)
        else:
            all_file_path.append(filepath)
    return all_file_path


def _detect_file_encoding(file_path):
    """Return chardet's encoding guess for a file, reading it chunk by chunk.

    The detector is fed until it reports that it is confident (or the file
    ends), so the guess is based on the file as a whole rather than on one
    arbitrary slice of it. Falls back to ``_FALLBACK_ENCODING`` when chardet
    has no guess (for example for an empty file).
    """
    detector = UniversalDetector()
    with open(file_path, 'rb') as source:
        for chunk in iter(lambda: source.read(_CHUNK_SIZE), b''):
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result.get('encoding') or _FALLBACK_ENCODING


# Whole-file conversion
def imp_file_encode(file_path, final_file_name, target_code):
    """Convert a file to ``target_code``, loading its whole content into memory.

    Decoding and encoding are strict: any character that cannot be converted
    makes the call fail.

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to
    :param target_code: name of the target encoding
    :return: True on success; False if any step raised (the traceback is
        printed)
    """
    try:
        # Binary mode: the raw bytes are needed for detection and decoding.
        with open(file_path, 'rb') as source:
            raw_content = source.read()
        source_code = (chardet.detect(raw_content).get('encoding')
                       or _FALLBACK_ENCODING)
        # Bytes must be decoded to text before they can be re-encoded.
        converted = raw_content.decode(source_code).encode(target_code)
        with open(final_file_name, 'wb') as fp:
            fp.write(converted)
        return True
    except Exception:
        traceback.print_exc()
        return False


# Streaming conversion, suitable for large files
def file_encode(file_path, final_file_name, target_code):
    """Convert a file to ``target_code`` by streaming it in fixed-size chunks.

    The source encoding is detected once for the whole file and the data is
    passed through incremental codecs, so a multi-byte character that
    straddles a chunk boundary is converted correctly. Characters that cannot
    be decoded or encoded are dropped. An existing output file is overwritten.

    :param file_path: path of the file to convert
    :param final_file_name: path the converted content is written to; missing
        parent directories are created
    :param target_code: name of the target encoding
    :return: True on success; False if any step raised (the traceback is
        printed)
    """
    try:
        # A byte-string path is re-encoded into the target encoding, as the
        # original implementation did; text (unicode) paths are used as is.
        if isinstance(final_file_name, bytes):
            name_code = (chardet.detect(final_file_name).get('encoding')
                         or _FALLBACK_ENCODING)
            final_file_name = final_file_name.decode(
                name_code, 'ignore').encode(target_code, 'ignore')

        target_dir = os.path.dirname(final_file_name)
        print(target_dir)
        # An empty dirname means "current directory": nothing to create.
        if target_dir and not os.path.isdir(target_dir):
            try:
                os.makedirs(target_dir)
            except OSError as e:
                # Best effort: opening the output below reports the failure.
                print("Can not create dir: %s" % e)

        source_code = _detect_file_encoding(file_path)
        decoder = codecs.getincrementaldecoder(source_code)(errors='ignore')
        encoder = codecs.getincrementalencoder(target_code)(errors='ignore')

        with open(file_path, 'rb') as source:
            with open(final_file_name, 'wb') as target:
                sequence = 0
                while True:
                    print("正在转化.......... %d" % sequence)
                    sequence += 1
                    chunk = source.read(_CHUNK_SIZE)
                    # An empty read means end of file.
                    if not chunk:
                        break
                    target.write(encoder.encode(decoder.decode(chunk)))
                # Flush whatever the codecs are still buffering.
                tail = decoder.decode(b'', final=True)
                target.write(encoder.encode(tail, final=True))
        return True
    except Exception:
        traceback.print_exc()
        return False


def main():
    """Convert every file under ``sys.argv[1]`` to GBK in ``gbk_file_data/``."""
    if not os.path.isdir('gbk_file_data'):
        try:
            os.mkdir('gbk_file_data')
        except OSError as e:
            print("Can not create dir: %s" % e)

    if len(sys.argv) != 2:
        logging.error("Please input file path!")
        sys.exit(1)

    all_file_path = get_all_file_path(sys.argv[1], [])
    for file_path in all_file_path:
        # NOTE: the output name uses only the base name, so files that share
        # a name in different directories map to the same output file.
        target_name = 'gbk_file_data/' + 'gbk_' + os.path.basename(file_path)
        if file_encode(file_path, target_name, "gbk"):
            print("%s--转码成功" % (file_path))
        else:
            print("%s--转码失败" % (file_path))


# Example: convert to GBK
if __name__ == '__main__':
    main()