• 【解决方案】Java获取文件字符集格式


    背景:

    Excel格式,不管是 .xls 还是 .xlsx,每个单元格(cell)都有容量限制,最多只能容纳32767个字符,不满足我们的需求,所以我们使用逗号分隔值文件,即 .csv 格式,其本质是以纯文本形式存储表格数据。
    但是在使用csv文件进行数据的导入导出过程中发现,如果将下载下来的csv文件进行修改保存,再导入,会有乱码问题。原因是对文件修改的同时,也修改了它的编码格式。
    以下是为了解决编码格式问题引入的字符集工具类,通过获取导入文件的字符集类型,再在解析过程前指定字符集就能解析出非乱码的数据。

    package com.example.test;
    
    import lombok.Cleanup;
    import lombok.NonNull;
    import lombok.extern.slf4j.Slf4j;
    import org.apache.commons.lang3.StringUtils;
    
    import java.io.*;
    import java.util.BitSet;
    
    /**
     *  @Description: 编码集工具类
     *  @author miaoying
     *  @date 2020/9/24
     */
    @Slf4j
    public class EncodeUtil {
        private static int BYTE_SIZE = 8;
        public static String CODE_UTF8 = "UTF-8";
        public static String CODE_UTF8_BOM = "UTF-8_BOM";
        public static String CODE_GBK = "GBK";
        public static String CODE_UNICODE = "Unicode";
        public static String CODE_UTF16 = "UTF-16";
    
        /**
         * 通过文件获取编码集名称
         *
         * @param file
         * @param ignoreBom
         * @return
         * @throws Exception
         */
        public static String getEncode(File file, boolean ignoreBom) throws Exception {
            BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
            return getEncode(bis, ignoreBom);
        }
    
        /**
         * 通过文件缓存流获取编码集名称
         *
         * @param bis
         * @return
         * @throws Exception
         */
        public static String getEncode(@NonNull BufferedInputStream bis, boolean ignoreBom) throws Exception {
            bis.mark(0);
    
            String encodeType = StringUtils.EMPTY;
            byte[] head = new byte[3];
            bis.read(head);
            if (head[0] == -1 && head[1] == -2) {
                encodeType = CODE_UTF16;
            } else if (head[0] == -2 && head[1] == -1) {
                encodeType = CODE_UNICODE;
            } //带BOM
            else if (head[0] == -17 && head[1] == -69 && head[2] == -65) {
                if (ignoreBom) {
                    encodeType = CODE_UTF8;
                } else {
                    encodeType = CODE_UTF8_BOM;
                }
            } else if (CODE_UNICODE.equals(encodeType)) {
                encodeType = CODE_UTF16;
            } else if (isUTF8(bis)) {
                encodeType = CODE_UTF8;
            } else {
                encodeType = CODE_GBK;
            }
            log.info("encodeType : " + encodeType);
            return encodeType;
        }
    
        /**
         * 是否是无BOM的UTF8格式,不判断常规场景,只区分无BOM UTF8和GBK
         *
         * @param bis
         * @return
         */
        private static boolean isUTF8(@NonNull BufferedInputStream bis) throws Exception {
            bis.reset();
            int code = bis.read();
            do {
                BitSet bitSet = convert2BitSet(code);
                //判断是否为单字节
                if (bitSet.get(0)) {
                    //多字节时,再读取N个字节
                    if (!checkMultiByte(bis, bitSet)) {
                        return false;
                    }
                } else {
                    //单字节时什么都不用做,再次读取字节
                }
                code = bis.read();
            } while (code != -1);
            return true;
        }
    
        /**
         * 检测多字节,判断是否为utf8,已经读取了一个字节
         *
         * @param bis
         * @param bitSet
         * @return
         */
        private static boolean checkMultiByte(@NonNull BufferedInputStream bis, @NonNull BitSet bitSet) throws Exception {
            int count = getCountOfSequential(bitSet);
            //已经读取了一个字节,不能再读取
            byte[] bytes = new byte[count - 1];
            bis.read(bytes);
            for (byte b : bytes) {
                if (!checkUtf8Byte(b)) {
                    return false;
                }
            }
            return true;
        }
    
        /**
         * 检测单字节,判断是否为utf8
         *
         * @param b
         * @return
         */
        private static boolean checkUtf8Byte(byte b) {
            BitSet bitSet = convert2BitSet(b);
            return bitSet.get(0) && !bitSet.get(1);
        }
    
        /**
         * 检测bitSet中从开始有多少个连续的1
         *
         * @param bitSet
         * @return
         */
        private static int getCountOfSequential(@NonNull BitSet bitSet) {
            int count = 0;
            for (int i = 0; i < BYTE_SIZE; i++) {
                if (bitSet.get(i)) {
                    count++;
                } else {
                    break;
                }
            }
            return count;
        }
    
    
        /**
         * 将整形转为BitSet
         *
         * @param code
         * @return
         */
        private static BitSet convert2BitSet(int code) {
            BitSet bitSet = new BitSet(BYTE_SIZE);
    
            for (int i = 0; i < BYTE_SIZE; i++) {
                int tmp3 = code >> (BYTE_SIZE - i - 1);
                int tmp2 = 0x1 & tmp3;
                if (tmp2 == 1) {
                    bitSet.set(i);
                }
            }
            return bitSet;
        }
    
        /**
         * 将一指定编码的文件转换为另一编码的文件
         *
         * @param oldFullFileName
         * @param oldCharsetName
         * @param newFullFileName
         * @param newCharsetName
         */
        public static void convert(String oldFullFileName, String oldCharsetName, String newFullFileName, String newCharsetName) throws Exception {
            log.info("the old file name is : {}, The oldCharsetName is : {}", oldFullFileName, oldCharsetName);
            log.info("the new file name is : {}, The newCharsetName is : {}", newFullFileName, newCharsetName);
    
            StringBuffer content = new StringBuffer();
    
            @Cleanup
            BufferedReader bin = new BufferedReader(new InputStreamReader(new FileInputStream(oldFullFileName), oldCharsetName));
            String line;
            while ((line = bin.readLine()) != null) {
                content.append(line);
                content.append(System.getProperty("line.separator"));
            }
            newFullFileName = newFullFileName.replace("\", "/");
            File dir = new File(newFullFileName.substring(0, newFullFileName.lastIndexOf("/")));
            if (!dir.exists()) {
                dir.mkdirs();
            }
            @Cleanup
            Writer out = new OutputStreamWriter(new FileOutputStream(newFullFileName), newCharsetName);
            out.write(content.toString());
        }
    
    }

  • 相关阅读:
    数据库连接池大小
    java的关闭钩子(Shutdown Hook)
    为线程池中的每个线程设置UncaughtExceptionHandler
    java 线程的interrupt和sleep、wait
    中断
    NIO
    VMware 安装 VMware Tools 工具
    php 雪花算法
    python 执行系统文件
    php curl 获取响应头
  • 原文地址:https://www.cnblogs.com/miaoying/p/13724420.html
Copyright © 2020-2023  润新知