• 检测编码


    /**
     * Guesses the text encoding of a file from its leading bytes.
     *
     * <p>The first three bytes are checked for a UTF-16LE, UTF-16BE or UTF-8
     * byte order mark. When no BOM is present, up to the first 4096 bytes are
     * scanned for the byte sequences that encode the very common Chinese
     * character U+7684 in GBK, UTF-8, UTF-16LE, UTF-16BE and Big5; the
     * encoding whose sequence occurs strictly more often than every other
     * candidate wins.
     *
     * <p>The method repositions the file pointer of {@code file} and does not
     * restore it.
     *
     * @param file the file to inspect
     * @return the detected encoding; {@link Encoding#GBK} when the file is
     *         shorter than three bytes, when no candidate has a strict
     *         majority, or when reading fails
     */
    public static Encoding determineEncoding(RandomAccessFile file) {
        Encoding enc = Encoding.GBK;
        try {
            file.seek(0);
            if (file.length() < 3) {
                return enc;
            }
            byte[] bom = new byte[3]; // byte order mark
            // readFully guarantees all three bytes are present; a plain read() may return fewer.
            file.readFully(bom);
            int b0 = bom[0] & 0xFF;
            int b1 = bom[1] & 0xFF;
            int b2 = bom[2] & 0xFF;

            if (b0 == 0xFF && b1 == 0xFE) {
                enc = Encoding.UTF16LE;
            } else if (b0 == 0xFE && b1 == 0xFF) {
                enc = Encoding.UTF16BE;
            } else if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
                enc = Encoding.UTF8;
            } else {
                // No BOM: fall back to a frequency heuristic over the head of the file.
                file.seek(0);
                byte[] bs = new byte[4096];
                int filled = 0;
                while (filled < bs.length) {
                    int n = file.read(bs, filled, bs.length - filled);
                    if (n < 0) {
                        break; // end of file
                    }
                    filled += n;
                }

                int gbkCount = 0;
                int big5Count = 0;
                int utf16leCount = 0;
                int utf16beCount = 0;
                int utf8Count = 0;

                // Count occurrences of the Chinese character U+7684 as encoded in each candidate charset.
                for (int i = 0; i + 1 < filled; ++i) {
                    int c0 = bs[i] & 0xFF;
                    int c1 = bs[i + 1] & 0xFF;
                    if (c0 == 0xB5 && c1 == 0xC4) {
                        ++gbkCount;
                        ++i; // skip the second byte of the match
                    } else if (c0 == 0xE7 && c1 == 0x9A && i + 2 < filled && (bs[i + 2] & 0xFF) == 0x84) {
                        ++utf8Count;
                        i += 2; // skip the remaining two bytes of the match
                    } else if (c0 == 0x84 && c1 == 0x76) {
                        ++utf16leCount;
                        ++i;
                    } else if (c0 == 0x76 && c1 == 0x84) {
                        ++utf16beCount;
                        ++i;
                    } else if (c0 == 0xAA && c1 == 0xBA) {
                        ++big5Count;
                        ++i;
                    }
                }

                // Pick the candidate whose count is strictly greater than all others;
                // on a tie (including all zeros) keep the GBK default.
                Encoding[] candidates = {
                    Encoding.GBK, Encoding.UTF8, Encoding.UTF16LE, Encoding.UTF16BE, Encoding.BIG5
                };
                int[] counts = {gbkCount, utf8Count, utf16leCount, utf16beCount, big5Count};
                int best = 0;
                for (int k = 1; k < counts.length; ++k) {
                    if (counts[k] > counts[best]) {
                        best = k;
                    }
                }
                boolean unique = true;
                for (int k = 0; k < counts.length; ++k) {
                    if (k != best && counts[k] == counts[best]) {
                        unique = false;
                        break;
                    }
                }
                if (unique) {
                    enc = candidates[best];
                }
            }
        } catch (Exception ex) {
            // Best effort: detection failures fall back to the default, but keep the cause in the log.
            Log.e("File ERROR", "encoding detection failed.", ex);
        }
        return enc;
    }
    	


    /**
     * Text encodings recognised by the detector, each paired with its charset
     * name and the number of bytes that charset uses to encode the CJK
     * character U+4E2D.
     */
    public enum Encoding {
        GBK("GBK"),
        BIG5("BIG5"),
        UTF8("UTF-8"),
        UTF16BE("UTF-16BE"),
        UTF16LE("UTF-16LE"),
        UNKNOWN("UNKNOWN");

        private final String name;
        private final int maxCharLength;

        /**
         * @param name charset name passed to {@link String#getBytes(String)}
         */
        private Encoding(String name) {
            this.name = name;
            int length = 0;
            try {
                // Unicode escape for U+4E2D so the result does not depend on the
                // encoding the compiler assumes for this source file.
                length = "\u4e2d".getBytes(name).length;
            } catch (java.io.UnsupportedEncodingException ignored) {
                // Charset not available on this runtime (always the case for
                // "UNKNOWN"): leave the length at 0.
            }
            this.maxCharLength = length;
        }

        /**
         * @return the charset name this constant was created with
         */
        public String getName() {
            return name;
        }

        /**
         * @return the number of bytes needed to encode U+4E2D in this charset,
         *         or 0 when the charset is not supported
         */
        public int getMaxCharLength() {
            return maxCharLength;
        }
    }
    


  • 相关阅读:
    Emacs for OIer 的一些配置
    CF1336E Chiori and Doll Picking 【线性代数,组合计数】
    CF605E Intergalaxy Trips 【贪心,动态规划,期望】
    Luogu6329 【模板】点分树 | 震波
    [SDOI2014]数表
    [BZOJ4403]序列统计
    [BZOJ5099]Pionek
    SP1812 LCS2
    SA & SAM
    [HAOI2016]找相同字符
  • 原文地址:https://www.cnblogs.com/javawebsoa/p/3006013.html
Copyright © 2020-2023  润新知