• Android--推断文本文件编码



    方法1:利用windows文本文件编码特点。


    在 Windows 下,Unicode、Unicode big endian 和 UTF-8 编码的 txt 文件开头会多出几个字节(BOM),分别是 FF、FE(Unicode),FE、FF(Unicode big endian),EF、BB、BF(UTF-8)。


    /**
     * Guesses the character encoding of a text file.
     *
     * <p>The first three bytes are checked for a byte-order mark:
     * {@code FF FE} gives "UTF-16LE", {@code FE FF} gives "UTF-16BE" and
     * {@code EF BB BF} gives "UTF-8". When no BOM is present the file is
     * scanned from the beginning for the first well-formed three-byte UTF-8
     * sequence; if one is found before any byte that contradicts UTF-8, the
     * result is "UTF-8". In every other case (including an empty file, a
     * pure-ASCII file, or an I/O failure) the default "GBK" is returned.
     *
     * <p>Known limitation kept from the original heuristic: text made up only
     * of two-byte UTF-8 sequences, or starting with a four-byte sequence
     * (lead byte >= 0xF0), is reported as "GBK".
     *
     * @param file the file to inspect
     * @return "UTF-16LE", "UTF-16BE", "UTF-8" or the default "GBK"
     */
    public static String getCharset(File file) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        BufferedInputStream bis = null;
        try {
            bis = new BufferedInputStream(new FileInputStream(file));
            // Allow reset() after the BOM probe; a read limit of 0 only worked
            // by accident of the internal buffer size.
            bis.mark(first3Bytes.length);
            int read = bis.read(first3Bytes, 0, first3Bytes.length);
            if (read == -1) {
                // Empty file: nothing to analyse.
                return charset;
            }
            if (read >= 2 && first3Bytes[0] == (byte) 0xFF
                    && first3Bytes[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (read >= 2 && first3Bytes[0] == (byte) 0xFE
                    && first3Bytes[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            if (read >= 3 && first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                return "UTF-8";
            }

            // No BOM: rewind and scan the content byte by byte.
            bis.reset();
            int b;
            while ((b = bis.read()) != -1) {
                if (b >= 0xF0) {
                    break;
                }
                // A lone continuation byte (0x80-0xBF) is also treated as GBK.
                if (isUtf8Continuation(b)) {
                    break;
                }
                if (0xC0 <= b && b <= 0xDF) {
                    // Two-byte form (0xC0-0xDF)(0x80-0xBF) can also occur in
                    // GB encodings, so it is not conclusive: keep scanning.
                    if (isUtf8Continuation(bis.read())) {
                        continue;
                    }
                    break;
                } else if (0xE0 <= b && b <= 0xEF) {
                    // Three-byte form; a false positive is possible but unlikely.
                    if (isUtf8Continuation(bis.read())
                            && isUtf8Continuation(bis.read())) {
                        charset = "UTF-8";
                    }
                    break;
                }
            }
        } catch (Exception e) {
            // Best effort: report the problem and fall back to the default.
            e.printStackTrace();
        } finally {
            if (bis != null) {
                try {
                    bis.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return charset;
    }

    /** Returns true when b is in the UTF-8 continuation-byte range 0x80-0xBF (false for -1/EOF). */
    private static boolean isUtf8Continuation(int b) {
        return 0x80 <= b && b <= 0xBF;
    }
    

    缺点:不能用这种方式去探测 Linux 下的文件。


    方法2:开源项目 JCharDet


    http://www.iteye.com/topic/266501

    package org.mozilla.intl.chardet;
    
    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    
    /**
     * Detects the character set of a file with the help of JCharDet.
     *
     * <p>JCharDet is a Java port of Mozilla's automatic charset detection
     * algorithm; its home page is http://jchardet.sourceforge.net/
     *
     * <p>The detection result is kept in instance fields while a call is in
     * progress; they are reset at the start of every detection so an instance
     * can be reused for several files.
     *
     * @author icer
     * @date	2008/11/13
     */
    public class FileCharsetDetector {
    
    	/** Set to true once the detector has notified a matching charset (or the file is pure ASCII). */
    	private boolean found = false;
    
    	/**
    	 * Name of the charset reported by the detector. Stays null when no
    	 * detector matched (e.g. a binary file); in that case the first
    	 * probable charset, if any, is used instead.
    	 */
    	private String encoding = null;
    
    	/**
    	 * Command-line entry point: prints the detected encoding of the given
    	 * file, or a usage message when the argument count is wrong.
    	 *
    	 * @param argv {@code <path> [<languageHint>]}
    	 * @throws Exception if the file cannot be read or the hint is not a number
    	 */
    	public static void main(String[] argv) throws Exception {
    		if (argv.length != 1 && argv.length != 2) {
    
    			System.out
    					.println("Usage: FileCharsetDetector <path> [<languageHint>]");
    
    			System.out.println("");
    			System.out.println("Where <path> is d:/demo.txt");
    			System.out.println("For optional <languageHint>. Use following...");
    			System.out.println("		1 => Japanese");
    			System.out.println("		2 => Chinese");
    			System.out.println("		3 => Simplified Chinese");
    			System.out.println("		4 => Traditional Chinese");
    			System.out.println("		5 => Korean");
    			System.out.println("		6 => Dont know (default)");
    
    			return;
    		}
    
    		FileCharsetDetector detector = new FileCharsetDetector();
    		String encoding;
    		if (argv.length == 2) {
    			encoding = detector.guestFileEncoding(argv[0],
    					Integer.parseInt(argv[1]));
    		} else {
    			encoding = detector.guestFileEncoding(argv[0]);
    		}
    		System.out.println("文件编码:" + encoding);
    	}
    
    	/**
    	 * Detects the encoding of a file using a detector created without a
    	 * language hint.
    	 * 
    	 * @param file
    	 *            the file to inspect
    	 * @return the detected encoding, or null if none could be determined
    	 * @throws FileNotFoundException
    	 *             if the file cannot be opened
    	 * @throws IOException
    	 *             if reading the file fails
    	 */
    	public String guestFileEncoding(File file) throws FileNotFoundException,
    			IOException {
    		return detectFileEncoding(file, new nsDetector());
    	}
    
    	/**
    	 * Detects the encoding of a file using a language hint.
    	 * 
    	 * @param file
    	 *            the file to inspect
    	 * @param languageHint
    	 *            language hint code, e.g. 1 : Japanese; 2 : Chinese;
    	 *            3 : Simplified Chinese; 4 : Traditional Chinese;
    	 *            5 : Korean; 6 : Dont know (default)
    	 * @return the detected encoding, e.g. UTF-8, GBK, GB2312; null if none
    	 *         could be determined
    	 * @throws FileNotFoundException
    	 *             if the file cannot be opened
    	 * @throws IOException
    	 *             if reading the file fails
    	 */
    	public String guestFileEncoding(File file, int languageHint)
    			throws FileNotFoundException, IOException {
    		return detectFileEncoding(file, new nsDetector(languageHint));
    	}
    
    	/**
    	 * Detects the encoding of the file at the given path.
    	 * 
    	 * @param path
    	 *            path of the file to inspect
    	 * @return the detected encoding, e.g. UTF-8, GBK, GB2312; null if none
    	 *         could be determined
    	 * @throws FileNotFoundException
    	 *             if the file cannot be opened
    	 * @throws IOException
    	 *             if reading the file fails
    	 */
    	public String guestFileEncoding(String path) throws FileNotFoundException,
    			IOException {
    		return guestFileEncoding(new File(path));
    	}
    
    	/**
    	 * Detects the encoding of the file at the given path using a language
    	 * hint.
    	 * 
    	 * @param path
    	 *            path of the file to inspect
    	 * @param languageHint
    	 *            language hint code, e.g. 1 : Japanese; 2 : Chinese;
    	 *            3 : Simplified Chinese; 4 : Traditional Chinese;
    	 *            5 : Korean; 6 : Dont know (default)
    	 * @return the detected encoding, or null if none could be determined
    	 * @throws FileNotFoundException
    	 *             if the file cannot be opened
    	 * @throws IOException
    	 *             if reading the file fails
    	 */
    	public String guestFileEncoding(String path, int languageHint)
    			throws FileNotFoundException, IOException {
    		return guestFileEncoding(new File(path), languageHint);
    	}
    
    	/**
    	 * Feeds the file content to the given detector and returns the result.
    	 * 
    	 * @param file
    	 *            the file to inspect
    	 * @param det
    	 *            the detector to use
    	 * @return "ASCII" for a file the detector considers pure ASCII, the
    	 *         charset notified by the detector, otherwise the first
    	 *         probable charset; null if there is none
    	 * @throws FileNotFoundException
    	 *             if the file cannot be opened
    	 * @throws IOException
    	 *             if reading the file fails
    	 */
    	private String detectFileEncoding(File file, nsDetector det)
    			throws FileNotFoundException, IOException {
    		// Forget the result of any previous call on this instance.
    		found = false;
    		encoding = null;
    
    		// Set an observer...
    		// The Notify() will be called when a matching charset is found.
    		det.Init(new nsICharsetDetectionObserver() {
    			public void Notify(String charset) {
    				found = true;
    				encoding = charset;
    			}
    		});
    
    		boolean isAscii = true;
    		BufferedInputStream imp = new BufferedInputStream(new FileInputStream(
    				file));
    		try {
    			byte[] buf = new byte[1024];
    			int len;
    			boolean done = false;
    
    			while ((len = imp.read(buf, 0, buf.length)) != -1) {
    				// Check if the stream is only ascii.
    				if (isAscii)
    					isAscii = det.isAscii(buf, len);
    
    				// DoIt if non-ascii and not done yet.
    				if (!isAscii && !done)
    					done = det.DoIt(buf, len, false);
    
    				// Nothing further would be examined: stop reading early.
    				if (!isAscii && done)
    					break;
    			}
    		} finally {
    			// Always release the file handle, even when reading fails.
    			imp.close();
    		}
    		det.DataEnd();
    
    		if (isAscii) {
    			encoding = "ASCII";
    			found = true;
    		}
    
    		if (!found) {
    			String prob[] = det.getProbableCharsets();
    			if (prob.length > 0) {
    				// Nothing was notified: fall back to the first probable charset.
    				encoding = prob[0];
    			} else {
    				return null;
    			}
    		}
    		return encoding;
    	}
    }
    

    jar包下载地址:http://download.csdn.net/detail/u012587637/8047697



    方法3:开源项目 juniversalchardet


    http://code.google.com/p/juniversalchardet/ 


    /**
     * Detects the encoding of a file with juniversalchardet's
     * {@code UniversalDetector}.
     *
     * <p>Prints the outcome to standard output. A missing file is reported on
     * standard error. Any exception during detection is printed and results
     * in a null return value.
     *
     * @param file the file to inspect
     * @return the charset name reported by the detector, or null when the
     *         file does not exist, nothing was detected, or an error occurred
     */
    public static String getFileIncode(File file) {
    
    	if (!file.exists()) {
    		System.err.println("getFileIncode: file not exists!");
    		return null;
    	}
    
    	byte[] buf = new byte[4096];
    	FileInputStream fis = null;
    	try {
    		fis = new FileInputStream(file);
    		// (1) Create the detector.
    		// NOTE(review): the null argument is presumably "no listener" - confirm against the library docs.
    		UniversalDetector detector = new UniversalDetector(null);
    
    		// (2) Feed data until the file ends or the detector reports it is done.
    		int nread;
    		while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
    			detector.handleData(buf, 0, nread);
    		}
    		// (3) Signal end of data.
    		detector.dataEnd();
    
    		// (4) Read the result.
    		String encoding = detector.getDetectedCharset();
    		if (encoding != null) {
    			System.out.println("Detected encoding = " + encoding);
    		} else {
    			System.out.println("No encoding detected.");
    		}
    
    		// (5) Reset the detector.
    		detector.reset();
    		return encoding;
    	} catch (Exception e) {
    		e.printStackTrace();
    	} finally {
    		// Release the file handle on every path, including failures.
    		if (fis != null) {
    			try {
    				fis.close();
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    		}
    	}
    
    	return null;
    }


    引入包的方法:

    将包放入libs目录。

    选中包,右键 --> build path--> add to build path。


    jar包下载:http://download.csdn.net/detail/u012587637/8041181


    说明:第三个方法要比第二个速度快些,也比较新,所以推荐使用第三个。



  • 相关阅读:
    Demo学习: DownloadDemo
    Demo学习: FileUpload
    Demo学习: Dialogs Anonymous Callback
    Demo学习: Cookies Demo
    Demo学习: CustomException
    Demo学习: Collapsible Panels
    Demo学习: ColumnSort
    Demo学习: ClientInfo
    Demo学习: Closable Tabs
    Demo学习: ClientEvents
  • 原文地址:https://www.cnblogs.com/wzzkaifa/p/6771195.html
Copyright © 2020-2023  润新知