最近实习工作要求将一些txt文本从UTF-16(小端模式,即UTF-16LE)编码转为ISO-8859之类的编码,我能想到的途径有三种:
1)使用IBM的ICU工具包;
2)使用Java的转换方法;
3)自己通过Map表编写程序;
为了节省时间,我最终选择了方法2)。但由于许久没用过Java,而且一开始并不了解Java处理编码的方式,纠结了好久才搞定,下面就简单地阐述一下具体的方法。
在Java中实现编码转换主要用到了两个函数:
- getBytes(charsetName): 按charsetName指定的编码对字符串进行编码,返回该字符串在此编码下的byte数组表示
- new String(byte[] b, encoding):使用encoding指定的编码将byte[]解码成字符串,得到的字符串在Java内部统一以UTF-16表示
始终要注意Java的String在内部统一以UTF-16表示(与源文件采用什么编码无关),所以new String得到的字符串也是UTF-16形式;具体的字节序和目标编码只有在调用getBytes把字符串编码成byte[]时才起作用。
读写文本文件采用的是InputStream和OutputStream的方式,目的是为了对字节流进行读写,这样就可以避免在读写时改变了编码格式,示例代码如下:
import java.io.*;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import java.io.UnsupportedEncodingException;

/**
 * Batch re-encodes text files from one charset to another.
 *
 * <p>Each source file is read as raw bytes, decoded with the source charset,
 * re-encoded with the target charset and written as raw bytes to the
 * destination directory. Working on byte streams keeps the I/O layer from
 * applying any charset conversion of its own.
 */
public class Convertor {
    /**
     * Charset name used to decode the source files.
     * NOTE(review): "Unicode" is presumably resolved by the JDK as an alias of
     * UTF-16 (byte order taken from the BOM) - confirm against the JDK's
     * supported-encodings list.
     */
    public static final String Old_Charset = "Unicode";
    public static final String ISO_8859_6 = "ISO-8859-6";
    public static final String ISO_8859_7 = "ISO-8859-7";
    public static final String ISO_8859_8 = "ISO-8859-8";
    public static final String TIS_620 = "TIS-620";

    /** Upper-case hexadecimal digits used by {@link #PrintBytes(byte[])}. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /** Size of the chunk buffer used when reading a source file. */
    private static final int READ_CHUNK_SIZE = 8192;

    /**
     * Prints the number of charsets available in the running JVM followed by
     * one charset name per line.
     */
    public static void PrintAllSupportedCharset() {
        Set<String> charsetNames = Charset.availableCharsets().keySet();
        System.out.println("-----the number of jdk1.67's charset is " + charsetNames.size() + "-----");
        for (String charsetName : charsetNames) {
            System.out.println(charsetName);
        }
    }

    /**
     * Prints a byte array to standard output as two-digit upper-case hex
     * values, each followed by a space, terminated by a line break.
     *
     * @param b the bytes to dump; must not be null
     */
    public static void PrintBytes(byte[] b) {
        StringBuilder line = new StringBuilder(b.length * 3);
        for (byte value : b) {
            int unsigned = value & 0xFF; // strip sign extension before indexing
            line.append(HEX_DIGITS[unsigned >>> 4])
                .append(HEX_DIGITS[unsigned & 0x0F])
                .append(' ');
        }
        System.out.println(line);
    }

    /**
     * Converts a single file from {@code oldCharset} to {@code newCharset}.
     *
     * <p>The output path is the plain concatenation {@code destDir + fileName},
     * so {@code destDir} must already end with a path separator. Failures are
     * reported on standard error and do not propagate, so that a batch run
     * continues with the next file.
     *
     * @param filePath   path of the source file
     * @param fileName   name of the file to create under {@code destDir}
     * @param destDir    destination directory, including the trailing separator
     * @param oldCharset charset name used to decode the source bytes
     * @param newCharset charset name used to encode the output bytes
     */
    public static void ConvertCharset(String filePath, String fileName, String destDir, String oldCharset, String newCharset) {
        try {
            System.out.println(filePath);
            byte[] sourceBytes = readAllBytes(filePath);
            PrintBytes(sourceBytes);

            // Decode to a Java String, then encode that String in the target charset.
            String text = new String(sourceBytes, oldCharset);
            byte[] convertedBytes = text.getBytes(newCharset);
            PrintBytes(convertedBytes);

            writeAllBytes(destDir + fileName, convertedBytes);
        } catch (Exception e) {
            // Best effort by design: report the failing file and keep going.
            System.err.println("ConvertCharset() failed for " + filePath);
            e.printStackTrace();
        }
    }

    /**
     * Reads the complete content of a file. Loops until end of stream because a
     * single {@code read} call is not guaranteed to fill the buffer and
     * {@code available()} is only an estimate, not the file size.
     */
    private static byte[] readAllBytes(String filePath) throws IOException {
        try (InputStream in = new FileInputStream(filePath)) {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] chunk = new byte[READ_CHUNK_SIZE];
            int count;
            while ((count = in.read(chunk)) != -1) {
                collected.write(chunk, 0, count);
            }
            return collected.toByteArray();
        }
    }

    /** Writes {@code content} to {@code path}, closing the stream even on failure. */
    private static void writeAllBytes(String path, byte[] content) throws IOException {
        try (OutputStream out = new FileOutputStream(path)) {
            out.write(content);
            out.flush();
        }
    }

    /** Prints the path details of {@code source} and converts it into {@code destDir}. */
    private static void describeAndConvert(File source, String destDir, String newCharset) {
        System.out.println("path=" + source.getPath());
        System.out.println("absolutepath=" + source.getAbsolutePath());
        System.out.println("name=" + source.getName());
        ConvertCharset(source.getAbsolutePath(), source.getName(), destDir, Convertor.Old_Charset, newCharset);
    }

    /**
     * Converts one file, or every file found recursively below a directory,
     * from {@link #Old_Charset} to {@code newCharset}. All converted files are
     * written directly into {@code destDir}; the source directory structure is
     * not reproduced.
     *
     * @param filepath   source file or directory
     * @param destDir    destination directory, including the trailing separator
     * @param newCharset charset name used to encode the output
     * @return always {@code true}
     * @throws IOException if the content of a directory cannot be listed
     */
    public static boolean readfile(String filepath, String destDir, String newCharset)
            throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("文件");
                describeAndConvert(file, destDir, newCharset);
            } else {
                System.out.println("文件夹");
                String[] filelist = file.list();
                if (filelist == null) {
                    // list() returns null on an I/O error instead of throwing.
                    throw new IOException("Cannot list directory: " + file.getPath());
                }
                for (String childName : filelist) {
                    // File(parent, child) inserts the platform separator itself.
                    File child = new File(file, childName);
                    if (!child.isDirectory()) {
                        describeAndConvert(child, destDir, newCharset);
                    } else {
                        readfile(child.getPath(), destDir, newCharset);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("readfile() Exception:" + e.getMessage());
        }
        return true;
    }

    /**
     * Converts the sample Thai directory to TIS-620.
     *
     * @param args unused
     */
    public static void main(String args[]) throws FileNotFoundException, IOException {
        //PrintAllSupportedCharset();
        //System.out.println(Charset.defaultCharset());
        String srcPath = "H:\\LYDATA\\四国语言\\Unicode16\\Thai_Unicode\\";
        String destDir = "H:\\LYDATA\\四国语言\\Thai_TIS-620\\";
        readfile(srcPath, destDir, Convertor.TIS_620);
    }
}
几种编码方式对应的Unicode映射表:http://www.lingua-systems.com/knowledge/unicode-mappings/iso-8859-7-to-unicode.html