最近在用Dom4j解析XML文件,遇到了一些问题,记录如下:
1. BOM头问题,得到的异常是:
Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.
(1)解决办法参见 http://koti.mbnet.fi/akini/java/unicodereader/ ,其中提供了两个用于识别并跳过BOM头的类(我用的是第一个):
UnicodeInputStream
/**
 version: 1.1 / 2007-01-25
 - changed BOM recognition ordering (longer boms first)

 Original pseudocode   : Thomas Weidenfeller
 Implementation tweaked: Aki Nieminen

 http://www.unicode.org/unicode/faq/utf_bom.html
 BOMs in byte length ordering:
   00 00 FE FF    = UTF-32, big-endian
   FF FE 00 00    = UTF-32, little-endian
   EF BB BF       = UTF-8,
   FE FF          = UTF-16, big-endian
   FF FE          = UTF-16, little-endian

 Win2k Notepad:
   Unicode format = UTF-16LE

 This class will help you to autorecognize and skip BOMs. This will support UTF-8 as well.
***/

import java.io.*;

/**
 * This inputstream will recognize unicode BOM marks
 * and will skip bytes if getEncoding() method is called
 * before any of the read(...) methods.
 *
 * Usage pattern:
 *   String enc = "ISO-8859-1"; // or NULL to use systemdefault
 *   FileInputStream fis = new FileInputStream(file);
 *   UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 *   enc = uin.getEncoding(); // check and skip possible BOM bytes
 *   InputStreamReader in;
 *   if (enc == null) in = new InputStreamReader(uin);
 *   else in = new InputStreamReader(uin, enc);
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    private static final int BOM_SIZE = 4;

    /**
     * @param in         stream to read from
     * @param defaultEnc encoding reported by {@link #getEncoding()} when no
     *                   BOM is found; may be null
     */
    UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /** Returns the default encoding given to the constructor. */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Detects and skips a leading BOM (unless the stream was already read
     * from or closed) and returns the resulting encoding name.
     *
     * @return the encoding named by the BOM, or the default encoding if no
     *         BOM was recognized; null if BOM detection never ran
     * @throws IllegalStateException if reading the BOM bytes fails; the
     *         underlying IOException is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                IllegalStateException ise = new IllegalStateException("Init method failed.");
                // Attach the real failure. Passing the exception to itself
                // would make initCause() throw IllegalArgumentException.
                ise.initCause(ex);
                throw ise;
            }
        }
        return encoding;
    }

    /**
     * Read-ahead four bytes and check for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped.
     *
     * @throws IOException if reading from or unreading to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) return;

        byte[] bom = new byte[BOM_SIZE];

        // Fill the look-ahead buffer; a single read() may legally return
        // fewer bytes than requested even when more are available.
        int n = 0;
        while (n < bom.length) {
            int r = internalIn.read(bom, n, bom.length - n);
            if (r < 0) break;
            n += r;
        }

        // Each pattern is only tested when that many bytes were actually
        // read; otherwise the zero padding of the buffer could complete a
        // pattern (e.g. a 2-byte stream FF FE would look like FF FE 00 00).
        int bomLength;
        if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Push back everything that was read past the BOM.
        if (n > bomLength) internalIn.unread(bom, bomLength, n - bomLength);

        isInited = true;
    }

    /** Closes the underlying stream; BOM detection is not performed afterwards. */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. If getEncoding() was not called first, no BOM is
     * skipped and later calls to getEncoding() will not inspect the stream.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read delegating to the underlying stream, so readers wrapping this
     * stream do not fall back to InputStream's byte-at-a-time default.
     * Same BOM semantics as {@link #read()}.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}
UnicodeReader
/**
 version: 1.1 / 2007-01-25
 - changed BOM recognition ordering (longer boms first)

 Original pseudocode   : Thomas Weidenfeller
 Implementation tweaked: Aki Nieminen

 http://www.unicode.org/unicode/faq/utf_bom.html
 BOMs:
   00 00 FE FF    = UTF-32, big-endian
   FF FE 00 00    = UTF-32, little-endian
   EF BB BF       = UTF-8,
   FE FF          = UTF-16, big-endian
   FF FE          = UTF-16, little-endian

 Win2k Notepad:
   Unicode format = UTF-16LE

 This class will do everything ever more transparently. Just instantiate it and read text.
***/

import java.io.*;

/**
 * Generic unicode textreader, which will use BOM mark
 * to identify the encoding to be used. If BOM is not found
 * then use a given default or system encoding.
 */
public class UnicodeReader extends Reader {
    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;
    String defaultEnc;

    private static final int BOM_SIZE = 4;

    /**
     * @param in         inputstream to be read
     * @param defaultEnc default encoding if stream does not have
     *                   BOM marker. Give NULL to use system-level default.
     */
    UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /** Returns the default encoding given to the constructor. */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized.
     * Call init() or read() method to initialize it.
     * The name is whatever the wrapped InputStreamReader reports.
     */
    public String getEncoding() {
        if (internalIn2 == null) return null;
        return internalIn2.getEncoding();
    }

    /**
     * Read-ahead four bytes and check for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped. Afterwards the
     * decoding reader is created with the detected (or default) encoding.
     *
     * @throws IOException if reading the stream fails or the encoding is
     *         not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) return;

        byte[] bom = new byte[BOM_SIZE];

        // Fill the look-ahead buffer; a single read() may legally return
        // fewer bytes than requested even when more are available.
        int n = 0;
        while (n < bom.length) {
            int r = internalIn.read(bom, n, bom.length - n);
            if (r < 0) break;
            n += r;
        }

        // Each pattern is only tested when that many bytes were actually
        // read; otherwise the zero padding of the buffer could complete a
        // pattern (e.g. a 2-byte stream FF FE would look like FF FE 00 00).
        String encoding;
        int bomLength;
        if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Push back everything that was read past the BOM.
        if (n > bomLength) internalIn.unread(bom, bomLength, n - bomLength);

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Closes the reader and the underlying stream. When nothing has been
     * read yet the underlying stream is closed directly, without reading
     * BOM bytes or creating a decoder first.
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 == null) {
            internalIn.close();
        } else {
            internalIn2.close();
        }
    }

    /** Reads decoded characters, running BOM detection on first use. */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}
(2)除了BOM头之外,文件的实际编码与XML声明中的编码不一致时,也会产生同样的“Content is not allowed in prolog”异常,下面的代码演示了这一点:
ContentNotAllowedInProlog
import java.io.*;
import java.nio.charset.Charset;
import javax.xml.parsers.*;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Demonstrates what a SAX parser does when the bytes of an XML document are
 * encoded in one charset while the XML declaration names another one.
 */
public class ContentNotAllowedInProlog {

    /** Parses the stream with a default SAX parser and a no-op handler. */
    private static void parse(InputStream stream) throws SAXException,
            ParserConfigurationException, IOException {
        SAXParserFactory.newInstance().newSAXParser().parse(stream, new DefaultHandler());
    }

    /**
     * For every ordered pair of distinct encodings, encodes a tiny document
     * with the "actual" charset while declaring the other one, parses it and
     * prints either the parser's error message or a "HIDDEN ERROR!" line when
     * the mismatch went undetected.
     */
    public static void main(String[] args) {
        String[] encodings = { "UTF-8", "UTF-16", "ISO-8859-1" };
        for (String actual : encodings) {
            for (String declared : encodings) {
                // Compare by value; identity comparison only worked because
                // both loops happen to iterate over the same array.
                if (!actual.equals(declared)) {
                    String xml = "<?xml version='1.0' encoding='" + declared + "'?><x/>";
                    byte[] encoded = xml.getBytes(Charset.forName(actual));
                    try {
                        parse(new ByteArrayInputStream(encoded));
                        System.out.println("HIDDEN ERROR! actual:" + actual + " " + xml);
                    } catch (Exception e) {
                        System.out.println(e.getMessage() + " actual:" + actual + " xml:" + xml);
                    }
                }
            }
        }
    }
}
2. <!-- -->注释问题,得到的异常是:
Nested exception: org.xml.sax.SAXParseException: The string "--" is not permitted within comments.
产生这个异常的原因主要有两个:一是<!-- -->中间出现了“--”字符;二是注释内容以“-”结尾,即“-->”前面紧挨着一个“-”,如“abc--->”会抛出异常,而“abc -->”则不会。
我的解决方式就是:删掉所有注释!
针对这两个问题,写了个简单工具类:
XmlUtil
1 import java.io.BufferedReader; 2 import java.io.FileInputStream; 3 import java.io.FileNotFoundException; 4 import java.io.FileOutputStream; 5 import java.io.IOException; 6 import java.io.InputStream; 7 import java.io.InputStreamReader; 8 import java.io.OutputStreamWriter; 9 import java.io.PushbackInputStream; 10 import java.io.UnsupportedEncodingException; 11 import java.util.HashMap; 12 import java.util.List; 13 import org.dom4j.DocumentHelper; 14 import org.dom4j.Element; 15 import org.dom4j.XPath; 16 import org.slf4j.Logger; 17 import org.slf4j.LoggerFactory; 18 19 public class XmlUtil extends InputStream { 20 private static final Logger log = (Logger) LoggerFactory 21 .getLogger(XmlUtil.class); 22 private static final int BOM_SIZE = 4; 23 PushbackInputStream internalIn; 24 boolean isInited = false; 25 String defaultEnc; 26 String encoding; 27 28 public XmlUtil(InputStream in, String defaultEnc) { 29 internalIn = new PushbackInputStream(in, BOM_SIZE); 30 this.defaultEnc = defaultEnc; 31 } 32 33 public String getDefaultEncoding() { 34 return defaultEnc; 35 } 36 37 /** 38 * Read-ahead four bytes and check for BOM marks. Extra bytes are unread 39 * back to the stream, only BOM bytes are skipped. 
40 */ 41 protected void initXmlBOM() throws IOException { 42 if (isInited) 43 return; 44 45 byte bom[] = new byte[BOM_SIZE]; 46 int n, unread; 47 n = internalIn.read(bom, 0, bom.length); 48 49 if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) 50 && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { 51 encoding = "UTF-32BE"; 52 unread = n - 4; 53 } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) 54 && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { 55 encoding = "UTF-32LE"; 56 unread = n - 4; 57 } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) 58 && (bom[2] == (byte) 0xBF)) { 59 encoding = "UTF-8"; 60 unread = n - 3; 61 } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { 62 encoding = "UTF-16BE"; 63 unread = n - 2; 64 } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { 65 encoding = "UTF-16LE"; 66 unread = n - 2; 67 } else { 68 // Unicode BOM mark not found, unread all bytes 69 encoding = defaultEnc; 70 unread = n; 71 } 72 // log.info("read=" + n + ", unread=" + unread); 73 74 if (unread > 0) 75 internalIn.unread(bom, (n - unread), unread); 76 77 isInited = true; 78 } 79 80 public String getEncoding() { 81 if (!isInited) { 82 try { 83 initXmlBOM(); 84 } catch (IOException ex) { 85 IllegalStateException ise = new IllegalStateException( 86 "Init method failed."); 87 ise.initCause(ise); 88 throw ise; 89 } 90 } 91 return encoding; 92 } 93 94 public static void removeXmlBomAndComment(String filePath) { 95 XmlUtil uins = null; 96 BufferedReader bufr = null; 97 OutputStreamWriter osw = null; 98 String enc = "ISO-8859-1"; 99 100 String fileContent = ""; 101 String leftBracket = "<!--"; 102 String rightBracket = "-->"; 103 int leftBracketIndex = 0; 104 int rightBracketIndex = 0; 105 106 String line = ""; 107 StringBuffer fileContentBuffer = new StringBuffer(); 108 try { 109 // 根据BOM Mark编码方式,对文件进行重新编码 110 uins = new XmlUtil(new FileInputStream(filePath), enc); 111 enc = uins.getEncoding(); 112 113 if (enc == 
null) { 114 bufr = new BufferedReader(new InputStreamReader(uins)); 115 } else { 116 bufr = new BufferedReader(new InputStreamReader(uins, enc)); 117 } 118 119 while ((line = bufr.readLine()) != null) { 120 fileContentBuffer.append(line); 121 } 122 uins.close(); 123 bufr.close(); 124 125 // 删除"<!-- -->"格式的注释 126 fileContent = fileContentBuffer.toString(); 127 leftBracketIndex = fileContent.indexOf(leftBracket); 128 rightBracketIndex = fileContent.indexOf(rightBracket); 129 while (leftBracketIndex < rightBracketIndex 130 && rightBracketIndex != 0) { 131 fileContent = fileContent.substring(0, leftBracketIndex) 132 + fileContent.substring(rightBracketIndex + 3, 133 fileContent.length()); 134 leftBracketIndex = fileContent.indexOf(leftBracket); 135 rightBracketIndex = fileContent.indexOf(rightBracket); 136 } 137 138 // 将处理过的内容,写入文件 139 osw = new OutputStreamWriter(new FileOutputStream(filePath)); 140 osw.write(fileContent); 141 osw.flush(); 142 osw.close(); 143 144 } catch (FileNotFoundException e) { 145 e.printStackTrace(); 146 } catch (UnsupportedEncodingException e) { 147 e.printStackTrace(); 148 } catch (IOException e) { 149 e.printStackTrace(); 150 } finally { 151 if (uins != null) { 152 try { 153 uins.close(); 154 } catch (IOException e) { 155 e.printStackTrace(); 156 } 157 } 158 159 if (bufr != null) { 160 try { 161 bufr.close(); 162 } catch (IOException e) { 163 e.printStackTrace(); 164 } 165 } 166 167 if (osw != null) { 168 try { 169 osw.close(); 170 } catch (IOException e) { 171 e.printStackTrace(); 172 } 173 } 174 } 175 } 176 177 /** 178 * 如果根元素有声明命名空间,通过xpath匹配子元素时,需要特殊处理。 179 * */ 180 public static List<Element> getNameSpaceElement(Element root, String node) { 181 // 获得节点的命名空间 182 HashMap<String, String> map = new HashMap<String, String>(); 183 map.put("mvn", root.getNamespaceURI()); 184 XPath xpath = DocumentHelper.createXPath("//mvn:" + node); 185 xpath.setNamespaceURIs(map); 186 187 @SuppressWarnings("unchecked") 188 List<Element> selectedNodes = 
(List<Element>) xpath.selectNodes(root 189 .getDocument()); 190 return selectedNodes; 191 } 192 193 @Override 194 public void close() throws IOException { 195 // init(); 196 isInited = true; 197 internalIn.close(); 198 } 199 200 @Override 201 public int read() throws IOException { 202 // init(); 203 isInited = true; 204 return internalIn.read(); 205 } 206 }