不需要考虑文件格式,使用 Tika 包即可。
package com.geni_sage.gdme.core.dataReader; import java.io.*; import java.util.Arrays; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.tika.Tika; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; public class TikaManager { private Metadata metadata; private String content; private boolean isRepalceBlank = false; public TikaManager(File file, boolean isReplaceBlank) throws Exception { metadata = new Metadata(); TikaInputStream stream = TikaInputStream.get(file, metadata); try { Tika tika = new Tika(); tika.setMaxStringLength(Integer.MAX_VALUE); content = tika.parseToString(stream, metadata); } finally { stream.close(); } this.isRepalceBlank = isReplaceBlank; } public String getContent() { if (isRepalceBlank) { return replaceBlank(content); } else { return content; } } public Metadata getMetadata() { return metadata; } public String getMetadataString() throws Exception { return metadataToString(); } private String metadataToString() throws Exception { StringBuilder metadataBuffer = new StringBuilder(); String[] names = metadata.names(); Arrays.sort(names); for (String name : names) { metadataBuffer.append(name); metadataBuffer.append(": "); metadataBuffer.append(metadata.get(name)); metadataBuffer.append("\n"); } return metadataBuffer.toString(); } private String replaceBlank(String str) { String dest = ""; if (str != null) { // Pattern p = Pattern.compile("\\s*|\t|\r|\n"); Pattern p = Pattern.compile("\n"); Matcher m = p.matcher(str); dest = m.replaceAll(""); } return dest; } }