文件内容读取工具类,亲测可用
maven依赖:
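<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>2.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-collections4</artifactId>
    <version>4.1</version>
</dependency>
<!-- 工具类读取 PDF 时用到 org.apache.pdfbox(2.0.x 的 API) -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.15</version>
</dependency>
<!-- 工具类读取 txt 时用到 org.apache.commons.io.FileUtils -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>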
工具类:
import org.apache.commons.io.FileUtils; import org.apache.pdfbox.io.RandomAccessBuffer; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.xmlbeans.XmlException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; /** * @author wangshuaijun * @description * 读取文件工具类:支持以下文件内容读取 * 1. word(.doc),word(.docx) * 2. excel(.xls),excel(xlsx) * 3. pdf * 4. txt * 5. ppt(.ppt),pptx(,pptx) * @date 2019年4月19日10:52:45 * */ public class ReadFileUtils { /** * 根据文件类型返回文件内容 * @param filepath * @return * @throws IOException */ public static String getContentByPath(String filepath) throws IOException{ String []fileTypeArr=filepath.split( "\." 
); String fileType=fileTypeArr[fileTypeArr.length-1]; if("doc".equals( fileType ) || "docx".equals( fileType )){ return readWord( filepath,fileType ); }else if("xlsx".equals( fileType ) || "xls".equals( fileType )){ return readExcel( fileType,filepath ); }else if("txt".equals( fileType )){ return readTxt(filepath); }else if("pdf".equals( fileType )){ return readPdf(filepath); }else if("ppt".equals( fileType ) || "pptx".equals( fileType )){ return readPPT(fileType,filepath); }else{ System.out.println("不支持的文件类型!"); } return ""; } /** * 读取PDF中的内容 * @param filePath * @return */ public static String readPdf(String filePath){ FileInputStream fileInputStream=null; PDDocument pdDocument=null; String content=""; try { //创建输入流对象 fileInputStream = new FileInputStream(filePath); //创建解析器对象 PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(fileInputStream)); pdfParser.parse(); //pdf文档 pdDocument = pdfParser.getPDDocument(); //pdf文本操作对象,使用该对象可以获取所读取pdf的一些信息 PDFTextStripper pdfTextStripper = new PDFTextStripper(); content = pdfTextStripper.getText(pdDocument); }catch(IOException e){ e.printStackTrace(); }finally{ try { //PDDocument对象时使用完后必须要关闭 if(null!=pdDocument){ pdDocument.close(); } if(null!=fileInputStream){ fileInputStream.close(); } }catch (IOException e){ e.printStackTrace(); } } return content; } /** * 读取Excel中的内容 * @param filePath * @return * @throws IOException */ private static String readTxt(String filePath) throws IOException{ File f = new File(filePath); return FileUtils.readFileToString( f,"GBK" ); } /** * 读取Excel中的内容 * @param filePath * @return */ private static String readExcel(String fileType,String filePath){ try { File excel = new File(filePath); if (excel.isFile() && excel.exists()) { //判断文件是否存在 Workbook wb; //根据文件后缀(xls/xlsx)进行判断 if ( "xls".equals(fileType)){ FileInputStream fis = new FileInputStream(excel); //文件流对象 wb = new HSSFWorkbook(fis); }else if ("xlsx".equals(fileType)){ wb = new XSSFWorkbook(excel); }else { System.out.println("文件类型错误!"); 
return ""; } //开始解析,获取页签数 StringBuffer sb=new StringBuffer(""); for(int i=0;i<wb.getNumberOfSheets();i++){ Sheet sheet = wb.getSheetAt(i); //读取sheet sb.append( sheet.getSheetName() +"_"); int firstRowIndex = sheet.getFirstRowNum()+1; //第一行是列名,所以不读 int lastRowIndex = sheet.getLastRowNum(); for(int rIndex = firstRowIndex; rIndex <= lastRowIndex; rIndex++) { //遍历行 Row row = sheet.getRow(rIndex); if (row != null) { int firstCellIndex = row.getFirstCellNum(); int lastCellIndex = row.getLastCellNum(); for (int cIndex = firstCellIndex; cIndex < lastCellIndex; cIndex++) { //遍历列 Cell cell = row.getCell(cIndex); if (cell != null) { sb.append( cell.toString()); } } } } } return sb.toString(); } else { System.out.println("找不到指定的文件"); } } catch (Exception e) { e.printStackTrace(); } return ""; } /** * 读取word中的内容 * @param path * @param fileType * @return */ public static String readWord(String path,String fileType) { String buffer = ""; try { if ("doc".equals( fileType )) { InputStream is = new FileInputStream(new File(path)); WordExtractor ex = new WordExtractor(is); buffer = ex.getText(); ex.close(); } else if ("docx".equals( fileType )) { OPCPackage opcPackage = POIXMLDocument.openPackage(path); POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); buffer = extractor.getText(); extractor.close(); } else { System.out.println("此文件不是word文件!"); } } catch (Exception e) { e.printStackTrace(); } return buffer; } private static String readPPT(String fileType,String filePath) { try { if("ppt".equals( fileType )){ PowerPointExtractor extractor=new PowerPointExtractor(new FileInputStream( new File( filePath ))); return extractor.getText(); } else if("pptx".equals( fileType )){ return new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath)).getText(); } }catch (IOException e){ e.fillInStackTrace(); }catch(XmlException e){ e.getMessage(); }catch(OpenXML4JException e){ e.getMessage(); } return ""; }