第一步引入依赖
<!--xls--> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.17</version> </dependency> <!--xlsx--> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.17</version> </dependency> <!--word--> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.17</version> </dependency> <!--pdf--> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.18</version> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-elasticsearch</artifactId> </dependency>
第二步创建解析文档的工具类ReadFileConverter
package com.atguigu.servicees.util; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.POIXMLDocument; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.text.NumberFormat; /** * 文件内容读取转换器 */ public class ReadFileConverter { public String getContents(String path) throws Exception { String contents = ""; int index = path.lastIndexOf("."); String file_suffix = path.substring(index + 1).toLowerCase(); if (file_suffix.equalsIgnoreCase("txt") || file_suffix.equalsIgnoreCase("log")) { contents = this.readTXT(path); } else if (file_suffix.equalsIgnoreCase("xls")) { contents = this.readXLS(path); } else if (file_suffix.equalsIgnoreCase("xlsx")) { contents = this.readXLSX(path); } else if (file_suffix.equalsIgnoreCase("doc")) { contents = this.readDOC(path); } else if (file_suffix.equalsIgnoreCase("docx")) { contents = this.readDOCX(path); } else if (file_suffix.equalsIgnoreCase("pdf")) { contents = this.readPDF(path); } return contents; } /** * 解析xls文件内容 * @param file * @return * @throws Exception */ public String readXLS(String file) throws Exception { StringBuilder content = new StringBuilder(); HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file)); try { for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { if (null != 
workbook.getSheetAt(numSheets)) { HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null != aSheet.getRow(rowNumOfSheet)) { HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行 for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null != aRow.getCell(cellNumOfRow)) { HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值 if (this.convertCell(aCell).length() > 0) { content.append(this.convertCell(aCell)); } } content.append(" "); } } } } } } catch (Exception e) { content.append("xls文件格式不对或损坏"); } finally { if (workbook != null) { workbook.close(); } } return content.toString(); } /** * 解析xlsx文件内容 * @param file * @return * @throws Exception */ public String readXLSX(String file) throws Exception { StringBuilder content = new StringBuilder(); XSSFWorkbook workbook = new XSSFWorkbook(file); try { for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { if (null != workbook.getSheetAt(numSheets)) { XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null != aSheet.getRow(rowNumOfSheet)) { XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行 for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null != aRow.getCell(cellNumOfRow)) { XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值 if (this.convertCell(aCell).length() > 0) { content.append(this.convertCell(aCell)); } } content.append(" "); } } } } } } catch (Exception e) { content.append("xlsx文件格式不对或损坏"); } finally { if (workbook != null) { workbook.close(); } } return content.toString(); } /** * 解析txt文件内容 * @param file * @return * @throws Exception */ public String readTXT(String file) throws Exception { String contents = ""; try { String encoding = this.get_charset(new File(file)); if (encoding.equalsIgnoreCase("GBK")) 
{ contents = FileUtils.readFileToString(new File(file), "gbk"); } else { contents = FileUtils.readFileToString(new File(file), "utf8"); } } catch (Exception e) { contents = "txt文件格式不对或损坏"; } return contents; } /** * 解析doc文件内容 * @param file * @return * @throws Exception */ public String readDOC(String file) throws Exception { String returnStr; FileInputStream inputStream = new FileInputStream(new File(file)); WordExtractor wordExtractor = new WordExtractor(inputStream); try { returnStr = wordExtractor.getText(); } catch (Exception e) { returnStr = "doc文件格式不对或损坏"; } finally { if (inputStream != null) { inputStream.close(); } } return returnStr; } /** * 解析docx文件内容 * @param file * @return * @throws Exception */ public String readDOCX(String file) throws Exception { String docx; XWPFWordExtractor xwp = new XWPFWordExtractor(POIXMLDocument.openPackage(file)); try { docx = xwp.getText(); } catch (Exception e) { docx = "docx文件格式不对或损坏"; } finally { if (xwp != null) { xwp.close(); } } return docx; } /** * 解析pdf文件内容 * @param file * @return * @throws Exception */ public String readPDF(String file) throws Exception { String result = null; FileInputStream is = null; PDDocument document = null; try { is = new FileInputStream(file); document = PDDocument.load(is); PDFTextStripper stripper = new PDFTextStripper(); result = stripper.getText(document); } catch (Exception e) { result = "pdf文件格式不对或损坏"; } finally { if (is != null) { is.close(); } if (document != null) { document.close(); } } return result; } private String get_charset(File file) throws IOException { String charset = "GBK"; byte[] first3Bytes = new byte[3]; BufferedInputStream bis = null; try { boolean checked = false; bis = new BufferedInputStream(new FileInputStream(file)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) return charset; if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; checked = true; } else if (first3Bytes[0] == (byte) 0xFE && 
first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc = loc + 1; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF) continue; else break; } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } } } catch (Exception e) { e.printStackTrace(); } finally { if (bis != null) { bis.close(); } } return charset; } /** * 解析excel需要使用的工具类 * @param cell * @return */ private String convertCell(Cell cell) { NumberFormat formater = NumberFormat.getInstance(); formater.setGroupingUsed(false); String cellValue = ""; if (cell == null) { return cellValue; } switch (cell.getCellTypeEnum()) { case NUMERIC: cellValue = formater.format(cell.getNumericCellValue()); break; case STRING: cellValue = cell.getStringCellValue(); break; case BLANK: cellValue = cell.getStringCellValue(); break; case BOOLEAN: cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString(); break; case ERROR: cellValue = String.valueOf(cell.getErrorCellValue()); break; default: cellValue = ""; } return cellValue.trim(); } }