用到的几个包 bcmail-jdk14-132.jar/bcprov-jdk14-132.jar/checkstyle-all-4.2.jar/FontBox-0.1.0-dev.jar/lucene-core-2.0.0.jar/PDFBox-0.7.3.jar/poi-3.0-alpha3-20061212.jar/poi-contrib-3.0-alpha3-20061212.jar/poi-scratchpad-3.0-alpha3-20061212.jar import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import org.apache.poi.hslf.HSLFSlideShow; import org.apache.poi.hslf.model.Slide; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.usermodel.SlideShow; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.util.PDFTextStripper; public class Test { /** * @param args */ public static void p(Object obj) { System.out.println(obj); } public static void main(String[] args) { try { p(readPpt("src/1.dps")); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } //读取ppt public static String readPpt(String path) throws Exception { StringBuffer content = new StringBuffer(""); try { SlideShow ss = new SlideShow(new HSLFSlideShow(path));// path为文件的全路径名称,建立SlideShow Slide[] slides = ss.getSlides();// 获得每一张幻灯片 for (int i = 0; i < slides.length; i++) { TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun for (int j = 0; j < t.length; j++) { content.append(t[j].getText());// 这里会将文字内容加到content中去 } content.append(slides[i].getTitle()); } } catch (Exception ex) { System.out.println(ex.toString()); } return content.toString().trim(); } // 读取xls public static String readXls(String path) throws Exception { StringBuffer content = new StringBuffer("");// 文档内容 HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(path)); int sheetCount = workbook.getNumberOfSheets();// excel几张表 for (int i = 0; i < sheetCount; i++) {// 遍历excel表 HSSFSheet sheet = workbook.getSheetAt(i);// 对excel的第一个表引用 int rowCount = sheet.getLastRowNum();// 取得最后一行的下标 for (int j = 0; j < rowCount; j++) {// 循环每一行 HSSFRow row = sheet.getRow(j);// 引用行 if (row == null) { continue; } else { short cellNum = row.getLastCellNum(); for (short m = 0; m < cellNum; m++) { HSSFCell cell = row.getCell(m);// 引用行中的一个单元格 if (cell != null) { int cellType = cell.getCellType(); // CELL_TYPE_NUMERIC 0 数字 // CELL_TYPE_STRING 1 字符串 // CELL_TYPE_FORMULA 2 公式 // CELL_TYPE_BLANK 3 空格 // CELL_TYPE_BOOLEAN 4 布尔值 // CELL_TYPE_ERROR 5 错误 switch (cellType) { // 单元格类型为数字 case HSSFCell.CELL_TYPE_NUMERIC: // 取数字单元格的值 double d = cell.getNumericCellValue(); content.append(String.valueOf(d) + " "); break; // 单元格类型为字符串 case HSSFCell.CELL_TYPE_STRING: String str = cell.getStringCellValue().trim(); if (!str.equals("")) { content.append(str + " "); } break; // 单元格类型为公式 case HSSFCell.CELL_TYPE_FORMULA: // 不读取公式 // String formula = cell.getCellFormula(); // content = content + formula+" "; break; // 单元格类型为空白 case HSSFCell.CELL_TYPE_BLANK: break; // 单元格类型为布尔值 case HSSFCell.CELL_TYPE_BOOLEAN: // boolean bool = cell.getBooleanCellValue(); // content = content + bool+" "; break; // 单元格类型为错误 case HSSFCell.CELL_TYPE_ERROR: // byte errorCode = cell.getErrorCellValue(); // content = content + errorCode+" "; break; default: break; } } else { // content = content + "..." +" ";//没有数据的单元格使用...填充 } } } content.append(" "); } } return content.toString().trim(); } // 读取pdf public static String readPdf(String path) throws Exception { StringBuffer content = new StringBuffer("");// 文档内容 FileInputStream fis = new FileInputStream(path); PDFParser p = new PDFParser(fis); p.parse(); PDFTextStripper ts = new PDFTextStripper(); content.append(ts.getText(p.getPDDocument())); fis.close(); return content.toString().trim(); } // 读取word,只能读取文本内容 图片不行 public static String readWord(String path) throws Exception { StringBuffer content = new StringBuffer("");// 文档内容 HWPFDocument doc = new HWPFDocument(new FileInputStream(path)); Range range = doc.getRange(); int paragraphCount = range.numParagraphs();// 段落 for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据 Paragraph pp = range.getParagraph(i); content.append(pp.text()); } return content.toString().trim(); } // 读取text public static String readTxt(String path) { StringBuffer content = new StringBuffer("");// 文档内容 try { FileReader reader = new FileReader(path); BufferedReader br = new BufferedReader(reader); String s1 = null; while ((s1 = br.readLine()) != null) { content.append(s1 + " "); } br.close(); reader.close(); } catch (IOException e) { e.printStackTrace(); } return content.toString().trim(); } }
来自于:https://www.cnblogs.com/candl/p/3592649.html
读取xls表格:https://juejin.im/entry/5a5f03e76fb9a01cb42c643e
读取excel:http://www.voidcn.com/article/p-akhurrpc-bbh.html