• 工具类_JavaPOI_Office文件内容读取


    文件内容读取工具类,亲测可用

    maven依赖:

    <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi</artifactId>
                <version>3.16</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.16</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-scratchpad</artifactId>
                <version>3.16</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.xmlbeans</groupId>
                <artifactId>xmlbeans</artifactId>
                <version>2.6.0</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-collections4</artifactId>
                <version>4.1</version>
            </dependency>

    工具类:

    import org.apache.commons.io.FileUtils;
    import org.apache.pdfbox.io.RandomAccessBuffer;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.POIXMLTextExtractor;
    import org.apache.poi.hslf.extractor.PowerPointExtractor;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.ss.usermodel.Cell;
    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;
    import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import org.apache.xmlbeans.XmlException;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    
    /**
     * @author wangshuaijun
     * @description
     * 读取文件工具类:支持以下文件内容读取
     * 1. word(.doc),word(.docx)
     * 2. excel(.xls),excel(xlsx)
     * 3. pdf
     * 4. txt
     * 5. ppt(.ppt),pptx(,pptx)
     * @date 2019年4月19日10:52:45
     *
     */
    public class ReadFileUtils {
    
    
        /**
         * 根据文件类型返回文件内容
         * @param filepath
         * @return
         * @throws IOException
         */
        public static String getContentByPath(String filepath) throws IOException{
            String []fileTypeArr=filepath.split( "\." );
            String fileType=fileTypeArr[fileTypeArr.length-1];
            if("doc".equals( fileType ) || "docx".equals( fileType )){
                return readWord( filepath,fileType );
            }else if("xlsx".equals( fileType ) || "xls".equals( fileType )){
                return  readExcel( fileType,filepath );
            }else if("txt".equals( fileType )){
                return readTxt(filepath);
            }else if("pdf".equals( fileType )){
                return readPdf(filepath);
            }else if("ppt".equals( fileType ) || "pptx".equals( fileType )){
                return readPPT(fileType,filepath);
            }else{
                System.out.println("不支持的文件类型!");
            }
            return "";
        }
    
        /**
         * 读取PDF中的内容
         * @param filePath
         * @return
         */
        public static String readPdf(String filePath){
            FileInputStream fileInputStream=null;
            PDDocument pdDocument=null;
            String content="";
            try {
                //创建输入流对象
                fileInputStream = new FileInputStream(filePath);
                //创建解析器对象
                PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(fileInputStream));
                pdfParser.parse();
                //pdf文档
                pdDocument = pdfParser.getPDDocument();
                //pdf文本操作对象,使用该对象可以获取所读取pdf的一些信息
                PDFTextStripper pdfTextStripper = new PDFTextStripper();
                content = pdfTextStripper.getText(pdDocument);
            }catch(IOException e){
                e.printStackTrace();
            }finally{
                try {
                    //PDDocument对象时使用完后必须要关闭
                    if(null!=pdDocument){
                        pdDocument.close();
                    }
                    if(null!=fileInputStream){
                        fileInputStream.close();
                    }
                }catch (IOException e){
                    e.printStackTrace();
                }
            }
            return content;
        }
    
        /**
         * 读取Excel中的内容
         * @param filePath
         * @return
         * @throws IOException
         */
        private static String readTxt(String filePath) throws IOException{
            File f = new File(filePath);
            return FileUtils.readFileToString( f,"GBK" );
        }
    
        /**
         * 读取Excel中的内容
         * @param filePath
         * @return
         */
        private static String readExcel(String fileType,String filePath){
    
            try {
                File excel = new File(filePath);
                if (excel.isFile() && excel.exists()) {   //判断文件是否存在
                    Workbook wb;
                    //根据文件后缀(xls/xlsx)进行判断
                    if ( "xls".equals(fileType)){
                        FileInputStream fis = new FileInputStream(excel);   //文件流对象
                        wb = new HSSFWorkbook(fis);
                    }else if ("xlsx".equals(fileType)){
                        wb = new XSSFWorkbook(excel);
                    }else {
                        System.out.println("文件类型错误!");
                        return "";
                    }
                    //开始解析,获取页签数
                    StringBuffer sb=new StringBuffer("");
                    for(int i=0;i<wb.getNumberOfSheets();i++){
                        Sheet sheet = wb.getSheetAt(i);     //读取sheet
                        sb.append( sheet.getSheetName() +"_");
                        int firstRowIndex = sheet.getFirstRowNum()+1;   //第一行是列名,所以不读
                        int lastRowIndex = sheet.getLastRowNum();
                        for(int rIndex = firstRowIndex; rIndex <= lastRowIndex; rIndex++) {   //遍历行
                            Row row = sheet.getRow(rIndex);
                            if (row != null) {
                                int firstCellIndex = row.getFirstCellNum();
                                int lastCellIndex = row.getLastCellNum();
                                for (int cIndex = firstCellIndex; cIndex < lastCellIndex; cIndex++) {   //遍历列
                                    Cell cell = row.getCell(cIndex);
                                    if (cell != null) {
                                        sb.append( cell.toString());
                                    }
                                }
                            }
                        }
                    }
                    return sb.toString();
                } else {
                    System.out.println("找不到指定的文件");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return "";
        }
    
        /**
         * 读取word中的内容
         * @param path
         * @param fileType
         * @return
         */
        public static String readWord(String path,String fileType) {
            String buffer = "";
            try {
                if ("doc".equals( fileType )) {
                    InputStream is = new FileInputStream(new File(path));
                    WordExtractor ex = new WordExtractor(is);
                    buffer = ex.getText();
                    ex.close();
                } else if ("docx".equals( fileType )) {
                    OPCPackage opcPackage = POIXMLDocument.openPackage(path);
                    POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                    buffer = extractor.getText();
                    extractor.close();
    
                } else {
                    System.out.println("此文件不是word文件!");
                }
    
            } catch (Exception e) {
                e.printStackTrace();
            }
    
            return buffer;
        }
    
        private static String readPPT(String fileType,String filePath)  {
            try {
                if("ppt".equals( fileType )){
                    PowerPointExtractor extractor=new PowerPointExtractor(new FileInputStream( new File( filePath )));
                    return extractor.getText();
                } else if("pptx".equals( fileType )){
                    return new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath)).getText();
                }
            }catch (IOException e){
                e.fillInStackTrace();
            }catch(XmlException e){
                e.getMessage();
            }catch(OpenXML4JException e){
                e.getMessage();
            }
    
            return "";
        }
  • 相关阅读:
    JS定时循环
    JS分组
    中位数 题解
    NOIP2017 D2T3 题解
    CF949E Binary Cards 题解
    友善的树形DP
    300英雄的危机(heroes)
    [北京省选集训2019]图的难题 题解
    洛谷 P1268 树的重量 题解
    洛谷 P2633 Count on a tree 题解
  • 原文地址:https://www.cnblogs.com/10158wsj/p/11063488.html
Copyright © 2020-2023  润新知