poi解析office文档内容的工具类

第一步：引入依赖（注意：下面的工具类还用到了 commons-io 的 FileUtils，如果项目中尚未引入 commons-io，需要一并添加该依赖）

       <!-- xls (Apache POI core) -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>

        <!-- xlsx (Apache POI OOXML) -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>

        <!-- word (Apache POI scratchpad) -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
           <version>3.17</version>
        </dependency>

        <!-- pdf (Apache PDFBox) -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.18</version>
        </dependency>

        <!-- Spring Boot Elasticsearch starter. NOTE(review): no version is given here; presumably it is
             managed by the Spring Boot parent/BOM of the project - confirm. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

第二步：创建解析文档的工具类 ReadFileConverter

package com.atguigu.servicees.util;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;

/**
 * 文件内容读取转换器
 */
public class ReadFileConverter {

    public String getContents(String path) throws Exception {
        String contents = "";
        int index = path.lastIndexOf(".");
        String file_suffix = path.substring(index + 1).toLowerCase();
        if (file_suffix.equalsIgnoreCase("txt") || file_suffix.equalsIgnoreCase("log")) {
            contents = this.readTXT(path);
        } else if (file_suffix.equalsIgnoreCase("xls")) {
            contents = this.readXLS(path);
        } else if (file_suffix.equalsIgnoreCase("xlsx")) {
            contents = this.readXLSX(path);
        } else if (file_suffix.equalsIgnoreCase("doc")) {
            contents = this.readDOC(path);
        } else if (file_suffix.equalsIgnoreCase("docx")) {
            contents = this.readDOCX(path);
        } else if (file_suffix.equalsIgnoreCase("pdf")) {
            contents = this.readPDF(path);
        }
        return contents;
    }


    /**
     * 解析xls文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readXLS(String file) throws Exception {
        StringBuilder content = new StringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));
        try {
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                if (null != workbook.getSheetAt(numSheets)) {
                    HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
                    for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                        if (null != aSheet.getRow(rowNumOfSheet)) {
                            HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                            for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                if (null != aRow.getCell(cellNumOfRow)) {
                                    HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                                    if (this.convertCell(aCell).length() > 0) {
                                        content.append(this.convertCell(aCell));
                                    }
                                }
                                content.append("
");
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            content.append("xls文件格式不对或损坏");
        } finally {
            if (workbook != null) {
                workbook.close();
            }
        }
        return content.toString();
    }


    /**
     * 解析xlsx文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readXLSX(String file) throws Exception {
        StringBuilder content = new StringBuilder();
        XSSFWorkbook workbook = new XSSFWorkbook(file);
        try {
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                if (null != workbook.getSheetAt(numSheets)) {
                    XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
                    for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                        if (null != aSheet.getRow(rowNumOfSheet)) {
                            XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                            for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                if (null != aRow.getCell(cellNumOfRow)) {
                                    XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                                    if (this.convertCell(aCell).length() > 0) {
                                        content.append(this.convertCell(aCell));
                                    }
                                }
                                content.append("
");
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            content.append("xlsx文件格式不对或损坏");
        } finally {
            if (workbook != null) {
                workbook.close();
            }
        }
        return content.toString();
    }


    /**
     * 解析txt文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readTXT(String file) throws Exception {
        String contents = "";
        try {
            String encoding = this.get_charset(new File(file));
            if (encoding.equalsIgnoreCase("GBK")) {
                contents = FileUtils.readFileToString(new File(file), "gbk");
            } else {
                contents = FileUtils.readFileToString(new File(file), "utf8");
            }
        } catch (Exception e) {
            contents = "txt文件格式不对或损坏";
        }
        return contents;
    }

    /**
     * 解析doc文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readDOC(String file) throws Exception {
        String returnStr;
        FileInputStream inputStream = new FileInputStream(new File(file));
        WordExtractor wordExtractor = new WordExtractor(inputStream);
        try {
            returnStr = wordExtractor.getText();
        } catch (Exception e) {
            returnStr = "doc文件格式不对或损坏";
        } finally {
            if (inputStream != null) {
                inputStream.close();
            }
        }
        return returnStr;
    }


    /**
     * 解析docx文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readDOCX(String file) throws Exception {
        String docx;
        XWPFWordExtractor xwp = new XWPFWordExtractor(POIXMLDocument.openPackage(file));
        try {
            docx = xwp.getText();
        } catch (Exception e) {
            docx = "docx文件格式不对或损坏";
        } finally {
            if (xwp != null) {
                xwp.close();
            }
        }
        return docx;
    }


    /**
     * 解析pdf文件内容
     * @param file
     * @return
     * @throws Exception
     */
    public String readPDF(String file) throws Exception {
        String result = null;
        FileInputStream is = null;
        PDDocument document = null;
        try {
            is = new FileInputStream(file);
            document = PDDocument.load(is);
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(document);
        } catch (Exception e) {
            result = "pdf文件格式不对或损坏";
        } finally {
            if (is != null) {
                is.close();
            }
            if (document != null) {
                document.close();
            }
        }
        return result;
    }

    private String get_charset(File file) throws IOException {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        BufferedInputStream bis = null;
        try {
            boolean checked = false;
            bis = new BufferedInputStream(new FileInputStream(file));
            bis.mark(0);
            int read = bis.read(first3Bytes, 0, 3);
            if (read == -1)
                return charset;
            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                charset = "UTF-16LE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                charset = "UTF-16BE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) {
                charset = "UTF-8";
                checked = true;
            }
            bis.reset();
            if (!checked) {
                int loc = 0;
                while ((read = bis.read()) != -1) {
                    loc = loc + 1;
                    if (read >= 0xF0)
                        break;
                    if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的，也算是GBK
                        break;
                    if (0xC0 <= read && read <= 0xDF) {
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
                            continue;
                        else
                            break;
                    } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错，但是几率较小
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) {
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                charset = "UTF-8";
                                break;
                            } else
                                break;
                        } else
                            break;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
        return charset;
    }

    /**
     * 解析excel需要使用的工具类
     * @param cell
     * @return
     */
    private String convertCell(Cell cell) {
        NumberFormat formater = NumberFormat.getInstance();
        formater.setGroupingUsed(false);
        String cellValue = "";
        if (cell == null) {
            return cellValue;
        }
        switch (cell.getCellTypeEnum()) {
            case NUMERIC:
                cellValue = formater.format(cell.getNumericCellValue());
                break;
            case STRING:
                cellValue = cell.getStringCellValue();
                break;
            case BLANK:
                cellValue = cell.getStringCellValue();
                break;
            case BOOLEAN:
                cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
                break;
            case ERROR:
                cellValue = String.valueOf(cell.getErrorCellValue());
                break;
            default:
                cellValue = "";
        }
        return cellValue.trim();
    }

}

小蘑菇

相关阅读:
Python修饰器，控制授权，通过ini配置文件使用密钥给函数限制试用期和过期后试用次数
 excel vba 自定义函数使用正则表达式提取字符串
 python 值比较判断，np.nan is np.nan 却 np.nan != np.nan ,pandas 单个数据框/单元格值判断nan
python 读取中文CSV 'gbk' codec can't decode bytes in position 2-3：illegal multibyte sequence
python ipython [Errno 22] invalid mode ('rb') or filename 、IDE工作路径
 windows下 python 添加PYTHONPATH 环境变量
 pandas（python2）读取中文数据，处理中文列名
 qq邮箱微信提醒不通知
 python :import error
python推荐淘宝物美价廉商品 2.0
原文地址：https://www.cnblogs.com/wang66a/p/13515746.html