• poi解析office文档内容的工具类


    第一步引入依赖

           <!--xls-->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi</artifactId>
                <version>3.17</version>
            </dependency>
    
            <!--xlsx-->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.17</version>
            </dependency>
    
            <!--word-->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-scratchpad</artifactId>
               <version>3.17</version>
            </dependency>
    
            <!--pdf-->
            <dependency>
                <groupId>org.apache.pdfbox</groupId>
                <artifactId>pdfbox</artifactId>
                <version>2.0.18</version>
            </dependency>
    
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
            </dependency>
    

    第二步:创建解析文档的工具类 ReadFileConverter

    package com.atguigu.servicees.util;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.hssf.usermodel.HSSFCell;
    import org.apache.poi.hssf.usermodel.HSSFRow;
    import org.apache.poi.hssf.usermodel.HSSFSheet;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.ss.usermodel.Cell;
    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.text.NumberFormat;
    
    /**
     * 文件内容读取转换器
     */
    public class ReadFileConverter {
    
        public String getContents(String path) throws Exception {
            String contents = "";
            int index = path.lastIndexOf(".");
            String file_suffix = path.substring(index + 1).toLowerCase();
            if (file_suffix.equalsIgnoreCase("txt") || file_suffix.equalsIgnoreCase("log")) {
                contents = this.readTXT(path);
            } else if (file_suffix.equalsIgnoreCase("xls")) {
                contents = this.readXLS(path);
            } else if (file_suffix.equalsIgnoreCase("xlsx")) {
                contents = this.readXLSX(path);
            } else if (file_suffix.equalsIgnoreCase("doc")) {
                contents = this.readDOC(path);
            } else if (file_suffix.equalsIgnoreCase("docx")) {
                contents = this.readDOCX(path);
            } else if (file_suffix.equalsIgnoreCase("pdf")) {
                contents = this.readPDF(path);
            }
            return contents;
        }
    
    
        /**
         * 解析xls文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readXLS(String file) throws Exception {
            StringBuilder content = new StringBuilder();
            HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));
            try {
                for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                    if (null != workbook.getSheetAt(numSheets)) {
                        HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
                        for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                            if (null != aSheet.getRow(rowNumOfSheet)) {
                                HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                                for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                    if (null != aRow.getCell(cellNumOfRow)) {
                                        HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                                        if (this.convertCell(aCell).length() > 0) {
                                            content.append(this.convertCell(aCell));
                                        }
                                    }
                                    content.append("
    ");
                                }
                            }
                        }
                    }
                }
            } catch (Exception e) {
                content.append("xls文件格式不对或损坏");
            } finally {
                if (workbook != null) {
                    workbook.close();
                }
            }
            return content.toString();
        }
    
    
        /**
         * 解析xlsx文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readXLSX(String file) throws Exception {
            StringBuilder content = new StringBuilder();
            XSSFWorkbook workbook = new XSSFWorkbook(file);
            try {
                for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                    if (null != workbook.getSheetAt(numSheets)) {
                        XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
                        for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                            if (null != aSheet.getRow(rowNumOfSheet)) {
                                XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                                for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                    if (null != aRow.getCell(cellNumOfRow)) {
                                        XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                                        if (this.convertCell(aCell).length() > 0) {
                                            content.append(this.convertCell(aCell));
                                        }
                                    }
                                    content.append("
    ");
                                }
                            }
                        }
                    }
                }
            } catch (Exception e) {
                content.append("xlsx文件格式不对或损坏");
            } finally {
                if (workbook != null) {
                    workbook.close();
                }
            }
            return content.toString();
        }
    
    
        /**
         * 解析txt文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readTXT(String file) throws Exception {
            String contents = "";
            try {
                String encoding = this.get_charset(new File(file));
                if (encoding.equalsIgnoreCase("GBK")) {
                    contents = FileUtils.readFileToString(new File(file), "gbk");
                } else {
                    contents = FileUtils.readFileToString(new File(file), "utf8");
                }
            } catch (Exception e) {
                contents = "txt文件格式不对或损坏";
            }
            return contents;
        }
    
        /**
         * 解析doc文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readDOC(String file) throws Exception {
            String returnStr;
            FileInputStream inputStream = new FileInputStream(new File(file));
            WordExtractor wordExtractor = new WordExtractor(inputStream);
            try {
                returnStr = wordExtractor.getText();
            } catch (Exception e) {
                returnStr = "doc文件格式不对或损坏";
            } finally {
                if (inputStream != null) {
                    inputStream.close();
                }
            }
            return returnStr;
        }
    
    
        /**
         * 解析docx文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readDOCX(String file) throws Exception {
            String docx;
            XWPFWordExtractor xwp = new XWPFWordExtractor(POIXMLDocument.openPackage(file));
            try {
                docx = xwp.getText();
            } catch (Exception e) {
                docx = "docx文件格式不对或损坏";
            } finally {
                if (xwp != null) {
                    xwp.close();
                }
            }
            return docx;
        }
    
    
        /**
         * 解析pdf文件内容
         * @param file
         * @return
         * @throws Exception
         */
        public String readPDF(String file) throws Exception {
            String result = null;
            FileInputStream is = null;
            PDDocument document = null;
            try {
                is = new FileInputStream(file);
                document = PDDocument.load(is);
                PDFTextStripper stripper = new PDFTextStripper();
                result = stripper.getText(document);
            } catch (Exception e) {
                result = "pdf文件格式不对或损坏";
            } finally {
                if (is != null) {
                    is.close();
                }
                if (document != null) {
                    document.close();
                }
            }
            return result;
        }
    
        private String get_charset(File file) throws IOException {
            String charset = "GBK";
            byte[] first3Bytes = new byte[3];
            BufferedInputStream bis = null;
            try {
                boolean checked = false;
                bis = new BufferedInputStream(new FileInputStream(file));
                bis.mark(0);
                int read = bis.read(first3Bytes, 0, 3);
                if (read == -1)
                    return charset;
                if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                    charset = "UTF-16LE";
                    checked = true;
                } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                    charset = "UTF-16BE";
                    checked = true;
                } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) {
                    charset = "UTF-8";
                    checked = true;
                }
                bis.reset();
                if (!checked) {
                    int loc = 0;
                    while ((read = bis.read()) != -1) {
                        loc = loc + 1;
                        if (read >= 0xF0)
                            break;
                        if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK
                            break;
                        if (0xC0 <= read && read <= 0xDF) {
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
                                continue;
                            else
                                break;
                        } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                read = bis.read();
                                if (0x80 <= read && read <= 0xBF) {
                                    charset = "UTF-8";
                                    break;
                                } else
                                    break;
                            } else
                                break;
                        }
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (bis != null) {
                    bis.close();
                }
            }
            return charset;
        }
    
        /**
         * 解析excel需要使用的工具类
         * @param cell
         * @return
         */
        private String convertCell(Cell cell) {
            NumberFormat formater = NumberFormat.getInstance();
            formater.setGroupingUsed(false);
            String cellValue = "";
            if (cell == null) {
                return cellValue;
            }
            switch (cell.getCellTypeEnum()) {
                case NUMERIC:
                    cellValue = formater.format(cell.getNumericCellValue());
                    break;
                case STRING:
                    cellValue = cell.getStringCellValue();
                    break;
                case BLANK:
                    cellValue = cell.getStringCellValue();
                    break;
                case BOOLEAN:
                    cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
                    break;
                case ERROR:
                    cellValue = String.valueOf(cell.getErrorCellValue());
                    break;
                default:
                    cellValue = "";
            }
            return cellValue.trim();
        }
    
    }
    小蘑菇
  • 相关阅读:
    Python修饰器 ,控制授权,通过ini配置文件 使用密钥 给函数限制试用期和过期后试用次数
    excel vba 自定义函数 使用正则表达式提取字符串
    python 值比较判断,np.nan is np.nan 却 np.nan != np.nan ,pandas 单个数据框/单元格 值判断nan
    python 读取中文CSV 'gbk' codec can't decode bytes in position 2-3:illegal multibyte sequence
    python ipython [Errno 22] invalid mode ('rb') or filename 、IDE工作路径
    windows下 python 添加PYTHONPATH 环境变量
    pandas(python2) 读取中文数据,处理中文列名
    qq邮箱 微信提醒不通知
    python :import error
    python推荐淘宝物美价廉商品 2.0
  • 原文地址:https://www.cnblogs.com/wang66a/p/13515746.html
Copyright © 2020-2023  润新知