• Java读取doc、docx、xls、xlsx、ppt、pptx、pdf文件内容


    读取文件信息所需依赖

    <!-- Maven dependencies required by the file-reading snippets below. -->
    <!-- Read Excel XLS -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>4.1.2</version>
    </dependency>
    <!-- Read PPT, DOC, Visio -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>4.1.2</version>
    </dependency>
    <!-- Read Excel XLSX, PPTX, DOCX -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>4.1.2</version>
    </dependency>
    <!-- Read PDF content -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.12</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>fontbox</artifactId>
        <version>2.0.12</version>
    </dependency>
    

    读取doc文件内容

    /**
     * Extracts the text of a Word file on disk using {@code WordExtractor}.
     *
     * @param name path of the file to read
     * @return the extracted text, or {@code null} when the file cannot be opened or parsed
     */
    public static String readWord(String name)
    {
        String text = null;
        // try-with-resources closes the extractor and the file stream on every path;
        // previously neither was ever closed, leaking a file handle per call.
        try (FileInputStream in = new FileInputStream(name);
             WordExtractor extractor = new WordExtractor(in))
        {
            text = extractor.getText();
        }
        catch (Exception e)
        {
            // Best-effort read: a missing or unreadable file is reported and the
            // caller receives null, exactly as before.
            e.printStackTrace();
        }
        return text;
    }
    

    读取docx文件内容(注意:下面的 readDoc 使用的是 WordExtractor,它只支持二进制 .doc 格式;读取 .docx 需改用 poi-ooxml 中的 XWPFWordExtractor)

    /**
     * Extracts the text of an uploaded Word file using {@code WordExtractor}.
     *
     * <p>NOTE(review): {@code WordExtractor} is the extractor for the binary {@code .doc}
     * format; {@code .docx} uploads presumably need {@code XWPFWordExtractor} instead —
     * confirm against the callers.
     *
     * @param file uploaded document
     * @return the extracted text, or an empty string when the upload is empty or cannot be read
     */
    public static String readDoc(MultipartFile file) {
        if (file.isEmpty()) {
            return "";
        }
        // Both resources are closed automatically, including when extraction fails.
        try (InputStream inputStream = file.getInputStream();
             WordExtractor wordExtractor = new WordExtractor(inputStream)) {
            return wordExtractor.getText();
        } catch (IOException e) {
            // Previously execution fell through to getText() on a null extractor and
            // threw a NullPointerException; report the failure and return no text instead.
            log.warn(e.toString(), e);
            return "";
        }
    }
    

    读取xls文件内容

    /**
     * Reads the first sheet of an uploaded {@code .xls} workbook as tab-separated text.
     *
     * <p>Every cell contributes its value followed by a tab character: string cells their
     * text, numeric cells their numeric value, and every other cell type an empty value.
     * Formula cells are rendered according to the type of their cached result. Rows are
     * not separated by line breaks.
     *
     * @param file uploaded workbook
     * @return the concatenated cell text; empty when the upload is empty, and whatever was
     *         collected before the failure when reading fails part-way
     */
    public static String readXls(MultipartFile file) {
        if (file.isEmpty()) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        // The workbook and its source stream are closed on every path.
        try (InputStream in = file.getInputStream();
             HSSFWorkbook excel = new HSSFWorkbook(in)) {
            // Only the first sheet is read.
            HSSFSheet sheet0 = excel.getSheetAt(0);
            for (Iterator<?> rows = sheet0.iterator(); rows.hasNext(); ) {
                HSSFRow row = (HSSFRow) rows.next();
                for (Iterator<?> cells = row.cellIterator(); cells.hasNext(); ) {
                    HSSFCell cell = (HSSFCell) cells.next();
                    CellType type = cell.getCellType();
                    if (type == CellType.FORMULA) {
                        // Asking a formula cell for a numeric value when its cached result
                        // is text throws and used to abort the whole read; dispatch on the
                        // cached result type instead.
                        type = cell.getCachedFormulaResultType();
                    }
                    if (type == CellType.STRING) {
                        content.append(cell.getStringCellValue());
                    } else if (type == CellType.NUMERIC) {
                        content.append(cell.getNumericCellValue());
                    }
                    // Every cell, including unsupported types, is terminated by a tab.
                    content.append('\t');
                }
            }
        } catch (Exception e) {
            log.warn(e.toString(), e);
        }
        return content.toString();
    }
    

    读取xlsx文件内容

    /**
     * Reads the first sheet of an uploaded {@code .xlsx} workbook as tab-separated text.
     *
     * <p>Every cell contributes its value followed by a tab character: string cells their
     * text, numeric cells their numeric value, and every other cell type an empty value.
     * Formula cells are rendered according to the type of their cached result. Rows are
     * not separated by line breaks.
     *
     * @param file uploaded workbook
     * @return the concatenated cell text; empty when the upload is empty, and whatever was
     *         collected before the failure when reading fails part-way
     */
    public static String readXlsx(MultipartFile file) {
        if (file.isEmpty()) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        // The workbook and its source stream are closed on every path.
        try (InputStream in = file.getInputStream();
             XSSFWorkbook excel = new XSSFWorkbook(in)) {
            // Only the first sheet is read.
            XSSFSheet sheet0 = excel.getSheetAt(0);
            for (Iterator<?> rows = sheet0.iterator(); rows.hasNext(); ) {
                XSSFRow row = (XSSFRow) rows.next();
                for (Iterator<?> cells = row.cellIterator(); cells.hasNext(); ) {
                    XSSFCell cell = (XSSFCell) cells.next();
                    CellType type = cell.getCellType();
                    if (type == CellType.FORMULA) {
                        // Asking a formula cell for a numeric value when its cached result
                        // is text throws and used to abort the whole read; dispatch on the
                        // cached result type instead.
                        type = cell.getCachedFormulaResultType();
                    }
                    if (type == CellType.STRING) {
                        content.append(cell.getStringCellValue());
                    } else if (type == CellType.NUMERIC) {
                        content.append(cell.getNumericCellValue());
                    }
                    // Every cell, including unsupported types, is terminated by a tab.
                    content.append('\t');
                }
            }
        } catch (Exception e) {
            log.warn(e.toString(), e);
        }
        return content.toString();
    }
    

    读取pdf文件内容

    /**
     * Extracts the text content of an uploaded PDF file.
     *
     * @param file uploaded PDF
     * @return the text of all pages, sorted by position on the page; an empty string when
     *         the upload is empty or the document cannot be parsed
     */
    public static String readPdf(MultipartFile file) {
        // Same guard as the other readers: nothing to parse in an empty upload.
        if (file.isEmpty()) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        // The document and the upload stream were never closed before; try-with-resources
        // releases both even when text extraction fails.
        try (InputStream is = file.getInputStream();
             PDDocument document = PDDocument.load(is)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in page-position order.
            stripper.setSortByPosition(true);
            // Cover the whole document, first page to last.
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            content.append(stripper.getText(document));
        } catch (Exception e) {
            log.warn(e.toString(), e);
        }
        return content.toString();
    }
    

    PDF文件加载有两种方式,无明显差异,方式二代码较简洁:

    // Approach 1: open the file as a stream, hand it to PDFParser, parse, then take the
    // document from the parser. (`pdfFile` and `document` are declared outside this fragment.)
    InputStream input = null;
    input = new FileInputStream( pdfFile );
    // Load the PDF document
    PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
    parser.parse();
    document = parser.getPDDocument();
    
     // Approach 2: load the document with a single PDDocument.load call
    document=PDDocument.load(pdfFile);   
    

    读取ppt文件内容

    /**
     * Extracts the text of every slide of an uploaded {@code .ppt} presentation.
     *
     * @param file uploaded presentation
     * @return the concatenated slide text; empty when the upload is empty, and whatever was
     *         collected before the failure when reading fails part-way
     */
    public static String readPPT(MultipartFile file) {
        if (file.isEmpty()) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        // Previously the slide show was only closed when the loop finished without an
        // exception and the stream was never closed; both are now released on every path.
        try (InputStream is = file.getInputStream();
             HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is)) {
            // NOTE(review): the extractor is not closed on its own — closing it is expected
            // to only close the wrapped slide show, which this try block already does;
            // confirm against the POI version in use.
            SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow);
            for (HSLFSlide slide : hslfSlideShow.getSlides()) {
                content.append(slideShowExtractor.getText(slide));
            }
        } catch (IOException e) {
            log.warn(e.toString(), e);
        }
        return content.toString();
    }
    

    读取pptx文件内容

    /**
     * Extracts the text of an uploaded {@code .pptx} presentation.
     *
     * <p>For each slide, walks the shapes directly under the slide's shape tree and appends
     * the text of every regular text run found in their text bodies. Shapes without a text
     * body are skipped. No separators are inserted between runs, paragraphs or slides.
     *
     * @param file uploaded presentation
     * @return the concatenated run text; empty when the upload is empty, and whatever was
     *         collected before the failure when reading fails part-way
     */
    public static String readPPTX(MultipartFile file) {
        if (file.isEmpty()) {
            return "";
        }
        // Local, single-threaded accumulation: StringBuilder avoids StringBuffer's locking.
        StringBuilder content = new StringBuilder();
        // Previously the slide show stayed open whenever an exception interrupted the loop
        // and the stream was never closed; both are now released on every path.
        try (InputStream is = file.getInputStream();
             XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
            for (XSLFSlide slide : xmlSlideShow.getSlides()) {
                CTGroupShape spTree = slide.getXmlObject().getCSld().getSpTree();
                for (CTShape shape : spTree.getSpList()) {
                    CTTextBody txBody = shape.getTxBody();
                    if (txBody == null) {
                        // Shape carries no text.
                        continue;
                    }
                    for (CTTextParagraph textParagraph : txBody.getPList()) {
                        for (CTRegularTextRun textRun : textParagraph.getRList()) {
                            content.append(textRun.getT());
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best-effort read: report the failure and return what was collected so far.
            e.printStackTrace();
        }
        return content.toString();
    }
    
  • 相关阅读:
    CefSharp应用——High DPI问题
    CefSharp应用——程序输出
    CefSharp应用——环境搭建
    QTTabBar加载项被禁用
    OCR 中文汉字识别,可用于文档识别,身份证识别,名片识别,采用字库+卷积神经网络
    springboot中Thymeleaf和Freemarker模板引擎的区别
    一种mysql 实现用户前两条语句方案
    Elasticsearch java.lang.ClassNotFoundException: org.elasticsearch.common.transport.InetSocketTransportAddress
    版本6.2.4的elasticsearch包里面没有InetSocketTransportAddress
    ES spring数据JPA&spring data elasticsearch;找不到类型的属性索引
  • 原文地址:https://www.cnblogs.com/code-duck/p/13744888.html
Copyright © 2020-2023  润新知