• Java读取各种文件格式内容


    所需的jar包我也记不太清了(从import来看主要是 Apache POI、PDFBox 和 commons-io),大家可以自行搜索,直接上代码:

    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.text.NumberFormat;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.hssf.usermodel.HSSFCell;
    import org.apache.poi.hssf.usermodel.HSSFRow;
    import org.apache.poi.hssf.usermodel.HSSFSheet;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.ss.usermodel.Cell;
    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    
    /**
    *文件内容读取转换器
    */
    public class ReadFileConverter 
    {
    
    public String getContents(String path) throws Exception
    {
      String contents = "";
      int index = path.lastIndexOf(".");
      String file_suffix = path.substring(index+1).toLowerCase();
      if(file_suffix.equalsIgnoreCase("txt")||file_suffix.equalsIgnoreCase("log")){
        contents = this.readTXT(path);
      }
      else if(file_suffix.equalsIgnoreCase("xls")){
        contents = this.readXLS(path);
      }
      else if(file_suffix.equalsIgnoreCase("xlsx")){
        contents = this.readXLSX(path);
      }
      else if(file_suffix.equalsIgnoreCase("doc")){
        contents = this.readDOC(path);
      }
      else if(file_suffix.equalsIgnoreCase("docx")){
        contents = this.readDOCX(path);
      }
      else if(file_suffix.equalsIgnoreCase("pdf")){
        contents = this.readPDF(path);
      }
      return contents;
    }
    
    
    public String readXLS(String file) throws Exception
    {
      StringBuilder content = new StringBuilder();
      HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));
      try{
        for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){
          if (null != workbook.getSheetAt(numSheets)){
            HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
            for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){
              if (null != aSheet.getRow(rowNumOfSheet)){
                HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){
                  if (null != aRow.getCell(cellNumOfRow)){
                    HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                    if (this.convertCell(aCell).length() > 0){
                      content.append(this.convertCell(aCell));
                     }
                  }
                  content.append("
    ");
                }
              }
            }
          }
        }
      }
      catch(Exception e){
        content.append("xls文件格式不对或损坏");
      }
      finally{
        if(workbook!=null){
          workbook.close();
        }
      }
      return content.toString();
    }
    
    
    public String readXLSX(String file) throws Exception
    {
      StringBuilder content = new StringBuilder();
      XSSFWorkbook workbook = new XSSFWorkbook(file);
      try{
        for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){
          if (null != workbook.getSheetAt(numSheets)){
            XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
            for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){
              if (null != aSheet.getRow(rowNumOfSheet)){
                XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
                for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){
                  if (null != aRow.getCell(cellNumOfRow)){
                    XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
                    if (this.convertCell(aCell).length() > 0){
                      content.append(this.convertCell(aCell));
                    }
                  }
                  content.append("
    ");
                }
              }
            }
          }
        }
      }catch(Exception e){
        content.append("xlsx文件格式不对或损坏");
      }
      finally{
        if(workbook!=null){
          workbook.close();
        }
      }
      return content.toString();
    }
    
    public String readTXT(String file) throws Exception
    {
      String contents = "";
      try{
        String encoding = this.get_charset(new File(file));
        if (encoding.equalsIgnoreCase("GBK")) {
          contents = FileUtils.readFileToString(new File(file), "gbk");
        } else {
          contents = FileUtils.readFileToString(new File(file), "utf8");
        }
      }catch(Exception e){
        contents = "txt文件格式不对或损坏";
      }
      return contents;
    }
    
    public String readDOC(String file) throws Exception
    {
      String returnStr;
      WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file)));
      try{
        returnStr = wordExtractor.getText();
      }catch(Exception e){
        returnStr="doc文件格式不对或损坏";
      }
      finally{
        if(wordExtractor != null){
          wordExtractor.close();
        }
      }
      return returnStr;
    }
    
    
    public String readDOCX(String file) throws Exception
    {
      String docx;
      XWPFWordExtractor xwp= new XWPFWordExtractor(POIXMLDocument.openPackage(file));
      try{
        docx= xwp.getText();
      }catch(Exception e){
        docx="docx文件格式不对或损坏";
      }
      finally{
        if(xwp !=null){
          xwp.close();
        }
      }
      return docx;
    }
    
    
    public String readPDF(String file) throws Exception
    {
      String result = null;
      FileInputStream is = null;
      PDDocument document = null;
      try{
        is = new FileInputStream(file);
        document = PDDocument.load(is);
        PDFTextStripper stripper = new PDFTextStripper();
        result = stripper.getText(document);
      }catch(Exception e){
        result="pdf文件格式不对或损坏";
      }
      finally{
        if (is != null){
          is.close();
        }
        if (document != null){
          document.close();
        }
      }
      return result;
    }
    
    private String get_charset(File file) throws IOException 
    {
      String charset = "GBK";
      byte[] first3Bytes = new byte[3];
      BufferedInputStream bis = null;
      try {
        boolean checked = false;
        bis = new BufferedInputStream(new FileInputStream(file));
        bis.mark(0);
        int read = bis.read(first3Bytes, 0, 3);
        if (read == -1)
          return charset;
        if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
          charset = "UTF-16LE";
          checked = true;
        } else if (first3Bytes[0] == (byte) 0xFE&& first3Bytes[1] == (byte) 0xFF) {
          charset = "UTF-16BE";
          checked = true;
        } else if (first3Bytes[0] == (byte) 0xEF&& first3Bytes[1] == (byte) 0xBB&& first3Bytes[2] == (byte) 0xBF) {
          charset = "UTF-8";
          checked = true;
        }
        bis.reset();
        if (!checked) {
          // int len = 0;
          int loc = 0;
          while ((read = bis.read()) != -1) {
            loc=loc+1;
            if (read >= 0xF0)
              break;
            if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK
              break;
            if (0xC0 <= read && read <= 0xDF) {
              read = bis.read();
              if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
              // (0x80
              // - 0xBF),也可能在GB编码内
                continue;
              else
                break;
            } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
              read = bis.read();
              if (0x80 <= read && read <= 0xBF) {
                read = bis.read();
                if (0x80 <= read && read <= 0xBF) {
                  charset = "UTF-8";
                  break;
                } else
                  break;
              } else
                break;
            }
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      } finally {
        if (bis != null) {
          bis.close();
        }
      }
      return charset;
    }
    
    @SuppressWarnings("deprecation")
    private String convertCell(Cell cell) 
    {
      NumberFormat formater = NumberFormat.getInstance();
      formater.setGroupingUsed(false);
      String cellValue = "";
      if (cell == null) {
        return cellValue;
      }
      switch (cell.getCellTypeEnum()) {
        case NUMERIC:
          cellValue = formater.format(cell.getNumericCellValue());
          break;
        case STRING:
          cellValue = cell.getStringCellValue();
          break;
        case BLANK:
          cellValue = cell.getStringCellValue();
          break;
        case BOOLEAN:
          cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
          break;
        case ERROR:
          cellValue = String.valueOf(cell.getErrorCellValue());
          break;
        default:
          cellValue = "";
        }
        return cellValue.trim();
      }
    
    }
    版权声明:如需转载,请注明!PS:如是转载随便,请忽略
  • 相关阅读:
    《数据密集型应用系统设计》读书笔记
    每周总结
    每周总结
    每周总结
    《数据密集型应用系统设计》读书笔记
    每周总结
    《重构》读书笔记
    每周总结
    软件过程与管理知识回顾
    操作系统知识汇总5-6章
  • 原文地址:https://www.cnblogs.com/zwdx/p/7234484.html
Copyright © 2020-2023  润新知