工作中有遇到需要获取上传的Excel文件的列明、最大行数、大小等元数据信息。通常做法是通过Apache的POI工具加载文件然后再读取行列进行处理。这种方法很大的弊端就是需要把excel文件加载到内存,如果遇到大的文件,内存暴增,很容易出现OOM。为了解决这个问题,我研究了excel文件的格式,写了一工具类来自己解析和获取这些信息。
一、excel文件格式解析
其实xls、xlsx格式的文件其实就是一个压缩包,我们找一个excel文件,把后缀改成.rar,然后解压,你会发现文件夹里面大概是这样的:
其中关键的是xl这个文件夹,看第二张图:
1、workbook.xml 里面包含了sheet的信息,比如有几个sheet,每一个的名称是什么
2、sharedString.xml 老重要了,里面就是包含了整个excel文件中单元格中的内容,excel是通过索引来引用内容的。
3、worksheets 文件夹里面包含了sheet内容的定义
看第三张图,sheet1.xml表示第一个sheet的定义,其内容是这样的:
看到那些数字了吗,其实表示这个单元格的内容在sharedString.xml中的索引。
二、示例代码实现
接下来我将展示一个获取excel文件中列名称、行数、sheet名称的java代码。
import java.io.File; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * excel文件元数据读取工具 * * @author yuananyun * @date 2017/11/16 14:20 **/ public class ExcelXmlUtil { //获取第一个sheet的名称的表达式 private static Pattern firstSheetPattern = Pattern.compile("sheet name="(.*?)" sheetId="1""); //抽取一行的表达式,如 //<row r="200001" spans="1:2" s="1" customFormat="1" x14ac:dyDescent="0.15">/row> private static Pattern rowPattern = Pattern.compile("<row(.*?)></row>"); //求解一行行号的表达式 private static Pattern rowNumPattern = Pattern.compile("r="(\d+)""); //求解标题列个数的表达式 private static Pattern columnCountPattern = Pattern.compile("</v>"); //求解列标题索引的表达式 private static Pattern columnIndexPattern = Pattern.compile("<v>(\d*)</v>"); //求解列标题名称的表达式 private static Pattern titleValuePattern = Pattern.compile("(?:(?:<t>)|(?:<t xml:space=".*">))([\s\S]*?)</t>"); static class ExcelRowColumnInfo { private long maxRowNum; private int coluntCount; private List<String> titleList; private String firstSheetName; public ExcelRowColumnInfo(String firstSheetName, int maxRowNum, int coluntCount, List<String> titleList) { this.firstSheetName = firstSheetName; this.maxRowNum = maxRowNum; this.coluntCount = coluntCount; this.titleList = titleList; } public long getMaxRowNum() { return maxRowNum; } public void setMaxRowNum(int maxRowNum) { this.maxRowNum = maxRowNum; } public int getColuntCount() { return coluntCount; } public void setColuntCount(int coluntCount) { this.coluntCount = coluntCount; } public List<String> getTitleList() { return titleList == null ? new ArrayList<>() : titleList; } public void setTitleList(List<String> titleList) { this.titleList = titleList; } public String getFirstSheetName() { return firstSheetName; } public void setFirstSheetName(String firstSheetName) { this.firstSheetName = firstSheetName; } @Override public String toString() { return "ExcelRowColumnInfo{" + "maxRowNum=" + maxRowNum + ", coluntCount=" + coluntCount + ", titleList=" + titleList.toString() + '}'; } } /** * 获取excel文件的行列个数 * * @param excelFilePath * @param isOverwrite 是否覆盖源excel文件 * @return ExcelRowColumnInfo */ public static ExcelRowColumnInfo getRowAndColumnInfo(String excelFilePath, boolean isOverwrite) { try { File excelFile = new File(excelFilePath); if (!excelFile.exists()) return null; String zipFilePath = excelFilePath.replace(".xlsx", ".zip").replace(".xls", ".zip"); File zipFile = new File(zipFilePath); if (zipFile.exists()) zipFile.delete(); if (isOverwrite) { //直接重命名 excelFile.renameTo(zipFile); } else { // 复制文件 FileUtil.copyFile(excelFilePath, zipFilePath); } //解压的临时目录 String tmpDir = zipFilePath.replace(".zip", ""); List<File> fileList = ZipUtils.upzipFile(zipFile, tmpDir); File sheet1File = null; File sharedStringsFile = null; File workbookFile = null; for (File file : fileList) { if (file.getPath().contains("sheet1.xml")) sheet1File = file; if (file.getPath().contains("sharedStrings.xml")) sharedStringsFile = file; if (file.getPath().contains("workbook.xml")) workbookFile = file; } if (sheet1File == null || sharedStringsFile == null) return null; //抽取sheet名称 String sheetName = parseFirstSheetName(workbookFile); int[] rcArray = parseMaxRowNumAndColCount(sheet1File); int maxRowNum = rcArray[0]; // int columCount = rcArray[1]; int[] titleIndexArray = parseTitleIndexArray(sheet1File); List<String> titleList = parseTitleList(sharedStringsFile, titleIndexArray); deleteFileRecursively(zipFile); deleteFileRecursively(new File(tmpDir)); if (titleList == null || titleList.size() == 0 || maxRowNum == 0) return null; return new ExcelRowColumnInfo(sheetName, maxRowNum, titleList.size(), titleList); } catch (Exception ex) { ex.printStackTrace(); } return null; } /** * 解析第一个sheet的名称 * * @param workbookFile * @return */ private static String parseFirstSheetName(File workbookFile) { String content = getFileSegment(workbookFile, 0, Integer.MAX_VALUE); Matcher matcher = firstSheetPattern.matcher(content); if (matcher.find()) return matcher.group(1); return null; } /** * 求解标题列关键字所在的索引 * * @param sheet1File * @return */ private static int[] parseTitleIndexArray(File sheet1File) { int realColCount = 0; String startSegment = getFileSegment(sheet1File, 2000); if (startSegment != null) { //求解真实的列数 Matcher matcher = rowPattern.matcher(startSegment); if (matcher.find()) { String firstRow = matcher.group(1); if (firstRow != null) { matcher = columnCountPattern.matcher(firstRow); while (matcher.find()) realColCount++; } } if (realColCount > 0) { //求解标题 int[] titleIndexArray = new int[realColCount]; matcher = columnIndexPattern.matcher(startSegment); int i = 0; while (matcher.find() && i < realColCount) { titleIndexArray[i++] = Integer.parseInt(matcher.group(1)); } return titleIndexArray; } } return null; } /** * 解析excel文件的标题列名称 * * @param sharedStringsFile * @param titleIndexArray * @return */ private static List<String> parseTitleList(File sharedStringsFile, int[] titleIndexArray) { List<String> titleList = new ArrayList<>(); int count = titleIndexArray.length; if (count > 0) { int minIndex = Integer.MAX_VALUE; int maxIndex = Integer.MIN_VALUE; for (int i = 0; i < count; i++) { int index = titleIndexArray[i]; if (index > maxIndex) maxIndex = index; if (index < minIndex) minIndex = index; } //885是头部的长度,限制每个row长度为200字符 // int length = (885 + (maxIndex - minIndex + 1) * 200); //标题真的是到处都在, String[] titleArray = new String[count]; // if (minIndex > 10000) { // //这是一个大文档,整篇加载 // length = Integer.MAX_VALUE; // } String segment = getFileSegment(sharedStringsFile, 0, Integer.MAX_VALUE); Matcher matcher = titleValuePattern.matcher(segment); int i = 0; while (matcher.find() && count > 0) { String value = matcher.group(1); // System.out.println(i + " ------> " + value); for (int j = 0; j < titleIndexArray.length; j++) { if (i == titleIndexArray[j]) { titleArray[j] = value; count--; break; } } i++; } if (titleArray.length > 0) { Collections.addAll(titleList, titleArray); //去掉空格单元格 Collections.reverse(titleList); for (int i1 = 0; i1 < titleList.size(); i1++) { String title = String.valueOf(titleList.get(i1)); if ("".equals(title.trim())) titleList.remove(i1); } Collections.reverse(titleList); } } return titleList; } /** * 解析文件的最大行号和列数 * * @param sheet1File * @return */ private static int[] parseMaxRowNumAndColCount(File sheet1File) { int rowNum = 0, colCount = 0; String endSegment = getFileSegment(sheet1File, -1000); if (endSegment != null) { Matcher matcher = rowPattern.matcher(endSegment); String lastRow = ""; while (matcher.find()) { lastRow = matcher.group(1); } if (lastRow.length() > 0) { matcher = rowNumPattern.matcher(lastRow); if (matcher.find()) rowNum = Integer.parseInt(matcher.group(1)); matcher = columnCountPattern.matcher(lastRow); while (matcher.find()) colCount++; } } return new int[]{rowNum, colCount}; } /** * 递归删除文件及文件夹 * * @param file */ private static void deleteFileRecursively(File file) { if (file.exists()) { if (file.isFile()) { file.delete(); } else if (file.isDirectory()) { File[] files = file.listFiles(); for (int i = 0; i < files.length; i++) { deleteFileRecursively(files[i]); } file.delete(); } } } private static String getFileSegment(File file, int length) { return getFileSegment(file, 0, length); } /** * 从一个文件中截取一段字符串 * * @param file * @param offset * @param length length<0时,offset将失效 * @return */ private static String getFileSegment(File file, long offset, int length) { if (file == null || !file.exists()) return null; try { Charset charset = Charset.forName("UTF-8"); CharsetDecoder decoder = charset.newDecoder(); StringBuilder builder = new StringBuilder(); RandomAccessFile aFile = new RandomAccessFile(file, "r"); FileChannel inChannel = aFile.getChannel(); if (inChannel != null) { if (Integer.MAX_VALUE == length) length = (int) inChannel.size(); ByteBuffer buf = ByteBuffer.allocate(Math.abs(length)); if (length < 0) offset = inChannel.size() + length; int size = Math.abs(length); inChannel.position(offset < 0 ? 0 : offset); int bytesRead = inChannel.read(buf); while (bytesRead != -1 && size > 0) { buf.flip(); CharBuffer charBuffer = decoder.decode(buf); builder.append(charBuffer); buf.clear(); bytesRead = inChannel.read(buf); size = size - bytesRead; } inChannel.close(); } aFile.close(); return builder.toString(); } catch (Exception ex) { ex.printStackTrace(); } return null; } /** * 测试 * @param args * @throws UnsupportedEncodingException */ public static void main(String[] args) throws UnsupportedEncodingException { ExcelRowColumnInfo result; result = getRowAndColumnInfo("D:\元数据求解.xls", false); System.out.println(result); } }
用到的几个工具类:
/** * 文件复制 * @param srcFilePath * @param destFilePath * @return */ public static String copyFile(String srcFilePath, String destFilePath){ if (StringUtils.isEmpty(srcFilePath) || StringUtils.isEmpty(destFilePath)){ return null; } File srcFile = new File(srcFilePath); File destFile = new File(destFilePath); if (!srcFile.exists() || srcFile.isDirectory()){ return null; } try { if (!destFile.exists()) { destFile.createNewFile(); } FileUtils.copyFile(srcFile, destFile); return destFilePath; } catch (IOException e){ e.printStackTrace(); } return null; } /** * 对.zip文件进行解压缩 * * @param zipFile 解压缩文件 * @param descDir 解压缩的目标地址,如:D:\测试 或 /mnt/d/测试 * @return */ @SuppressWarnings("rawtypes") public static List<File> upzipFile(File zipFile, String descDir) { List<File> _list = new ArrayList<File>(); try { ZipFile _zipFile = new ZipFile(zipFile, "GBK"); for (Enumeration entries = _zipFile.getEntries(); entries.hasMoreElements(); ) { ZipEntry entry = (ZipEntry) entries.nextElement(); File _file = new File(descDir + File.separator + entry.getName()); if (entry.isDirectory()) { _file.mkdirs(); } else { File _parent = _file.getParentFile(); if (!_parent.exists()) { _parent.mkdirs(); } InputStream _in = _zipFile.getInputStream(entry); OutputStream _out = new FileOutputStream(_file); int len = 0; while ((len = _in.read(_byte)) > 0) { _out.write(_byte, 0, len); } _in.close(); _out.flush(); _out.close(); _list.add(_file); } } _zipFile.close(); } catch (IOException e) { } return _list; }
其中zip用的是
org.apache.tools.zip.ZipEntry;