最近项目需要解析pdf单据,获取里面的字段数据。通过网上查阅发现itext的文档比pdfbox要多一点,所以选择了itext(不是说pdfbox不好,只是api和例子太少,难以理解)。因pdf非模板化(某政府发放),所以靠表单域获取变得不现实。一开始通过PdfReaderContentParser获取文档内容,但是获取到的是所有内容拼接成的一个字符串,而需求需要将数据精确到字段,靠截取字符串来达到解析的目的是行不通的,因为获取的内容毫无规律。后来查看源代码,发现解析过程是逐个字符进行的,且字符的顺序并不固定,所以只有通过字段所在的坐标范围来获取字段内容。
/** * Created by luon 2018/3/21. */ @Override protected ModelAndView onSubmit(HttpServletRequest request, HttpServletResponse response, Object command, BindException errors) throws Exception { FileUploadForm form = (FileUploadForm) command; SimpleResult result = SimpleResult.create(false); if (form.getFile() == null || form.getFile().getSize() == 0) { result.setMessage("请上传pdf文件"); return new ModelAndView(new JsonView(result)); } if (!"pdf".equals(FilenameUtils.getExtension(form.getFile().getOriginalFilename().toLowerCase()))) { result.setMessage("请上传pdf格式的文件"); return new ModelAndView(new JsonView(result)); } //获取pdf文件流 InputStream inputStream = form.getFile().getInputStream(); //获取pdf内容 List<List<Map<String, String>>> listAll = exportPdfList(inputStream); final String path = "/excel模板路径/xls/fillbls.xls"; Workbook workbook = ExcelLoader.loadXls(this.getClass().getResourceAsStream(path)); //遍历pdf 内容插入Excel; List<ExcelRow> sheet = new ArrayList<>(); for (int i = 0; i < listAll.size(); i++) { List<Map<String, String>> listdata = listAll.get(i); for (Map<String, String> map : listdata) { String orderNum = map.get("orderNum"); String trackNum = map.get("trackNum"); String serviceType = map.get("serviceType"); String actualWeight = map.get("actualWeight"); String actualWeightUnits = map.get("actualWeightUnits"); String ratedWeight = map.get("ratedWeight"); String ratedWeightUnits = map.get("ratedWeightUnits"); String amount = map.get("amount"); String chargeDesion1 = map.get("chargeDesion1"); String chargeDesionCash1 = map.get("chargeDesionCash1"); ExcelRow itemRow = new ExcelRow(); itemRow.add(orderNum); itemRow.add(trackNum); itemRow.add(serviceType); itemRow.add(actualWeight); itemRow.add(actualWeightUnits); itemRow.add(ratedWeight); itemRow.add(ratedWeightUnits); itemRow.add(amount); itemRow.add(chargeDesion1); itemRow.add(chargeDesionCash1); sheet.add(itemRow); } } ExcelWriter.write(workbook, sheet, 0, 1); InputStream outStream = ExcelWriter.close(workbook); String 
fileName = "美国境内账单表.xls"; fileName = java.net.URLEncoder.encode(fileName, "UTF-8"); return new ModelAndView(new DownloadView(outStream, fileName)); } //读取pdf内容 注意此方法没有贴上详细代码。 public List<List<Map<String, String>>> exportPdfList(InputStream inputStream) { List<List<Map<String, String>>> listAll = new ArrayList<>(); try { Map<String, byte[]> pdfData = LabelSpliter.byPageNum(inputStream); //分页取pdf List<Map.Entry<String, byte[]>> list = new ArrayList<>(pdfData.entrySet()); //排序从 第一页开始 Collections.sort(list, new Comparator<Map.Entry<String, byte[]>>() { public int compare(Map.Entry<String, byte[]> o1, Map.Entry<String, byte[]> o2) { return (new Integer(o1.getKey())).compareTo(new Integer(o2.getKey())); } }); for (Map.Entry<String, byte[]> entry : list) { System.out.println(entry.getKey() + "-------------------------------------" + entry.getValue()); byte[] pdfBypage = entry.getValue(); InputStream inputfjsb = new ByteArrayInputStream(pdfBypage); PDDocument document = PDDocument.load(inputfjsb); if (!document.isEncrypted()) { PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); PDFTextStripper tStripper = new PDFTextStripper(); String pdfFileInText = tStripper.getText(document); listMap.add(map); listAll.add(listMap); return listAll; } }
总结:一开始通过PdfReaderContentParser获取文档内容,但是获取到的是所有内容拼接成的一个字符串,而需求需要将数据精确到字段,靠截取字符串来达到解析的目的是行不通的,因为获取的内容毫无规律。以上代码仅提供思路,谢谢!