不知道大家又没有遇到这样的问题,某些业务系统有导出数据功能,导出的数据都是存放在excel表格里面,需要批量转csv,
但是这样的文件不是标准的excel文档,本质是html文档
比如说,系统导出的文档是这样的
从这里我们可以看出来,感觉就是一个普通的excel文档,通过office也能正常打开,但是你通过编写代码批量转csv的时候,就出问题
我也是在无意中发现这不是标准的excle文档,我们通过文档编辑器打开试试
这明显就是html文件,只能怪这个业务系统的开发人员不够严谨了,现在需要我们来解决这样的问题
我们先在idea里面创建一个maven项目
package com.gong; import java.io.*; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Scanner; import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * Jsoup解析html标签时类似于JQuery的一些符号 * * @author chixh * */ public class HtmlParser { protected List<List<String>> data = new LinkedList<List<String>>(); /** * 获取value值 * * @param e * @return */ public static String getValue(Element e) { return e.attr("value"); } /** * 获取 * <tr> * 和 * </tr> * 之间的文本 * * @param e * @return */ public static String getText(Element e) { return e.text(); } /** * 识别属性id的标签,一般一个html页面id唯一 * * @param body * @param id * @return */ public static Element getID(String body, String id) { Document doc = Jsoup.parse(body); // 所有#id的标签 Elements elements = doc.select("#" + id); // 返回第一个 return elements.first(); } /** * 识别属性class的标签 * * @param body * @param class * @return */ public static Elements getClassTag(String body, String classTag) { Document doc = Jsoup.parse(body); // 所有#id的标签 return doc.select("." + classTag); } /** * 获取tr标签元素组 * * @param e * @return */ public static Elements getTR(Element e) { return e.getElementsByTag("tr"); } /** * 获取td标签元素组 * * @param e * @return */ public static Elements getTD(Element e) { return e.getElementsByTag("td"); } /** * 获取表元组 * @param table * @return */ public static List<List<String>> getTables(Element table){ List<List<String>> data = new ArrayList<>(); for (Element etr : table.select("tr")) { List<String> listh=new ArrayList<>(); //获取表头 for(Element eth : etr.select("th")){ String th=eth.text(); listh.add(th); } if(!listh.isEmpty()) { data.add(listh); } List<String> list = new ArrayList<>(); for (Element etd : etr.select("td")) { String temp = etd.text(); //增加一行中的一列 list.add(temp); } //增加一行 if(!list.isEmpty()) { data.add(list); } } return data; } /** * 读html文件 * @param fileName * @return */ public static String readHtml(String fileName){ FileInputStream fis = null; StringBuffer sb = new StringBuffer(); try { fis = new FileInputStream(fileName); byte[] bytes = new byte[1024]; while (-1 != fis.read(bytes)) { sb.append(new String(bytes)); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { fis.close(); } catch (IOException e1) { e1.printStackTrace(); } } return sb.toString(); } public static void getFileName(String inputexecl,String outputcsv){ // Document doc2 = Jsoup.parse(readHtml("E:\datas\小组成员学习情况统计(11).xls")); String path = inputexecl; File f = new File(path); if (!f.exists()) { System.out.println(path + " not exists"); return; } File fa[] = f.listFiles();//获取该目录下所有文件和目录的绝对路径 for (int i = 0; i < fa.length; i++) { File fs = fa[i]; if (fs.isDirectory()) { System.out.println(fs.getName() + " [目录]"); } else{ String filepath= String.valueOf(fs); Document doc2 = Jsoup.parse(readHtml(filepath)); Element table = doc2.select("table").first(); //获取table表的内容,存放到List集合里面 List<List<String>> list = getTables(table); for (List<String> list2 : list) { for (String string : list2) { System.out.print(string+","); } System.out.println(); } String name= StringUtils.substringBeforeLast(fs.getName(),".");//获取文件名字部分 //String newFilePath="E:\datas\csv\小组成员学习.csv"; String newFilePath=outputcsv+name+".csv"; String savePath = newFilePath; File saveCSV = new File(savePath); String buffer=""; try { if(!saveCSV.exists()) saveCSV.createNewFile(); OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(saveCSV ),"UTF-8"); BufferedWriter writer = new BufferedWriter(write); for(int j=0;j<list.size();j++){ List<String> list1=new ArrayList<String>(); buffer=list.get(j).toString(); System.out.println(buffer); buffer = buffer.substring(1, buffer.lastIndexOf("]")).toString(); list1.add(buffer); writer.write(buffer); writer.newLine(); } writer.close(); } catch (IOException e) { e.printStackTrace(); } } } } public static void main(String[] args) { System.out.println("请输入Execl数据所在路径"); Scanner execl=new Scanner(System.in); String input=execl.nextLine(); //获取execl输入路径 System.out.println("请输入csv文件数据的输出路径"); Scanner csv=new Scanner(System.in); String output = csv.nextLine(); getFileName(input,output); } }
pom.xml
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.gong</groupId> <artifactId>csv</artifactId> <version>1.0-SNAPSHOT</version> <name>csv</name> <!-- FIXME change it to the project's website --> <url>http://www.example.com</url> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <maven.compiler.source>1.7</maven.compiler.source> <maven.compiler.target>1.7</maven.compiler.target> </properties> <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.11</version> <scope>test</scope> </dependency> <dependency> <groupId>net.sf.opencsv</groupId> <artifactId>opencsv</artifactId> <version>2.1</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version> <type>pom</type> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.7</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.7</version> </dependency> <dependency> <groupId>dom4j</groupId> <artifactId>dom4j</artifactId> <version>1.6.1</version> </dependency> <!-- https://mvnrepository.com/artifact/net.sourceforge.jexcelapi/jxl --> <dependency> <groupId>net.sourceforge.jexcelapi</groupId> <artifactId>jxl</artifactId> <version>2.6.12</version> </dependency> <dependency> <groupId>commons-lang</groupId> <artifactId>commons-lang</artifactId> <version>2.6</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency> </dependencies> <build> <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) --> <plugins> <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle --> <plugin> <artifactId>maven-clean-plugin</artifactId> <version>3.1.0</version> </plugin> <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging --> <plugin> <artifactId>maven-resources-plugin</artifactId> <version>3.0.2</version> </plugin> <plugin> <artifactId>maven-compiler-plugin</artifactId> <version>3.8.0</version> </plugin> <plugin> <artifactId>maven-surefire-plugin</artifactId> <version>2.22.1</version> </plugin> <plugin> <artifactId>maven-jar-plugin</artifactId> <version>3.0.2</version> </plugin> <plugin> <artifactId>maven-install-plugin</artifactId> <version>2.5.2</version> </plugin> <plugin> <artifactId>maven-deploy-plugin</artifactId> <version>2.8.2</version> </plugin> <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle --> <plugin> <artifactId>maven-site-plugin</artifactId> <version>3.7.1</version> </plugin> <plugin> <artifactId>maven-project-info-reports-plugin</artifactId> <version>3.0.0</version> </plugin> </plugins> </pluginManagement> </build> </project>
运行分别输入excel文档的目录和csv的输出目录就可以了,在这里提醒一下大家,如果使用我这段代码的话,excel文档的数据文件不能带有其他类型的文件。