下载地址:https://files.cnblogs.com/files/heyang78/JsonAnalyzer20200518-01.zip
测试用例:https://www.cnblogs.com/heyang78/p/12911174.html
为什么创建此工程?
笔者在开发中曾遇到这样一项工作:一个RESTful接口变更后,需要将新的Json文本与旧有文档进行对比校正.当时最大的问题是两边都不是按字母序排列的,比较时只能上下翻找而无法逐一比对,于是我产生了将Json文本同级节点按字母序排列的想法,继而将其实现. https://www.cnblogs.com/heyang78/p/11973129.html 是第一个版本,当时实现了需求但未完善,今天(2020年5月18日)修正了原有数组解析不完全的bug并调整简化了代码.
解析效果展示:
原有文本:
{"data":[{"deliveryListId":"20180001","shipperCode":"0030","shortShipperName":"RB","orderNo":"102018032001","deliveryOrder":1,"receiverName":"吉田XXX","receiverTelNo":"07012340303","receiverAddress1":"東京都足立区足立1-1","receiverAddress2":"東京都足立区足立1-2","isCod":true,"billAmount":5,"geocodingScore":50,"latitudeJP":"56789.33","longitudeJP":"123456.33","latitude":"20180001.22","longitude":"20180001.33","vehicleId":"239","orderDetails":[{"trackingNo":"201803200001","quantity":1,"lapCount":null,"statusCode":null,"statusNameMobile":null},{"trackingNo":"201803200002","quantity":1,"lapCount":4,"statusCode":"100","statusNameMobile":"配送準備中"},{"trackingNo":"201803200003","quantity":1,"lapCount":4,"statusCode":"300","statusNameMobile":"持出し"},{"trackingNo":"201803200004","quantity":1,"lapCount":4,"statusCode":"100","statusNameMobile":"配送準備中"},{"trackingNo":"201803200005","quantity":1,"lapCount":4,"statusCode":"100","statusNameMobile":"配送準備中"}]}]}
解析后文本:
{ "data":[ { "billAmount":5, "deliveryListId":"20180001", "deliveryOrder":1, "geocodingScore":50, "isCod":true, "latitude":"20180001.22", "latitudeJP":"56789.33", "longitude":"20180001.33", "longitudeJP":"123456.33", "orderDetails":[ { "lapCount":null, "quantity":1, "statusCode":null, "statusNameMobile":null, "trackingNo":"201803200001" }, { "lapCount":4, "quantity":1, "statusCode":"100", "statusNameMobile":"配送準備中", "trackingNo":"201803200002" }, { "lapCount":4, "quantity":1, "statusCode":"300", "statusNameMobile":"持出し", "trackingNo":"201803200003" }, { "lapCount":4, "quantity":1, "statusCode":"100", "statusNameMobile":"配送準備中", "trackingNo":"201803200004" }, { "lapCount":4, "quantity":1, "statusCode":"100", "statusNameMobile":"配送準備中", "trackingNo":"201803200005" } ], "orderNo":"102018032001", "receiverAddress1":"東京都足立区足立1-1", "receiverAddress2":"東京都足立区足立1-2", "receiverName":"吉田XXX", "receiverTelNo":"07012340303", "shipperCode":"0030", "shortShipperName":"RB", "vehicleId":"239" } ] }
此工程的扩展意义:
做编译的分词,语法分析,构建语法树在此工程中均有体现,此工程也为后继编译项目打下了基础.
核心类说明:
记号类,此类用于给Json文本中出现的七种文本分类:
package com.heyang; /** * Tokens in json format * @author Heyang * */ public class Token { public static final int TYPE_OPEN_BRACE=0; // { public static final int TYPE_CLOSE_BRACE=1; // } public static final int TYPE_TEXT=2; // text public static final int TYPE_COMMA=3; // , public static final int TYPE_COLON=4; // : public static final int TYPE_OPEN_BRACKET=5; // [ public static final int TYPE_CLOSE_BRACKET=6; // ] private int type; private String text; public Token(char c,int type) { this.text=String.valueOf(c); this.type=type; } public Token(String word,int type) { this.text=word; this.type=type; } public int getType() { return type; } public void setType(int type) { this.type = type; } public String getText() { return text; } public void setText(String text) { this.text = text; } }
分词器类,此类用于将json变成记号:
package com.heyang;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

/**
 * Splits JSON text into a flat list of {@link Token}s.
 *
 * <p>Structural characters (braces, brackets, comma, colon) each become their
 * own token. Everything else is collected into TYPE_TEXT tokens. A quoted
 * string is kept as one text token, including its surrounding quotes, so
 * whitespace and structural characters inside a string literal are preserved
 * instead of being stripped or mistaken for JSON structure.
 *
 * @author Heyang
 */
public class Lexer {
    private List<Token> tokens;

    /**
     * Tokenizes the given JSON text.
     *
     * @param jsonTxt the raw JSON text; {@code null} is treated as empty input
     */
    public Lexer(String jsonTxt) {
        tokens = new ArrayList<Token>();
        if (jsonTxt == null) {
            return;
        }

        StringBuilder bundle = new StringBuilder();
        boolean inString = false; // currently between an opening and closing quote
        boolean escaped = false;  // previous character inside the string was a backslash

        for (int i = 0; i < jsonTxt.length(); i++) {
            char c = jsonTxt.charAt(i);

            if (inString) {
                // Inside a string literal every character is data, including
                // whitespace and structural characters.
                bundle.append(c);
                if (escaped) {
                    escaped = false;
                } else if (c == '\\') {
                    escaped = true;
                } else if (c == '"') {
                    inString = false;
                }
                continue;
            }

            if (c == '"') {
                inString = true;
                bundle.append(c);
            } else if (Character.isWhitespace(c)) {
                // Whitespace outside of strings carries no meaning.
                continue;
            } else if (c == '{') {
                tokens.add(new Token(c, Token.TYPE_OPEN_BRACE));
            } else if (c == '}') {
                flushText(bundle);
                tokens.add(new Token(c, Token.TYPE_CLOSE_BRACE));
            } else if (c == '[') {
                tokens.add(new Token(c, Token.TYPE_OPEN_BRACKET));
            } else if (c == ']') {
                flushText(bundle);
                tokens.add(new Token(c, Token.TYPE_CLOSE_BRACKET));
            } else if (c == ',') {
                flushText(bundle);
                tokens.add(new Token(c, Token.TYPE_COMMA));
            } else if (c == ':') {
                flushText(bundle);
                tokens.add(new Token(c, Token.TYPE_COLON));
            } else {
                bundle.append(c);
            }
        }

        // Do not lose text that is not followed by a structural character
        // (for example an unterminated string or a bare scalar document).
        flushText(bundle);
    }

    /**
     * Emits the collected text as a TYPE_TEXT token, if there is any, and
     * resets the buffer.
     */
    private void flushText(StringBuilder bundle) {
        String text = bundle.toString();
        if (StringUtils.isNotEmpty(text)) {
            tokens.add(new Token(text, Token.TYPE_TEXT));
        }
        bundle.setLength(0);
    }

    /** @return the tokens in document order */
    public List<Token> getTokenList() {
        return tokens;
    }

    /** Prints every token with a 1-based index to standard output. Just for test. */
    public void printTokens() {
        int idx = 0;
        for (Token t : tokens) {
            idx++;
            System.out.println("#" + idx + " " + t.getText());
        }
    }

    /**
     * @return the token texts joined without separators, i.e. the input with
     *         all whitespace outside of string literals removed
     */
    public String getCompactJsonTxt() {
        StringBuilder sb = new StringBuilder();
        for (Token t : tokens) {
            sb.append(t.getText());
        }
        return sb.toString();
    }
}
节点类,这个类构成了语法树的节点:
package com.heyang;

import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

/**
 * A node of the JSON syntax tree.
 *
 * <p>A node is a scalar (its value is kept as raw text), an array, or an
 * object ("list"). Array and object nodes own an ordered list of children.
 * The key is {@code null} for array elements and for the root.
 *
 * @author Heyang
 */
public class Node implements Comparable<Node> {
    // The three value types
    public static final int Type_String = 1;
    public static final int Type_Array = 2;
    public static final int Type_List = 3;

    // Key always is String
    private String key;
    private Node parent;

    // Exactly one of valueString / valueList is used, depending on valueType
    private int valueType;
    private String valueString;
    private List<Node> valueList;

    // Indent depth; maintained by addChild
    private int depth;

    public Node() {
    }

    /**
     * Creates a scalar node.
     *
     * @param key   the key text, or {@code null} for a key-less value
     * @param value the raw value text
     */
    public Node(String key, String value) {
        this.key = key;
        this.valueType = Type_String;
        this.valueString = value;
        this.depth = 0;
    }

    /**
     * Creates a container node with an empty child list.
     *
     * @param key  the key text, or {@code null}
     * @param type the value type, normally Type_Array or Type_List
     */
    public Node(String key, int type) {
        this.key = key;
        this.valueType = type;
        this.valueList = new LinkedList<Node>();
    }

    /**
     * Appends a child, makes this node its parent, and recomputes the depth of
     * the whole subtree below this node. Does nothing on nodes that have no
     * child list (scalar nodes).
     *
     * @param child the node to append
     */
    public void addChild(Node child) {
        if (valueList != null) {
            valueList.add(child);
            child.parent = this;
            adjustDepth();
        }
    }

    /** Sets every descendant's depth relative to this node's depth. */
    private void adjustDepth() {
        if (valueType == Type_List || valueType == Type_Array) {
            for (Node json : valueList) {
                json.depth = this.depth + 1;
                json.adjustDepth();
            }
        }
    }

    /**
     * Renders this node and its subtree as text.
     *
     * <p>Note: the children of an object node are sorted by key in place, so
     * calling this method reorders {@link #getValueList()} of object nodes.
     * Array children keep their order.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        // key
        String tabs = getIndentSpace();
        sb.append(tabs);
        if (key != null) {
            sb.append(key);
            sb.append(":");
        }

        // value
        if (valueType == Type_String) {
            sb.append(valueString);
        } else if (valueType == Type_Array) {
            appendChildren(sb, "[ ", tabs + "]");
        } else if (valueType == Type_List) {
            Collections.sort(valueList);
            appendChildren(sb, "{ ", tabs + "}");
        }

        return sb.toString();
    }

    /**
     * Appends the opening text, every child separated by ", ", and the
     * closing text. Iterates instead of using get(i) because the child list
     * is a linked list.
     */
    private void appendChildren(StringBuilder sb, String open, String close) {
        sb.append(open);
        int remaining = valueList.size();
        for (Node child : valueList) {
            sb.append(child.toString());
            remaining--;
            sb.append(remaining > 0 ? ", " : " ");
        }
        sb.append(close);
    }

    /**
     * Orders nodes by key. Nodes without a key sort before nodes with a key
     * and compare equal to each other, so sorting never fails on key-less
     * children.
     */
    @Override
    public int compareTo(Node other) {
        if (this.key == null) {
            return other.key == null ? 0 : -1;
        }
        if (other.key == null) {
            return 1;
        }
        return this.key.compareTo(other.key);
    }

    /** @return one indent unit per depth level */
    private String getIndentSpace() {
        return String.join("", Collections.nCopies(this.depth, " "));
    }

    public String getKey() {
        return key;
    }

    public void setKey(String key) {
        this.key = key;
    }

    public Node getParent() {
        return parent;
    }

    public void setParent(Node parent) {
        this.parent = parent;
    }

    public List<Node> getValueList() {
        return valueList;
    }
}
节点树构建类,顾名思义,此类就是用来构建Node树的:
package com.heyang;

import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Builds a {@link Node} tree from the token list produced by the lexer.
 *
 * <p>The builder walks the tokens once. Text and colon tokens are buffered
 * until a comma or a closing bracket is reached; the buffer is then turned
 * into a keyed scalar node or, where allowed, a key-less value node. An
 * opening brace or bracket creates a container node and descends into it.
 *
 * @author heyang
 *
 * 2020-05-18
 */
public class Builder {
    /**
     * A complete double-quoted string literal, escape sequences included.
     * Only such a token is accepted as the key of a key/value pair.
     */
    private static final Pattern QUOTED_KEY = Pattern.compile("\"(?:[^\"\\\\]|\\\\.)*\"");

    private Node root;
    private int index;
    private List<Token> tokens;

    /**
     * Builds the tree immediately.
     *
     * @param tokens the tokens of one JSON document; reading starts at index 1,
     *               i.e. the first token is skipped (presumably the opening
     *               brace of the root object, which the root node stands for)
     */
    public Builder(List<Token> tokens) {
        this.tokens = tokens;
        this.index = 1;
        this.root = new Node(null, Node.Type_List);
        addSubNode2(this.root);
    }

    /**
     * Add branch/leaf to parent node.
     *
     * <p>Consumes tokens until the container represented by {@code parent} is
     * closed or the tokens run out. Closing a nested container returns to the
     * caller, which is the invocation handling the enclosing container, so the
     * recursion depth follows the nesting depth of the document rather than
     * the total number of containers. A closing token seen at the root does
     * not end the walk; remaining tokens are still attached to the root.
     *
     * @param parent the container receiving the nodes; {@code null} is ignored
     */
    private void addSubNode2(Node parent) {
        if (parent == null) {
            return;
        }

        Stack<Token> pending = new Stack<Token>();

        while (index < this.tokens.size()) {
            Token token = tokens.get(index);
            index++;

            switch (token.getType()) {
            case Token.TYPE_OPEN_BRACE: // {
                openContainer(parent, pending, Node.Type_List);
                break;
            case Token.TYPE_OPEN_BRACKET: // [
                openContainer(parent, pending, Node.Type_Array);
                break;
            case Token.TYPE_CLOSE_BRACE: // }
                // Buffered text that is not a key/value pair is discarded here.
                flushPending(parent, pending, false);
                if (parent.getParent() != null) {
                    return;
                }
                break;
            case Token.TYPE_CLOSE_BRACKET: // ]
                flushPending(parent, pending, true);
                if (parent.getParent() != null) {
                    return;
                }
                break;
            case Token.TYPE_COMMA: // ,
                flushPending(parent, pending, true);
                break;
            default: // text or colon: keep until the next separator
                pending.push(token);
                break;
            }
        }
    }

    /**
     * Creates a container node of the given type under {@code parent}, keyed
     * by the buffered "key :" tokens if the two most recent ones have that
     * shape, then reads the container's content.
     */
    private void openContainer(Node parent, Stack<Token> pending, int type) {
        Node container = new Node(null, type);

        if (pending.size() >= 2) {
            Token colonToken = pending.pop();
            Token keyToken = pending.pop();

            if (colonToken.getType() == Token.TYPE_COLON && keyToken.getType() == Token.TYPE_TEXT) {
                container.setKey(keyToken.getText());
            }
        }

        parent.addChild(container);
        addSubNode2(container);

        // Whatever was buffered before the container belongs to it, not to the
        // tokens that follow its closing bracket.
        pending.clear();
    }

    /**
     * Converts the buffered tokens into one child of {@code parent} and
     * empties the buffer.
     *
     * @param allowBareValue whether buffered text that is not a key/value pair
     *                       becomes a key-less value node; when {@code false}
     *                       such text is dropped
     */
    private void flushPending(Node parent, Stack<Token> pending, boolean allowBareValue) {
        if (!pending.isEmpty()) {
            if (isKeyValue(pending)) {
                String key = pending.get(0).getText();
                parent.addChild(new Node(key, joinText(pending, 2)));
            } else if (allowBareValue) {
                parent.addChild(new Node(null, joinText(pending, 0)));
            }
        }
        pending.clear();
    }

    /**
     * @return {@code true} if the buffer has the shape: quoted key, colon, and
     *         at least one value token
     */
    private boolean isKeyValue(Stack<Token> pending) {
        if (pending.size() < 3) {
            return false;
        }

        Token first = pending.get(0);
        Token second = pending.get(1);
        if (first.getType() != Token.TYPE_TEXT || second.getType() != Token.TYPE_COLON) {
            return false;
        }

        Matcher keyMatcher = QUOTED_KEY.matcher(first.getText());
        return keyMatcher.matches();
    }

    /** Concatenates the texts of the buffered tokens starting at {@code from}. */
    private String joinText(Stack<Token> pending, int from) {
        StringBuilder sb = new StringBuilder();
        for (int i = from; i < pending.size(); i++) {
            sb.append(pending.get(i).getText());
        }
        return sb.toString();
    }

    /** @return the root node of the built tree */
    public Node getRoot() {
        return root;
    }
}
入口类,这个类将以上四个类串联起来调用:
package com.heyang; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.heyang.util.BracketsBalanceChecker; import com.heyang.util.CommonUtil; public class EntryPoint { private final static Logger log = LoggerFactory.getLogger(EntryPoint.class); public static void main(String[] args){ log.info("JsonAnalyzer started."); if(args.length<1) { log.error("Please set json file path in arguments."); } String filePath=args[0]; log.info("Begin to read file:'{}'",filePath); try { // Read context from file String jsonTxt=CommonUtil.readTextFromFile(filePath); log.info("Raw json text= {}",jsonTxt); // Check brackets balance BracketsBalanceChecker bbc=new BracketsBalanceChecker(); boolean balanced=bbc.isBracketsBalanced(jsonTxt); if(balanced) { log.info("The brackets in read content are balanced."); } // Parse json to tokens Lexer l=new Lexer(jsonTxt); log.info("Compact json text= {}",l.getCompactJsonTxt()); // Build json node tree Builder b=new Builder(l.getTokenList()); Node root=b.getRoot(); log.info("Formatted json= {}",root.toString()); }catch(Exception ex) { log.error(ex.getMessage()); }finally { log.info("JsonAnalyzer end."); } } }
最后的感悟:
我在开发生涯中不止一次地遇到复杂的文本解析任务.做文本解析,第一反应往往是《编译原理》的那些东西,但笔者不是计算机科班出身,把网上推荐的龙书虎书鲸书买来一看头都要炸了,开发更是在一番斗争后搁置或是简化了. 多次后我终于想到,做文本解析并非一定要先啃下那些大部头书,用递归下降一样能完成任务,适合我的才是最好的.我先用递归下降的方式解析文本,由易到难,再辅以看书,学习编译原理和完成任务就可以并行了.这比啃不动书而止步不前要强得多.编程就是这样,能动手才算真正掌握一门技能, 学东西就该学明白,钻透!纸上得来一知半解,最终还是要重新夯实!
--2020年5月18日--