从上篇《Java日期时间API系列39-----中文语句中的时间语义识别(time NLP 输入一句话,能识别出话里的时间)原理分析》中得知,解析的主要步骤分为三步:
(1)加载正则文件
(2)解析中文语句中的所有时间词语
(3)根据基准时间,循环解析(2)中的时间词语。
下面结合代码分析一下。
1.加载正则文件
(1)正则文件介绍:
TimeRegex.Gzip(原项目中名称为TimeExp.m)是所有解析识别的基础。解压后查看可以看到文件内部为大量正则表达式,部分截图如下:
(2)单例加载
public class TextAnalysis { private static volatile TextAnalysis instance; private static Pattern pattern; private boolean isPreferFuture; private TextAnalysis(){ try { pattern = RegexResourceUtil.readModel("TimeRegex.Gzip"); isPreferFuture = true; } catch (Exception e) { e.printStackTrace(); } } public static TextAnalysis getInstance(){ if(instance == null){ synchronized(TextAnalysis.class){ if(instance == null){ instance = new TextAnalysis(); } } } return instance; } } //RegexResourceUtil.readModel(String) /** * 获取Pattern * @param fileName 文件名称 * @return Pattern 正则对象 * @throws Exception 异常 */ public static Pattern readModel(String fileName) throws Exception { try(InputStream resourceAsStream = RegexResourceUtil.class.getClassLoader().getResourceAsStream(fileName)){ ObjectInputStream in = new ObjectInputStream( new BufferedInputStream(new GZIPInputStream((resourceAsStream)))); Pattern p = (Pattern) in.readObject(); return Pattern.compile(p.pattern()); } }
2.解析中文语句中的所有时间词语
/** * 根据正则集合识别出时间词语 * @param text 待处理文本 * @return 时间词语 */ public List<String> analysis(String text){ Matcher match; int startline = -1, endline = -1; List<String> tempResult = new ArrayList<>(); tempResult.add(""); int rpointer = 0;// 计数器,记录当前识别到哪一个字符串了 match = pattern.matcher(text); boolean startmark = true; while (match.find()) { startline = match.start(); if (endline == startline) // 假如下一个识别到的时间字段和上一个是相连的 @author kexm { rpointer--; tempResult.set(rpointer, tempResult.get(rpointer) + match.group());// 则把下一个识别到的时间字段加到上一个时间字段去 } else { if (!startmark) { rpointer--; rpointer++; } startmark = false; tempResult.set(rpointer, match.group());// 记录当前识别到的时间字段,并把startmark开关关闭。这个开关貌似没用? } endline = match.end(); rpointer++; if((tempResult.size()-1)<rpointer){ tempResult.add(""); } } if (rpointer > 0) { rpointer--; rpointer++; } return tempResult; }
3.根据基准时间,循环解析(2)中的时间词语。
/** * 时间表达式单元构造方法 * 该方法作为时间表达式单元的入口,将时间表达式字符串传入 * * @param timeExpression 时间表达式字符串 * @param textAnalysis 正则文件分析类 * @param timePoint 上下文时间 */ public TimeNLP(String timeExpression, TextAnalysis textAnalysis, TimeContext timePoint) { this.timeExpression = timeExpression; this.textAnalysis = textAnalysis; this.timeContextOrigin = timePoint; timeNormalization(); } /** * 时间表达式规范化的入口 * <p> * 时间表达式识别后,通过此入口进入规范化阶段, * 具体识别每个字段的值 */ private void timeNormalization() { //标准时间解析 LocalDateTime localDateTime = normStandardTime(); if(localDateTime == null){ normYear(); normMonth(); normDay(); normMonthFuzzyDay();/**add by kexm*/ normBaseRelated(); normBaseTimeRelated(); normCurRelated(); normHour(); normMinute(); normSecond(); normTotal(); modifyTimeBase(); localDateTime = LocalDateTime.of(1970, 1, 1, 0, 0); } String[] timeGrid = new String[6]; timeGrid = timeContextOrigin.getTimeBase().split("-"); int tunitpointer = 5; while (tunitpointer >= 0 && timeContext.getTunit()[tunitpointer] < 0) { tunitpointer--; } for (int i = 0; i < tunitpointer; i++) { if (timeContext.getTunit()[i] < 0) timeContext.getTunit()[i] = Integer.parseInt(timeGrid[i]); } String[] resultTmp = new String[6]; resultTmp[0] = String.valueOf(timeContext.getTunit()[0]); if (timeContext.getTunit()[0] >= 10 && timeContext.getTunit()[0] < 100) { resultTmp[0] = "19" + String.valueOf(timeContext.getTunit()[0]); } if (timeContext.getTunit()[0] > 0 && timeContext.getTunit()[0] < 10) { resultTmp[0] = "200" + String.valueOf(timeContext.getTunit()[0]); } for (int i = 1; i < 6; i++) { resultTmp[i] = String.valueOf(timeContext.getTunit()[i]); } if (Integer.parseInt(resultTmp[0]) != -1) { timeNorm += resultTmp[0] + "年"; localDateTime = localDateTime.withYear(Integer.valueOf(resultTmp[0])); if (Integer.parseInt(resultTmp[1]) != -1) { timeNorm += resultTmp[1] + "月"; localDateTime = localDateTime.withMonth(Integer.valueOf(resultTmp[1])); if (Integer.parseInt(resultTmp[2]) != -1) { timeNorm += resultTmp[2] + "日"; localDateTime = 
localDateTime.withDayOfMonth(Integer.valueOf(resultTmp[2])); if (Integer.parseInt(resultTmp[3]) != -1) { timeNorm += resultTmp[3] + "时"; localDateTime = localDateTime.withHour(Integer.valueOf(resultTmp[3])); if (Integer.parseInt(resultTmp[4]) != -1) { timeNorm += resultTmp[4] + "分"; localDateTime = localDateTime.withMinute(Integer.valueOf(resultTmp[4])); if (Integer.parseInt(resultTmp[5]) != -1) { timeNorm += resultTmp[5] + "秒"; localDateTime = localDateTime.withSecond(Integer.valueOf(resultTmp[5])); } } } } } } timeContextOrigin.setTunit(timeContext.getTunit().clone()); timeContext.setTimeBase(timeContextOrigin.getTimeBase()); timeContext.setOldTimeBase(timeContextOrigin.getOldTimeBase()); time = DateTimeConverterUtil.toDate(localDateTime); timeNormFormat = DateTimeFormatterUtil.format(localDateTime, DateTimeFormatterUtil.YYYY_MM_DD_HH_MM_SS_FMT); } //下面只举例 年的识别 /** * 年-规范化方法 * <p> * 该方法识别时间表达式单元的年字段 */ private void normYear() { /**假如只有两位数来表示年份*/ Pattern pattern = RegexEnum.NormYearTwo.getPattern(); Matcher match = pattern.matcher(timeExpression); if (match.find()) { timeContext.getTunit()[0] = Integer.parseInt(match.group()); if (timeContext.getTunit()[0] >= 0 && timeContext.getTunit()[0] < 100) { if (timeContext.getTunit()[0] < 30) /**30以下表示2000年以后的年份*/ timeContext.getTunit()[0] += 2000; else/**否则表示1900年以后的年份*/ timeContext.getTunit()[0] += 1900; } } /**不仅局限于支持1XXX年和2XXX年的识别,可识别三位数和四位数表示的年份*/ pattern = RegexEnum.NormYearFour.getPattern(); match = pattern.matcher(timeExpression); if (match.find())/**如果有3位数和4位数的年份,则覆盖原来2位数识别出的年份*/ { timeContext.getTunit()[0] = Integer.parseInt(match.group()); } }
timenlp相关代码仍有很多需要不断优化的地方,欢迎参与。