從上篇 Java日期時間API系列39-----中文語句中的時間語義識別(time NLP 輸入一句話,能識別出話裏的時間)原理分析 中得知解析的主要步驟分爲三步:
(1)加載正則文件
(2)解析中文語句中的所有時間詞語
(3)根據基準時間,循環解析(2)中的時間詞語。
下面結合代碼分析一下。
1.加載正則文件
(1)正則文件介紹:
TimeRegex.Gzip(原項目中名稱爲TimeExp.m)是所有解析識別的基礎。解壓後可以看到文件內部爲大量正則表達式,部分截圖如下:
(2)單例加載
public class TextAnalysis { private static volatile TextAnalysis instance; private static Pattern pattern; private boolean isPreferFuture; private TextAnalysis(){ try { pattern = RegexResourceUtil.readModel("TimeRegex.Gzip"); isPreferFuture = true; } catch (Exception e) { e.printStackTrace(); } } public static TextAnalysis getInstance(){ if(instance == null){ synchronized(TextAnalysis.class){ if(instance == null){ instance = new TextAnalysis(); } } } return instance; } } //RegexResourceUtil.readModel(String) /** * 獲取Pattern * @param fileName 文件名稱 * @return Pattern 正則對象 * @throws Exception 異常 */ public static Pattern readModel(String fileName) throws Exception { try(InputStream resourceAsStream = RegexResourceUtil.class.getClassLoader().getResourceAsStream(fileName)){ ObjectInputStream in = new ObjectInputStream( new BufferedInputStream(new GZIPInputStream((resourceAsStream)))); Pattern p = (Pattern) in.readObject(); return Pattern.compile(p.pattern()); } }
2.解析中文語句中的所有時間詞語
/** * 根據正則集合識別出時間詞語 * @param text 待處理文本 * @return 時間詞語 */ public List<String> analysis(String text){ Matcher match; int startline = -1, endline = -1; List<String> tempResult = new ArrayList<>(); tempResult.add(""); int rpointer = 0;// 計數器,記錄當前識別到哪一個字符串了 match = pattern.matcher(text); boolean startmark = true; while (match.find()) { startline = match.start(); if (endline == startline) // 假如下一個識別到的時間字段和上一個是相連的 @author kexm { rpointer--; tempResult.set(rpointer, tempResult.get(rpointer) + match.group());// 則把下一個識別到的時間字段加到上一個時間字段去 } else { if (!startmark) { rpointer--; rpointer++; } startmark = false; tempResult.set(rpointer, match.group());// 記錄當前識別到的時間字段,並把startmark開關關閉。這個開關貌似沒用? } endline = match.end(); rpointer++; if((tempResult.size()-1)<rpointer){ tempResult.add(""); } } if (rpointer > 0) { rpointer--; rpointer++; } return tempResult; }
3.根據基準時間,循環解析(2)中的時間詞語。
/** * 時間表達式單元構造方法 * 該方法作爲時間表達式單元的入口,將時間表達式字符串傳入 * * @param timeExpression 時間表達式字符串 * @param textAnalysis 正則文件分析類 * @param timePoint 上下文時間 */ public TimeNLP(String timeExpression, TextAnalysis textAnalysis, TimeContext timePoint) { this.timeExpression = timeExpression; this.textAnalysis = textAnalysis; this.timeContextOrigin = timePoint; timeNormalization(); } /** * 時間表達式規範化的入口 * <p> * 時間表達式識別後,通過此入口進入規範化階段, * 具體識別每個字段的值 */ private void timeNormalization() { //標準時間解析 LocalDateTime localDateTime = normStandardTime(); if(localDateTime == null){ normYear(); normMonth(); normDay(); normMonthFuzzyDay();/**add by kexm*/ normBaseRelated(); normBaseTimeRelated(); normCurRelated(); normHour(); normMinute(); normSecond(); normTotal(); modifyTimeBase(); localDateTime = LocalDateTime.of(1970, 1, 1, 0, 0); } String[] timeGrid = new String[6]; timeGrid = timeContextOrigin.getTimeBase().split("-"); int tunitpointer = 5; while (tunitpointer >= 0 && timeContext.getTunit()[tunitpointer] < 0) { tunitpointer--; } for (int i = 0; i < tunitpointer; i++) { if (timeContext.getTunit()[i] < 0) timeContext.getTunit()[i] = Integer.parseInt(timeGrid[i]); } String[] resultTmp = new String[6]; resultTmp[0] = String.valueOf(timeContext.getTunit()[0]); if (timeContext.getTunit()[0] >= 10 && timeContext.getTunit()[0] < 100) { resultTmp[0] = "19" + String.valueOf(timeContext.getTunit()[0]); } if (timeContext.getTunit()[0] > 0 && timeContext.getTunit()[0] < 10) { resultTmp[0] = "200" + String.valueOf(timeContext.getTunit()[0]); } for (int i = 1; i < 6; i++) { resultTmp[i] = String.valueOf(timeContext.getTunit()[i]); } if (Integer.parseInt(resultTmp[0]) != -1) { timeNorm += resultTmp[0] + "年"; localDateTime = localDateTime.withYear(Integer.valueOf(resultTmp[0])); if (Integer.parseInt(resultTmp[1]) != -1) { timeNorm += resultTmp[1] + "月"; localDateTime = localDateTime.withMonth(Integer.valueOf(resultTmp[1])); if (Integer.parseInt(resultTmp[2]) != -1) { timeNorm += resultTmp[2] + "日"; localDateTime = 
localDateTime.withDayOfMonth(Integer.valueOf(resultTmp[2])); if (Integer.parseInt(resultTmp[3]) != -1) { timeNorm += resultTmp[3] + "時"; localDateTime = localDateTime.withHour(Integer.valueOf(resultTmp[3])); if (Integer.parseInt(resultTmp[4]) != -1) { timeNorm += resultTmp[4] + "分"; localDateTime = localDateTime.withMinute(Integer.valueOf(resultTmp[4])); if (Integer.parseInt(resultTmp[5]) != -1) { timeNorm += resultTmp[5] + "秒"; localDateTime = localDateTime.withSecond(Integer.valueOf(resultTmp[5])); } } } } } } timeContextOrigin.setTunit(timeContext.getTunit().clone()); timeContext.setTimeBase(timeContextOrigin.getTimeBase()); timeContext.setOldTimeBase(timeContextOrigin.getOldTimeBase()); time = DateTimeConverterUtil.toDate(localDateTime); timeNormFormat = DateTimeFormatterUtil.format(localDateTime, DateTimeFormatterUtil.YYYY_MM_DD_HH_MM_SS_FMT); } //下面只舉例 年的識別 /** * 年-規範化方法 * <p> * 該方法識別時間表達式單元的年字段 */ private void normYear() { /**假如只有兩位數來表示年份*/ Pattern pattern = RegexEnum.NormYearTwo.getPattern(); Matcher match = pattern.matcher(timeExpression); if (match.find()) { timeContext.getTunit()[0] = Integer.parseInt(match.group()); if (timeContext.getTunit()[0] >= 0 && timeContext.getTunit()[0] < 100) { if (timeContext.getTunit()[0] < 30) /**30以下表示2000年以後的年份*/ timeContext.getTunit()[0] += 2000; else/**否則表示1900年以後的年份*/ timeContext.getTunit()[0] += 1900; } } /**不僅侷限於支持1XXX年和2XXX年的識別,可識別三位數和四位數表示的年份*/ pattern = RegexEnum.NormYearFour.getPattern(); match = pattern.matcher(timeExpression); if (match.find())/**如果有3位數和4位數的年份,則覆蓋原來2位數識別出的年份*/ { timeContext.getTunit()[0] = Integer.parseInt(match.group()); } }