未登录词识别:不在词典中的词
---新词:杀马特
---命名实体:奥克兰
主要解决方案:基于规则合词,然后通过百度验证。
Start Char Char 1-2-Combine #[图 n][里 f][市场 n][站 n]
Start Char Char Char 1-3-Combine #
Start Char Char Char Char 1-4-Combine #
Start Char Char Char Char Char 1-5-Combine #
Start Char Char Char Char Char Char 1-6-Combine #
Start Direction Char 1-2-Combine #东澳站 南势站
Start Char Word 1-2-Combine #[台 j][中港 nz][站 n]
Word Char Keyword 0-1-Combine #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
Char Char Keyword 0-1-Combine #[商水县 ns][黄 a][寨 ng][站 n]
NumPrefix Num 0-1-Seq #第五医院
Num NumSuffix 0-1-Seq #93/号/酒家
Num Num 0-1-Combine #
Num Num Num 0-2-Combine #
Num Num Num Num 0-3-Combine #
Num Num Num Num Num 0-4-Combine #
Num Num Num Num Num Num 0-5-Combine #
Num Num Num Num Num Num Num 0-6-Combine #
Num Num Num Num Num Num Num Num 0-7-Combine #
Num Num Num Num Num Num Num Num Num 0-8-Combine #
Num Num Num Num Num Num Num Num Num Num 0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-10-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter 0-8-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter 0-7-Combine #
Letter Letter Letter Letter Letter Letter Letter 0-6-Combine #
Letter Letter Letter Letter Letter Letter 0-5-Combine #
Letter Letter Letter Letter Letter 0-4-Combine #
Letter Letter Letter Letter 0-3-Combine #
Letter Letter Letter 0-2-Combine #
Letter Letter 0-1-Combine #
Num NumSuffix Keyword 0-1-Seq #海口1号场BLACKSTONE球场
Num Char Char Keyword 0-2-Combine #八里岔中学
Char Num Char Keyword 0-2-Combine #八里岔中学
Char Char Num Keyword 0-2-Combine #八里岔中学
package cn.tianditu.mt.common;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Rule-driven merger of adjacent segmentation tokens.
 *
 * <p>Each grammar rule is a sequence of {@link SegMarkType} symbols paired with a
 * {@link CombinRule}. The sequences are stored in a ternary search tree
 * ({@link TSTNode}) ordered by {@code SegMarkType.compareTo}; {@link #tag(List)}
 * walks a token list, finds the longest rule matching at each position and
 * merges the token ranges that the matched rules designate.
 */
public class Grammar {
    protected static Log logger = LogFactory.getLog(Grammar.class);

    /**
     * One node of the ternary search tree. {@code loNode}/{@code hiNode} hold
     * symbols that compare lower/higher than {@code splitchar} at the same
     * sequence position; {@code eqNode} continues with the next position.
     */
    public final class TSTNode {
        /** Rule attached to the sequence ending at this node, or null if none ends here. */
        public CombinRule data = null;
        protected TSTNode loNode;
        protected TSTNode eqNode;
        protected TSTNode hiNode;
        protected SegMarkType splitchar;

        public TSTNode(SegMarkType type) {
            this.splitchar = type;
        }
    }

    /** Root of the rule tree; null until the first sequence has been added. */
    public TSTNode rootNode;

    /**
     * Inserts a symbol sequence into the tree, creating nodes as needed.
     *
     * @param word the symbol sequence to insert
     * @return the node at which the sequence ends (the caller attaches the rule to it)
     * @throws NullPointerException if {@code word} is null
     * @throws IndexOutOfBoundsException if {@code word} is empty
     */
    public TSTNode add(List<SegMarkType> word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        int charIndex = 0;
        if (null == rootNode) {
            rootNode = new TSTNode(word.get(0));
        }
        TSTNode currentNode = rootNode;
        while (true) {
            int charComp = word.get(charIndex).compareTo(currentNode.splitchar);
            if (charComp == 0) {
                // Symbol matched: advance in the sequence and descend the middle branch.
                charIndex++;
                if (charIndex == word.size()) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Looks up the node at which the given symbol sequence ends.
     *
     * @param word the symbol sequence to look up
     * @return the terminal node, or null if {@code word} is null or empty or the
     *         sequence is not present in the tree
     */
    protected TSTNode getNode(List<SegMarkType> word) {
        if (null == word) {
            return null;
        }
        int len = word.size();
        if (len == 0) {
            return null;
        }
        TSTNode currentNode = rootNode; // current position in the tree during matching
        int charIndex = 0; // position in the key of the symbol being compared
        SegMarkType cmpChar = word.get(charIndex);
        int charComp;
        while (true) {
            if (currentNode == null) { // not found
                return null;
            }
            charComp = cmpChar.compareTo(currentNode.splitchar);
            if (charComp == 0) { // equal: go down
                charIndex++;
                if (charIndex == len) { // found
                    return currentNode;
                } else {
                    cmpChar = word.get(charIndex); // advance within the key
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) { // lower: go left
                currentNode = currentNode.loNode;
            } else { // higher: go right
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Finds the longest rule sequence matching the token types starting at
     * {@code offset}.
     *
     * @param tokens the token list to match against
     * @param offset index of the first token to match
     * @return the last rule-bearing node reached together with the index just
     *         past the matched tokens, or null if nothing matched, if
     *         {@code tokens} or the tree is null, or if {@code offset} is outside
     *         the token list
     */
    public MatchRet matchLong(List<WordInfo> tokens, int offset) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        // An out-of-range start (including an empty token list) cannot match anything.
        if (offset < 0 || offset >= tokens.size()) {
            return null;
        }
        MatchRet ret = null;
        TSTNode currentNode = rootNode;
        int index = offset;
        while (currentNode != null) {
            int charComp = tokens.get(index).getType().compareTo(
                    currentNode.splitchar);
            if (charComp == 0) {
                index++;
                if (currentNode.data != null) {
                    // Remember the match but keep going: a longer rule may still apply.
                    ret = new MatchRet(currentNode, index);
                }
                if (index == tokens.size()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /**
     * Returns the first rule whose range starts at {@code index} and is not
     * empty ({@code end >= start}), or null if there is none.
     */
    private Combin firstRuleStartingAt(List<Combin> rules, int index) {
        for (Combin com : rules) {
            if (com.getStart() == index && com.getEnd() >= index) {
                return com;
            }
        }
        return null;
    }

    /**
     * Merges token ranges according to the given rules.
     * Supports several merges in one pass, and each merged token keeps its
     * source tokens (a {@code subList} view of {@code tokens}).
     *
     * @param tokens the original token list
     * @param rules inclusive {@code [start, end]} ranges, in {@code tokens}
     *              coordinates, to merge; may be null
     * @return a new list with the merged tokens, or {@code tokens} itself when
     *         {@code rules} is null
     */
    private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
        if (rules == null) {
            return tokens;
        }
        List<WordInfo> list = new ArrayList<WordInfo>();
        int i = 0;
        while (i < tokens.size()) {
            Combin com = firstRuleStartingAt(rules, i);
            if (com == null) {
                list.add(tokens.get(i));
                i++;
                continue;
            }
            int end = com.getEnd();
            // subList is half-open, hence end + 1 for the inclusive rule end.
            List<WordInfo> sub = tokens.subList(i, end + 1);
            StringBuilder buff = new StringBuilder();
            for (WordInfo wordInfo : sub) {
                buff.append(wordInfo.getCn());
            }
            String cn = buff.toString();
            SegMarkType type = com.getType();
            WordInfo info = new WordInfo(cn, null, type, sub);
            list.add(info);
            // Resume the outer scan after the merged range. The bounds check and
            // the rule lookup are both redone, so a merge that ends on the last
            // token no longer reads past the end of the list.
            i = end + 1;
        }
        return list;
    }

    /**
     * Merges in place. Supports a single merge only, not nested repeated
     * merges, i.e. it cannot achieve the effect of a finite state machine.
     *
     * <p>NOTE(review): each merge shrinks {@code tokens}, so the start/end of any
     * later rule no longer line up with the list unless the rules are already
     * expressed in post-merge coordinates - confirm before reviving this method.
     *
     * @param tokens the token list, modified in place
     * @param rules inclusive ranges to merge
     */
    @SuppressWarnings("unused")
    private void CombineOnce(LinkedList<WordInfo> tokens,
            List<Combin> rules) {
        for (Combin com : rules) {
            int start = com.getStart();
            int end = com.getEnd();
            SegMarkType type = com.getType();
            StringBuilder buff = new StringBuilder();
            for (int i = start; i <= end; i++) {
                WordInfo word = tokens.get(i);
                buff.append(word.getCn());
            }
            int dis = end - start + 1;
            for (int i = 0; i < dis; i++) {
                tokens.remove(start);
            }
            String cn = buff.toString();
            WordInfo info = new WordInfo(cn, null, type);
            tokens.add(start, info);
        }
    }

    /**
     * Applies the loaded grammar to a token list: finds the longest matching
     * rule at each position, translates the rule's relative ranges into
     * absolute token indexes and merges those ranges.
     *
     * @param tokens the token list to process
     * @return the merged token list, or null if {@code tokens} is null or no
     *         grammar has been loaded
     */
    public List<WordInfo> tag(List<WordInfo> tokens) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        List<Combin> rules = new ArrayList<Combin>();
        for (int i = 0; i < tokens.size();) {
            MatchRet ret = matchLong(tokens, i);
            if (null != ret) {
                CombinRule rule = ret.getNode().data; // the rule found in the tree
                int indexCurrent = ret.getIndex() - 1;
                // Index of the first token covered by the matched sequence.
                int matchStart = indexCurrent - rule.getLen() + 1;
                List<Combin> list_com = rule.getPosition();
                for (Combin com : list_com) {
                    int start = matchStart + com.getStart();
                    int end = matchStart + com.getEnd();
                    Combin c = new Combin(start, end, com.getType()); // absolute range
                    rules.add(c); // collect for the merge pass
                }
                i = ret.getIndex();
            } else {
                i++;
            }
        }
        List<WordInfo> words = combineByRules(tokens, rules); // merge per the rules
        return words;
    }

    /**
     * Builds a grammar from the two rule files named by the configuration,
     * loading the basic file first.
     */
    public Grammar(Config config) {
        loadGrammar(config.getBasicGramFileName());
        loadGrammar(config.getGramFileName());
    }

    /**
     * Loads rules from a grammar file into the tree, one rule per line.
     *
     * <p>Loading is best effort: a missing file, an I/O error, or a line that
     * raises {@code NullPointerException}/{@code IllegalArgumentException} is
     * logged and stops the load; rules read before the failure stay in the tree.
     *
     * @param gramFileName path of the grammar file
     */
    public void loadGrammar(String gramFileName) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(gramFileName));
        } catch (FileNotFoundException e) {
            logLoadFailure(gramFileName, e);
            return;
        }
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // NOTE(review): FormSeq splits arr[0] again on a space, so a
                // multi-symbol sequence can only reach it if the field separator
                // here differs from a plain space (presumably a tab) - confirm
                // the literal below against the grammar files.
                String[] arr = line.split(" ");
                List<SegMarkType> seq = FormSeq(arr[0]);
                CombinRule rule = FormRule(arr[1], seq.size());
                TSTNode node = this.add(seq);
                node.data = rule;
            }
        } catch (NullPointerException e) {
            logLoadFailure(gramFileName, e);
        } catch (IllegalArgumentException e) {
            logLoadFailure(gramFileName, e);
        } catch (IOException e) {
            logLoadFailure(gramFileName, e);
        } finally {
            // Always release the file handle, whether or not loading succeeded.
            try {
                reader.close();
            } catch (IOException e) {
                logger.warn("Failed to close grammar file " + gramFileName, e);
            }
        }
    }

    /** Logs a grammar loading failure together with its stack trace. */
    private void logLoadFailure(String gramFileName, Exception e) {
        logger.error("Failed to load grammar file " + gramFileName + ": "
                + e.getMessage(), e);
    }

    /**
     * Parses the rule field of a grammar line: one or more
     * {@code start-end-Type} entries separated by {@code #}.
     *
     * @param line the rule field
     * @param size number of symbols in the sequence the rule belongs to
     * @return the parsed rule
     * @throws IllegalArgumentException if a number or type name cannot be parsed
     */
    private CombinRule FormRule(String line, int size) {
        List<Combin> rec = new ArrayList<Combin>();
        String[] arr_1 = line.split("#");
        for (String str : arr_1) {
            String[] arr_2 = str.split("-");
            int start = Integer.parseInt(arr_2[0]);
            int end = Integer.parseInt(arr_2[1]);
            SegMarkType type = Enum.valueOf(SegMarkType.class, arr_2[2].trim());
            Combin pos = new Combin(start, end, type);
            rec.add(pos);
        }
        CombinRule rule = new CombinRule(rec, size);
        return rule;
    }

    /**
     * Parses a space-separated list of {@link SegMarkType} names.
     *
     * @param string the symbol sequence field of a grammar line
     * @return the symbols in order
     * @throws IllegalArgumentException if a name is not a {@code SegMarkType} constant
     */
    private List<SegMarkType> FormSeq(String string) {
        List<SegMarkType> list = new ArrayList<SegMarkType>();
        String[] arr = string.split(" ");
        for (String str : arr) {
            SegMarkType type = Enum.valueOf(SegMarkType.class, str);
            list.add(type);
        }
        return list;
    }
}