• 未登录词识别


    未登录词:指不在词典中的词,主要有以下两类:
    ---新词:杀马特
    ---命名实体:奥克兰


    主要解决方案:基于规则合词,然后通过百度验证。

    Start Char Char    1-2-Combine    #[图 n][里 f][市场 n][站 n]
    Start Char Char Char    1-3-Combine    #
    Start Char Char Char Char    1-4-Combine    #
    Start Char Char Char Char Char    1-5-Combine    #
    Start Char Char Char Char Char Char    1-6-Combine    #
    Start Direction Char    1-2-Combine    #东澳站 南势站
    Start Char Word    1-2-Combine    #[台 j][中港 nz][站 n]
    Word Char Keyword    0-1-Combine    #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
    Char Char Keyword    0-1-Combine    #[商水县 ns][黄 a][寨 ng][站 n]
    NumPrefix Num    0-1-Seq    #第五医院
    Num NumSuffix    0-1-Seq    #93/号/酒家
    Num Num    0-1-Combine #
    Num Num Num    0-2-Combine #
    Num Num Num Num    0-3-Combine #
    Num Num Num Num Num    0-4-Combine #
    Num Num Num Num Num Num    0-5-Combine #
    Num Num Num Num Num Num Num    0-6-Combine #
    Num Num Num Num Num Num Num Num    0-7-Combine #
    Num Num Num Num Num Num Num Num Num    0-8-Combine #
    Num Num Num Num Num Num Num Num Num Num    0-9-Combine #
    Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-10-Combine    #
    Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-9-Combine    #
    Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-8-Combine    #
    Letter Letter Letter Letter Letter Letter Letter Letter    0-7-Combine    #
    Letter Letter Letter Letter Letter Letter Letter    0-6-Combine    #
    Letter Letter Letter Letter Letter Letter    0-5-Combine    #
    Letter Letter Letter Letter Letter    0-4-Combine    #
    Letter Letter Letter Letter    0-3-Combine    #
    Letter Letter Letter    0-2-Combine    #
    Letter Letter    0-1-Combine    #
    Num NumSuffix Keyword    0-1-Seq    #海口1号场BLACKSTONE球场
    Num Char Char Keyword    0-2-Combine    #八里岔中学
    Char Num Char Keyword    0-2-Combine    #八里岔中学
    Char Char Num Keyword    0-2-Combine    #八里岔中学
    package cn.tianditu.mt.common;
    
    import java.io.BufferedReader;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.LinkedList;
    import java.util.List;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    
    /**
     * Rule-based token combiner.
     *
     * <p>Grammar files are read line by line; each line holds a sequence of
     * {@code SegMarkType} names and, after a tab, the combination rule(s) to
     * apply when that sequence is seen. The sequences are stored in a ternary
     * search tree whose nodes are keyed by {@code SegMarkType}; the node that
     * ends a sequence carries the corresponding {@code CombinRule}.
     * {@link #tag(List)} walks a token list, finds the longest matching
     * sequence at each position and merges the token ranges the rule names.
     */
    public class Grammar {

        protected static Log logger = LogFactory.getLog(Grammar.class);

        /** Node of the ternary search tree. */
        public final class TSTNode {
            /** Rule stored on the node that ends a pattern; {@code null} on other nodes. */
            public CombinRule data = null;
            /** Subtree for keys that compare lower than {@link #splitchar}. */
            protected TSTNode loNode;
            /** Subtree for the next element once {@link #splitchar} has matched. */
            protected TSTNode eqNode;
            /** Subtree for keys that compare higher than {@link #splitchar}. */
            protected TSTNode hiNode;
            /** Key of this node. */
            protected SegMarkType splitchar;

            public TSTNode(SegMarkType type) {
                this.splitchar = type;
            }
        }

        /** Root of the tree; {@code null} until the first pattern is added. */
        public TSTNode rootNode;

        /**
         * Inserts a pattern into the tree, creating nodes as needed.
         *
         * @param word the pattern, one mark type per position
         * @return the node that ends the pattern (callers attach the rule to it)
         * @throws NullPointerException if {@code word} is {@code null}
         * @throws IllegalArgumentException if {@code word} is empty
         */
        public TSTNode add(List<SegMarkType> word) {
            if (null == word) {
                throw new NullPointerException("空指针异常");
            }
            if (word.isEmpty()) {
                // Previously surfaced as an IndexOutOfBoundsException from word.get(0).
                throw new IllegalArgumentException("pattern must contain at least one mark type");
            }

            int charIndex = 0;
            if (null == rootNode) {
                rootNode = new TSTNode(word.get(0));
            }
            TSTNode currentNode = rootNode;
            while (true) {
                SegMarkType key = word.get(charIndex);
                int charComp = key.compareTo(currentNode.splitchar);
                if (charComp == 0) {
                    charIndex++;
                    if (charIndex == word.size()) {
                        return currentNode;
                    }
                    if (null == currentNode.eqNode) {
                        currentNode.eqNode = new TSTNode(word.get(charIndex));
                    }
                    currentNode = currentNode.eqNode;
                } else if (charComp < 0) {
                    if (null == currentNode.loNode) {
                        currentNode.loNode = new TSTNode(key);
                    }
                    currentNode = currentNode.loNode;
                } else {
                    if (null == currentNode.hiNode) {
                        currentNode.hiNode = new TSTNode(key);
                    }
                    currentNode = currentNode.hiNode;
                }
            }
        }

        /**
         * Looks up the node that ends exactly the given pattern.
         *
         * @param word the pattern to look up
         * @return the node reached after matching every element, or {@code null}
         *         if {@code word} is {@code null}, empty, or not present in the tree
         */
        protected TSTNode getNode(List<SegMarkType> word) {
            if (null == word) {
                return null;
            }
            int len = word.size();
            if (len == 0) {
                return null;
            }
            TSTNode currentNode = rootNode; // current position in the tree
            int charIndex = 0; // position in the key of the element being compared
            SegMarkType cmpChar = word.get(charIndex);
            while (currentNode != null) {
                int charComp = cmpChar.compareTo(currentNode.splitchar);
                if (charComp == 0) { // equal: descend
                    charIndex++;
                    if (charIndex == len) { // whole key matched
                        return currentNode;
                    }
                    cmpChar = word.get(charIndex); // advance in the key
                    currentNode = currentNode.eqNode;
                } else if (charComp < 0) { // lower: go left
                    currentNode = currentNode.loNode;
                } else { // higher: go right
                    currentNode = currentNode.hiNode;
                }
            }
            return null; // not found
        }

        /**
         * Finds the longest pattern in the tree that matches the token types
         * starting at {@code offset}.
         *
         * @param tokens the token list
         * @param offset index of the first token to match
         * @return the last rule-carrying node reached together with the index
         *         just past the matched tokens, or {@code null} when nothing
         *         matches, the tree is empty, {@code tokens} is {@code null},
         *         or {@code offset} is outside the list
         */
        public MatchRet matchLong(List<WordInfo> tokens, int offset) {
            if (tokens == null || rootNode == null) {
                return null;
            }
            if (offset < 0 || offset >= tokens.size()) {
                // Nothing to match; previously this threw IndexOutOfBoundsException.
                return null;
            }

            MatchRet ret = null;
            TSTNode currentNode = rootNode;
            int index = offset;
            while (currentNode != null) {
                int charComp = tokens.get(index).getType().compareTo(
                        currentNode.splitchar);
                if (charComp == 0) {
                    index++;
                    if (currentNode.data != null) {
                        // Remember the longest complete pattern seen so far.
                        ret = new MatchRet(currentNode, index);
                    }
                    if (index == tokens.size()) {
                        return ret;
                    }
                    currentNode = currentNode.eqNode;
                } else if (charComp < 0) {
                    currentNode = currentNode.loNode;
                } else {
                    currentNode = currentNode.hiNode;
                }
            }
            return ret;
        }

        /**
         * Merges token ranges according to the given rules and returns a new
         * list. The input list is not modified, and each merged token receives
         * a copy of the tokens it was built from, so repeated merging stays
         * possible.
         *
         * <p>At each position the first rule in {@code rules} that starts there
         * is applied; rules whose end lies before their start are ignored.
         *
         * @param tokens the source tokens
         * @param rules  absolute ranges to merge; {@code null} means "no rules"
         * @return the merged list, or {@code tokens} itself when {@code rules} is {@code null}
         */
        private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
            if (rules == null) {
                return tokens;
            }
            List<WordInfo> list = new ArrayList<WordInfo>();
            int i = 0;
            while (i < tokens.size()) {
                Combin hit = null;
                for (Combin com : rules) {
                    // Requiring end >= start guarantees the cursor always advances.
                    if (com.getStart() == i && com.getEnd() >= i) {
                        hit = com;
                        break;
                    }
                }
                if (hit == null) {
                    list.add(tokens.get(i));
                    i++;
                    continue;
                }

                int end = hit.getEnd();
                // subList is half-open, hence end + 1. Copy it so the merged
                // token does not hold a live view of the caller's list.
                List<WordInfo> sub = new ArrayList<WordInfo>(tokens.subList(i, end + 1));
                StringBuilder buff = new StringBuilder();
                for (WordInfo wordInfo : sub) {
                    buff.append(wordInfo.getCn());
                }
                list.add(new WordInfo(buff.toString(), null, hit.getType(), sub));
                // Resume after the merged range. The original fell through here
                // and read tokens.get(end + 1), which failed when the merge
                // ended on the last token.
                i = end + 1;
            }
            return list;
        }

        /**
         * Merges token ranges in place. Only a single pass is supported: merged
         * tokens do not keep their source tokens, so nested merging (finite
         * state machine behaviour) is not possible.
         *
         * <p>Rules are applied from the highest start index to the lowest so
         * that removing tokens for one rule does not shift the indices of the
         * rules still to be applied. Rules are expected not to overlap.
         *
         * @param tokens the token list to modify
         * @param rules  absolute ranges to merge
         */
        @SuppressWarnings("unused")
        private void CombineOnce(LinkedList<WordInfo> tokens,
                List<Combin> rules) {
            if (tokens == null || rules == null) {
                return;
            }

            List<Combin> ordered = new ArrayList<Combin>(rules);
            java.util.Collections.sort(ordered, new java.util.Comparator<Combin>() {
                public int compare(Combin a, Combin b) {
                    int sa = a.getStart();
                    int sb = b.getStart();
                    return sa < sb ? 1 : (sa == sb ? 0 : -1); // descending by start
                }
            });

            for (Combin com : ordered) {
                int start = com.getStart();
                int end = com.getEnd();
                if (end < start) {
                    continue; // nothing to merge
                }

                List<WordInfo> span = tokens.subList(start, end + 1);
                StringBuilder buff = new StringBuilder();
                for (WordInfo word : span) {
                    buff.append(word.getCn());
                }
                span.clear(); // removes the range from tokens

                tokens.add(start, new WordInfo(buff.toString(), null, com.getType()));
            }
        }

        /**
         * Applies the loaded grammar to a token list.
         *
         * <p>Scans left to right; at each position the longest matching pattern
         * is taken, its rule's relative ranges are converted to absolute token
         * indices, and scanning resumes after the match. All collected ranges
         * are then merged in one pass.
         *
         * @param tokens the tokens to tag
         * @return a new merged token list, or {@code null} when {@code tokens}
         *         is {@code null} or no grammar has been loaded
         */
        public List<WordInfo> tag(List<WordInfo> tokens) {
            if (tokens == null || rootNode == null) {
                return null;
            }
            List<Combin> rules = new ArrayList<Combin>();
            int i = 0;
            while (i < tokens.size()) {
                MatchRet ret = matchLong(tokens, i);
                if (null == ret) {
                    i++;
                    continue;
                }
                CombinRule rule = ret.getNode().data; // rule stored on the matched node
                // ret.getIndex() is one past the last matched token, so this is
                // the index of the first matched token (assuming getLen() is
                // the pattern length given to the CombinRule constructor).
                int matchStart = ret.getIndex() - rule.getLen();
                for (Combin com : rule.getPosition()) {
                    int start = matchStart + com.getStart();
                    int end = matchStart + com.getEnd();
                    rules.add(new Combin(start, end, com.getType())); // absolute range
                }
                i = ret.getIndex();
            }
            return combineByRules(tokens, rules); // merge according to the collected rules
        }

        /**
         * Builds a grammar from the basic grammar file followed by the
         * additional grammar file named in {@code config}.
         *
         * @param config supplies the two grammar file names
         */
        public Grammar(Config config) {
            loadGrammar(config.getBasicGramFileName());
            loadGrammar(config.getGramFileName());
        }

        /**
         * Loads rules from a grammar file into the tree.
         *
         * <p>Each non-blank line is split on tab characters: the first field is
         * the pattern (see {@link #FormSeq(String)}), the second the rule (see
         * {@link #FormRule(String, int)}); further fields are ignored. A line
         * that cannot be parsed is logged and skipped so that one bad line does
         * not discard the rest of the file. I/O failures are logged, not thrown.
         *
         * @param gramFileName path of the grammar file
         */
        public void loadGrammar(String gramFileName) {
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(gramFileName));
                String line;
                int lineNo = 0;
                while ((line = reader.readLine()) != null) {
                    lineNo++;
                    if (line.trim().length() == 0) {
                        continue;
                    }
                    String[] arr = line.split("\t");
                    if (arr.length < 2) {
                        logger.warn("Skipping grammar line " + lineNo + " of "
                                + gramFileName + " (no tab-separated rule): " + line);
                        continue;
                    }
                    try {
                        List<SegMarkType> seq = FormSeq(arr[0]);
                        // Parse the rule before touching the tree so a bad rule
                        // leaves no half-added pattern behind.
                        CombinRule rule = FormRule(arr[1], seq.size());
                        TSTNode node = this.add(seq);
                        node.data = rule;
                    } catch (IllegalArgumentException e) {
                        logger.warn("Skipping grammar line " + lineNo + " of "
                                + gramFileName + ": " + line, e);
                    } catch (NullPointerException e) {
                        logger.warn("Skipping grammar line " + lineNo + " of "
                                + gramFileName + ": " + line, e);
                    }
                }
            } catch (FileNotFoundException e) {
                logger.error("Grammar file not found: " + gramFileName, e);
            } catch (IOException e) {
                logger.error("Failed to read grammar file: " + gramFileName, e);
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        logger.warn("Failed to close grammar file: " + gramFileName, e);
                    }
                }
            }
        }

        /**
         * Parses the rule field of a grammar line.
         *
         * <p>The field holds one or more {@code start-end-Type} segments
         * separated by {@code #}; {@code start} and {@code end} are inclusive
         * positions within the pattern and {@code Type} is a
         * {@code SegMarkType} name.
         *
         * @param line the rule field
         * @param size the number of elements in the pattern the rule belongs to
         * @return the parsed rule
         * @throws IllegalArgumentException if a segment is malformed, names an
         *         unknown type, or has a range outside {@code [0, size)}
         */
        private CombinRule FormRule(String line, int size) {
            List<Combin> rec = new ArrayList<Combin>();
            for (String segment : line.split("#")) {
                String[] parts = segment.split("-");
                if (parts.length < 3) {
                    throw new IllegalArgumentException(
                            "rule segment must look like start-end-Type: '" + segment + "'");
                }
                int start = Integer.parseInt(parts[0].trim());
                int end = Integer.parseInt(parts[1].trim());
                if (start < 0 || end < start || end >= size) {
                    throw new IllegalArgumentException("rule range " + start + "-" + end
                            + " is outside a pattern of length " + size);
                }
                SegMarkType type = Enum.valueOf(SegMarkType.class, parts[2].trim());
                rec.add(new Combin(start, end, type));
            }
            return new CombinRule(rec, size);
        }

        /**
         * Parses the pattern field of a grammar line: whitespace-separated
         * {@code SegMarkType} names.
         *
         * @param string the pattern field
         * @return the mark types in order
         * @throws IllegalArgumentException if a name is not a {@code SegMarkType} constant
         */
        private List<SegMarkType> FormSeq(String string) {
            List<SegMarkType> list = new ArrayList<SegMarkType>();
            // Trim and split on whitespace runs so stray or repeated spaces do
            // not produce empty names.
            for (String name : string.trim().split("\\s+")) {
                list.add(Enum.valueOf(SegMarkType.class, name));
            }
            return list;
        }

    }
  • 相关阅读:
    学生管理系统
    Selenium元素定位的30种方式
    python-- 多进程
    python 多线程的实现
    python 节省内存的for循环技巧
    python 生成器
    python 字符串编码检测
    opencv-python 图片的几何变换
    opencv-python --图像处理
    目标检测
  • 原文地址:https://www.cnblogs.com/i80386/p/3965091.html
Copyright © 2020-2023  润新知