• java基于dfa实现敏感词过滤


    java基于dfa实现敏感词过滤

    在实现文字过滤的算法中,DFA是比较好的实现算法。DFA 即 Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。

    DFA算法参考资料1
    DFA算法参考资料2

    • 准备词库
    台独
    

    略。。。。。
    可下载开源的词库 词库

    • 初始化敏感词库
    package cn.pconline.pcloud.admin.dfa;
    
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.util.*;
    
    /**
     * @Description 初始化敏感词库,将敏感词加入到HashMap中,构建DFA算法模型
     * @Author jie.zhao
     * @Date 2020/3/24 19:11
     */
    public class SensitiveWordInit {
        private String ENCODING = "UTF8";
        private String filePath;
        public HashMap sensitiveWordMap;
    
        public SensitiveWordInit(String filePath) {
            this.filePath = filePath;
        }
    
        /**
         * 初始化词库
    	 * @return
         */
        public Map initKeyWord() {
            try {
                //读取敏感词库
                Set<String> keyWordSet = readSensitiveWordFile();
                //将敏感词库加入到HashMap中
                addSensitiveWordToHashMap(keyWordSet);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return sensitiveWordMap;
        }
    
        /**
         * 读取敏感词库,将敏感词放入HashSet中,
         * 构建一个DFA算法模型
    	 * 中 = {
    	 *      isEnd = 0
    	 *      国 = {<br>
    	 *      	 isEnd = 1
    	 *           人 = {isEnd = 0
    	 *                民 = {isEnd = 1}
    	 *                }
    	 *           男  = {
    	 *           	   isEnd = 0
    	 *           		人 = {
    	 *           			 isEnd = 1
    	 *           			}
    	 *           	}
    	 *           }
    	 *      }
    	 *  五 = {
    	 *      isEnd = 0
    	 *      星 = {
    	 *      	isEnd = 0
    	 *      	红 = {
    	 *              isEnd = 0
    	 *              旗 = {
    	 *                   isEnd = 1
    	 *                  }
    	 *              }
    	 *      	}
    	 *      }
         * @param keyWordSet
         */
        private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
            //初始化敏感词容器,减少扩容操作
            sensitiveWordMap = new HashMap(keyWordSet.size());
            String key;
            Map nowMap;
            Map<String, String> newWorMap;
            //迭代keyWordSet
            Iterator<String> iterator = keyWordSet.iterator();
            while (iterator.hasNext()) {
                key = iterator.next();    //关键字
                nowMap = sensitiveWordMap;
                for (int i = 0; i < key.length(); i++) {
                    char keyChar = key.charAt(i);
                    //转换成char型
                    Object wordMap = nowMap.get(keyChar);
                    if (wordMap != null) {
                        //如果存在该key,直接赋值
                        nowMap = (Map) wordMap;
                    } else {
                        //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
                        newWorMap = new HashMap<String, String>();
                        newWorMap.put("isEnd", "0");
                        //不是最后一个
                        nowMap.put(keyChar, newWorMap);
                        nowMap = newWorMap;
                    }
                    if (i == key.length() - 1) {
                        //最后一个
                        nowMap.put("isEnd", "1");
                    }
                }
            }
        }
    
        /**
         * 读取敏感词库中的内容,将内容添加到set集合中
         *
         * @return
         * @throws Exception
         */
        private Set<String> readSensitiveWordFile() throws Exception {
            Set<String> set;
            InputStream inputStream = new FileInputStream(filePath);
            InputStreamReader read = null;
            try {
                if (inputStream != null) {
                	//文件流是否存在
                    read = new InputStreamReader(inputStream, ENCODING);
                    set = new HashSet<>();
                    BufferedReader bufferedReader = new BufferedReader(read);
                    String txt;
                    while ((txt = bufferedReader.readLine()) != null) {
                    	//读取文件,将文件内容放入到set中
                        set.add(txt);
                    }
                } else {
                    throw new Exception("敏感词库文件不存在");
                }
            } catch (Exception e) {
                throw e;
            } finally {
            	if(read!=null){
    				//关闭文件流
    				read.close();
    			}
            }
            return set;
        }
    }
    
    
    
    • 敏感词过滤
    package cn.pconline.pcloud.admin.dfa;
    
    
    import org.springframework.core.io.ClassPathResource;
    import org.springframework.core.io.Resource;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;
    
    /**
     * @Description Java实现敏感词过滤:http://blog.csdn.net/chenssy/article/details/26961957
     * @Author jie.zhao
     * @Date 2020/3/24 19:11
     */
    public class SensitivewordFilter {
        private Map sensitiveWordMap;
        public static int minMatchTYpe = 1;      //最小匹配规则
        public static int maxMatchType = 2;      //最大匹配规则
        private static SensitivewordFilter instance = new SensitivewordFilter();
    
        /**
         * 私有化构造方法
         *  Words.txt词库文件在resource下
         */
        private SensitivewordFilter() {
            try {
                Resource resource = new ClassPathResource("Words.txt");
                File file = resource.getFile();
                sensitiveWordMap = new SensitiveWordInit(file.getAbsolutePath()).initKeyWord();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 单例模式只初始化一次
         * @return
         */
        public static SensitivewordFilter getInstance() {
            return instance;
        }
    
        /**
         * 判断文字是否包含敏感字符
         *
         * @param txt       文字
         * @param matchType 匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
         * @return 若包含返回true,否则返回false
         */
        public boolean isContaintSensitiveWord(String txt, int matchType) {
            boolean flag = false;
            for (int i = 0; i < txt.length(); i++) {
                //判断是否包含敏感字符
                int matchFlag = this.CheckSensitiveWord(txt, i, matchType);
                if (matchFlag > 0) {
                    //大于0存在,返回true
                    flag = true;
                }
            }
            return flag;
        }
    
        /**
         * 获取文字中的敏感词
         *
         * @param txt       文字
         * @param matchType 匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
         *                  比如说我是中国人,铭感词库里面有“中国”、“中国人”,最小规则就变成了:我是**人、最大规则就是:我是***。
         *                  所以最小规则就是:找到敏感词就结束,最大规则就是:找到最底层的那个敏感词。两个深度不一样!
         * @return
         */
        public Set<String> getSensitiveWord(String txt, int matchType) {
            Set<String> sensitiveWordList = new HashSet<>();
            for (int i = 0; i < txt.length(); i++) {
                //判断是否包含敏感字符
                int length = CheckSensitiveWord(txt, i, matchType);
                if (length > 0) {
                    //存在,加入list中
                    sensitiveWordList.add(txt.substring(i, i + length));
                    //减1的原因,是因为for会自增
                    i = i + length - 1;
                }
            }
            return sensitiveWordList;
        }
    
        /**
         * 替换敏感字字符
         *
         * @param txt
         * @param matchType
         * @param replaceChar 替换字符,默认*
         */
        public String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
            String resultTxt = txt;
            //获取所有的敏感词
            Set<String> set = getSensitiveWord(txt, matchType);
            Iterator<String> iterator = set.iterator();
            String word;
            String replaceString;
            while (iterator.hasNext()) {
                word = iterator.next();
                replaceString = getReplaceChars(replaceChar, word.length());
                resultTxt = resultTxt.replaceAll(word, replaceString);
            }
    
            return resultTxt;
        }
    
        /**
         * 获取替换字符串
         *
         * @param replaceChar
         * @param length
         * @return
         */
        private String getReplaceChars(String replaceChar, int length) {
            String resultReplace = replaceChar;
            for (int i = 1; i < length; i++) {
                resultReplace += replaceChar;
            }
            return resultReplace;
        }
    
        /**
         * 检查文字中是否包含敏感字符,检查规则如下:<br>
         *
         * @param txt
         * @param beginIndex
         * @param matchType
         * @return,如果存在,则返回敏感词字符的长度,不存在返回0
         */
        @SuppressWarnings({"rawtypes"})
        public int CheckSensitiveWord(String txt, int beginIndex, int matchType) {
            //敏感词结束标识位:用于敏感词只有1位的情况
            boolean flag = false;
            //匹配标识数默认为0
            int matchFlag = 0;
            char word = 0;
            Map nowMap = sensitiveWordMap;
            for (int i = beginIndex; i < txt.length(); i++) {
                word = txt.charAt(i);
                //获取指定key
                nowMap = (Map) nowMap.get(word);
                if (nowMap != null) {
                    //存在,则判断是否为最后一个
                    matchFlag++;
                    //找到相应key,匹配标识+1
                    if ("1".equals(nowMap.get("isEnd"))) {
                        //如果为最后一个匹配规则,结束循环,返回匹配标识数
                        flag = true;
                        //结束标志位为true
                        if (SensitivewordFilter.minMatchTYpe == matchType) {
                            //最小规则,直接返回,最大规则还需继续查找
                            break;
                        }
                    }
                } else {
                    //不存在,直接返回
                    break;
                }
            }
            if (matchFlag < 2 || !flag) {
                //长度必须大于等于1,为词
                matchFlag = 0;
            }
            return matchFlag;
        }
    
        public static void main(String[] args) {
            long beginTime = System.currentTimeMillis();
            SensitivewordFilter filter = SensitivewordFilter.getInstance();
            System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size());
            String string = "台独积极分子";
    
            System.out.println("被检测字符串长度:" + string.length());
            String newStr = filter.replaceSensitiveWord(string, 2, "*");
            long endTime = System.currentTimeMillis();
            Set<String> set = filter.getSensitiveWord(string, 1);
            System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
            System.out.println("总共耗时:" + (endTime - beginTime) + "ms");
            System.out.println("替换后的字符串为:
    " + newStr);
        }
    }
    
    
    
    • 运行结果
    敏感词的数量:1894
    被检测字符串长度:6
    语句中包含敏感词的个数为:1。包含:[台独]
    总共耗时:149ms
    替换后的字符串为:
    **积极分子
    
  • 相关阅读:
    Codeforces Round #297 (Div. 2) 525C Ilya and Sticks(脑洞)
    全栈必备 JavaScript基础
    2014-04-19编程之美初赛题目及答案解析
    doT js模板入门 2
    一篇关于arc下内存管理的老文章,包含各种冷门修饰符(关于内存),写的较好,mark
    MyBatis官方教程及源代码解析——mapper映射文件
    Android中图片的三级缓存策略
    python字符串/元组/列表/字典互转
    关于字节对齐的理解
    阿里云服务器ecs配置之安装redis服务
  • 原文地址:https://www.cnblogs.com/cnsyear/p/12635401.html
Copyright © 2020-2023  润新知