• DFA算法实现敏感词过滤


    DFA算法:即确定有穷自动机(Deterministic Finite Automaton)。简单点说,它是通过event和当前的state得到下一个state,即event+state=nextstate。可以理解为系统中有多个节点,通过传入的event来确定走哪条路由到达另一个节点,而节点的数量是有限的。

    废话不多说,直接贴上代码:

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Sensitive-word filter implemented as a DFA: a character trie stored in nested maps.
     *
     * <p>Every trie node is a {@code Map} whose {@code Character} keys lead to child nodes and
     * whose {@code "isEnd"} key holds {@code "1"} when a word ends on that node and {@code "0"}
     * otherwise. Only characters accepted by {@link #pattern} (ASCII letters, digits and
     * CJK ideographs U+4E00..U+9FA5) are indexed; any other character inside a word is ignored
     * when the word is stored or removed.
     *
     * <p>All state lives in the static {@link #sensitiveWordMap}, a plain {@code HashMap} with no
     * synchronization, so concurrent modification must be coordinated by the caller.
     *
     * @author maojialong
     * @date 2018-01-30 10:59:24
     */
    public class SensitivewordEngine {

        /** Character encoding used when reading the word-list file. */
        private static final String ENCODING = "GBK";

        /** Node key of the end-of-word flag; a String, so it cannot collide with Character child keys. */
        private static final String END_KEY = "isEnd";

        /** Flag value: a word ends on this node. */
        private static final String END_YES = "1";

        /** Flag value: no word ends on this node. */
        private static final String END_NO = "0";

        /** Root of the shared trie. Kept public, raw and non-final for source compatibility. */
        @SuppressWarnings("rawtypes")
        public static Map sensitiveWordMap = new HashMap();

        /** Match type: stop at the shortest word found at a position. */
        public static int minMatchTYpe = 1;

        /** Match type: keep scanning for the longest word found at a position. */
        public static int maxMatchType = 2;

        /**
         * Accepts strings made only of ASCII letters, digits and CJK ideographs.
         * The escapes are doubled so that the regex engine, not the Java compiler, resolves them.
         */
        public static Pattern pattern = Pattern.compile("^[a-zA-Z0-9\\u4E00-\\u9FA5]+$");

        /**
         * Reads the word-list file (one word per line, GBK encoded) and adds every line to the
         * shared trie.
         *
         * @author chenming
         * @date 2014-04-20 14:31:18
         * @throws IOException if the file is missing, is not a regular file, or cannot be read
         */
        private static void readSensitiveWordFile() throws IOException {
            File file = new File("D:\\SensitiveWord.txt");
            // Check before opening so that the caller sees the explicit message below.
            if (!file.isFile()) {
                throw new FileNotFoundException("敏感词库文件不存在");
            }

            Set<String> words = new HashSet<String>();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), ENCODING));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    words.add(line);
                }
            } finally {
                reader.close();   // also closes the wrapped streams
            }
            addNewSensitiveWord(words);
        }

        /**
         * Adds the given words to the shared trie. Words already present are left untouched.
         *
         * @author maojialong
         * @date 2018-02-01 11:55:10
         * @param keyWordSet words to add; {@code null} is treated as an empty set
         */
        public static void addNewSensitiveWord(Set<String> keyWordSet) {
            insertAll(keyWordSet);
        }

        /**
         * Adds the given words to the shared trie.
         *
         * <p>Despite its name this method has always written straight into
         * {@link #sensitiveWordMap} and returned an empty map; that contract is preserved so
         * existing callers keep working. Prefer {@link #addNewSensitiveWord(Set)}.
         *
         * @author maojialong
         * @date 2018-01-30 16:28:58
         * @param keyWordSet words to add; {@code null} is treated as an empty set
         * @return a new, empty map
         */
        @SuppressWarnings("rawtypes")
        public static HashMap getNewSensitiveWordToHashMap(Set<String> keyWordSet) {
            insertAll(keyWordSet);
            return new HashMap();
        }

        /** Inserts every word of the set into the shared trie. */
        private static void insertAll(Set<String> words) {
            if (words == null) {
                return;
            }
            for (String word : words) {
                insertWord(word);
            }
        }

        /** Inserts one word into the shared trie, skipping characters rejected by the pattern. */
        @SuppressWarnings({"rawtypes", "unchecked"})
        private static void insertWord(String word) {
            if (word == null) {
                return;
            }
            Map node = sensitiveWordMap;
            boolean indexedAny = false;
            for (int i = 0; i < word.length(); i++) {
                char c = word.charAt(i);
                if (!isWordChar(c)) {
                    continue;
                }
                Object child = node.get(c);
                if (child == null) {
                    Map created = new HashMap();
                    created.put(END_KEY, END_NO);
                    node.put(c, created);
                    child = created;
                }
                node = (Map) child;
                indexedAny = true;
            }
            // Mark the end on the last node actually reached, so that a word whose final
            // characters were skipped (for example trailing punctuation) is still terminated.
            if (indexedAny) {
                node.put(END_KEY, END_YES);
            }
        }

        /** Tells whether a single character is accepted by {@link #pattern}. */
        private static boolean isWordChar(char c) {
            return pattern.matcher(String.valueOf(c)).matches();
        }

        /**
         * Returns the number of words currently stored in the shared trie.
         *
         * @author maojialong
         * @date 2018-01-30 16:07:20
         * @return the word count, or 0 when the trie is {@code null} or empty
         */
        public static int getWordSize() {
            if (SensitivewordEngine.sensitiveWordMap == null) {
                return 0;
            }
            return countWords(SensitivewordEngine.sensitiveWordMap);
        }

        /** Counts the end-of-word flags in the subtree rooted at the given node. */
        @SuppressWarnings("rawtypes")
        private static int countWords(Map node) {
            int count = END_YES.equals(node.get(END_KEY)) ? 1 : 0;
            for (Object value : node.values()) {
                if (value instanceof Map) {
                    count += countWords((Map) value);
                }
            }
            return count;
        }

        /**
         * Tells whether the text contains at least one sensitive word.
         *
         * @author maojialong
         * @date 2018-01-30 14:47:37
         * @param txt text to scan; {@code null} yields {@code false}
         * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
         * @return {@code true} if a sensitive word starts at any position of the text
         */
        public static boolean isContaintSensitiveWord(String txt, int matchType) {
            if (txt == null) {
                return false;
            }
            for (int i = 0; i < txt.length(); i++) {
                if (checkSensitiveWord(txt, i, matchType) > 0) {
                    return true;   // one hit is enough
                }
            }
            return false;
        }

        /**
         * Collects the distinct sensitive substrings found in the text. The text is scanned from
         * left to right and scanning resumes after each hit, so hits never overlap.
         *
         * @author maojialong
         * @date 2018-01-30 14:47:27
         * @param txt text to scan; {@code null} yields an empty set
         * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
         * @return the matched substrings exactly as they appear in the text
         */
        public static Set<String> getSensitiveWord(String txt, int matchType) {
            Set<String> found = new HashSet<String>();
            if (txt == null) {
                return found;
            }
            int i = 0;
            while (i < txt.length()) {
                int length = checkSensitiveWord(txt, i, matchType);
                if (length > 0) {
                    found.add(txt.substring(i, i + length));
                    i += length;
                } else {
                    i++;
                }
            }
            return found;
        }

        /**
         * Replaces every sensitive substring with {@code replaceChar} repeated once per matched
         * character.
         *
         * <p>Replacement is done by position during a single left-to-right scan, so the result
         * does not depend on set iteration order and neither the matched text nor
         * {@code replaceChar} is interpreted as a regular expression.
         *
         * @author maojialong
         * @date 2018-01-30 14:47:15
         * @param txt text to filter; {@code null} is returned unchanged
         * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
         * @param replaceChar string substituted for each matched character
         * @return the filtered text
         */
        public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
            if (txt == null || txt.isEmpty()) {
                return txt;
            }
            StringBuilder result = new StringBuilder(txt.length());
            int i = 0;
            while (i < txt.length()) {
                int length = checkSensitiveWord(txt, i, matchType);
                if (length > 0) {
                    result.append(getReplaceChars(replaceChar, length));
                    i += length;
                } else {
                    result.append(txt.charAt(i));
                    i++;
                }
            }
            return result.toString();
        }

        /**
         * Builds the replacement for a match of the given length.
         *
         * @author maojialong
         * @date 2018-01-30 14:46:40
         * @param replaceChar string to repeat
         * @param length number of repetitions
         * @return {@code replaceChar} repeated {@code length} times
         */
        private static String getReplaceChars(String replaceChar, int length) {
            StringBuilder repeated = new StringBuilder();
            for (int i = 0; i < length; i++) {
                repeated.append(replaceChar);
            }
            return repeated.toString();
        }

        /**
         * Checks whether a sensitive word starts at {@code beginIndex}.
         *
         * <p>Before the first word end is reached, characters rejected by {@link #pattern} that
         * follow the first matched character are skipped but still counted, so separators placed
         * inside a word do not hide it. Hits shorter than two characters are ignored.
         *
         * @author maojialong
         * @date 2018-01-30 14:45:50
         * @param txt text to scan; {@code null} yields 0
         * @param beginIndex index of the first character to examine
         * @param matchType {@link #minMatchTYpe} stops at the first word end; any other value
         *                  keeps scanning for a longer word
         * @return the number of text characters covered by the hit, or 0 when there is none
         */
        @SuppressWarnings("rawtypes")
        public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
            if (txt == null) {
                return 0;
            }
            int scanned = 0;         // characters consumed so far, skipped separators included
            int matchedLength = 0;   // value of 'scanned' at the most recent word end
            Map node = SensitivewordEngine.sensitiveWordMap;
            for (int i = beginIndex; i < txt.length(); i++) {
                char c = txt.charAt(i);
                if (scanned > 0 && matchedLength == 0 && !isWordChar(c)) {
                    scanned++;
                    continue;
                }
                node = (Map) node.get(c);
                if (node == null) {
                    break;
                }
                scanned++;
                if (END_YES.equals(node.get(END_KEY))) {
                    // Remember the length here; characters scanned past the last word end
                    // must not be reported as part of the hit.
                    matchedLength = scanned;
                    if (SensitivewordEngine.minMatchTYpe == matchType) {
                        break;
                    }
                }
            }
            return matchedLength < 2 ? 0 : matchedLength;
        }

        /**
         * Removes a word from the shared trie. Characters rejected by {@link #pattern} are
         * ignored, mirroring how words are stored. Longer words sharing the removed word as a
         * prefix, and shorter words that are a prefix of it, stay in place. Nothing happens when
         * the word is not stored.
         *
         * @author maojialong
         * @date 2018-02-01 11:40:45
         * @param keyWord word to remove; {@code null} is ignored
         */
        @SuppressWarnings({"rawtypes", "unchecked"})
        public static void removeSensitiveWordToHashMap(String keyWord) {
            if (keyWord == null) {
                return;
            }
            StringBuilder indexed = new StringBuilder(keyWord.length());
            for (int i = 0; i < keyWord.length(); i++) {
                char c = keyWord.charAt(i);
                if (isWordChar(c)) {
                    indexed.append(c);
                }
            }
            int length = indexed.length();
            if (length == 0) {
                return;
            }

            // Walk down the trie, remembering the parent of every node on the path.
            Map[] parents = new Map[length];
            Map node = sensitiveWordMap;
            for (int i = 0; i < length; i++) {
                Map child = (Map) node.get(indexed.charAt(i));
                if (child == null) {
                    return;   // the word is not stored
                }
                parents[i] = node;
                node = child;
            }
            if (!END_YES.equals(node.get(END_KEY))) {
                return;   // the path exists only as a prefix of longer words
            }

            // Clear the flag first: this alone removes the word when longer words share the path.
            node.put(END_KEY, END_NO);

            // Prune bottom-up while a node has no children and ends no other word.
            for (int i = length - 1; i >= 0; i--) {
                Character key = indexed.charAt(i);
                Map child = (Map) parents[i].get(key);
                boolean hasChildren = child.size() > 1;   // every node carries the flag entry
                if (hasChildren || END_YES.equals(child.get(END_KEY))) {
                    break;
                }
                parents[i].remove(key);
            }
        }

        /** Small demonstration of adding, replacing and removing words. */
        public static void main(String[] args) {
            Set<String> sensitiveWord = new HashSet<String>();
            sensitiveWord.add("大娃");
            SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
            String result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们都是葫芦娃", 2,"*");
            System.out.println(result);
            System.out.println(SensitivewordEngine.sensitiveWordMap);

            // Add more words (single or batch).
            sensitiveWord.add("大二娃");
            sensitiveWord.add("大二");
            SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
            result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们现在读大二,我们都是葫芦娃", 2,"*");
            System.out.println(result);
            System.out.println(SensitivewordEngine.sensitiveWordMap);

            // Remove a word.
            SensitivewordEngine.removeSensitiveWordToHashMap("大二娃");
            result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们现在读大二,我们都是葫芦娃", 2,"*");
            System.out.println(result);
            System.out.println(SensitivewordEngine.sensitiveWordMap);
        }
    }

    复制代码即可食用。最后的removeSensitiveWordToHashMap是我一个朋友帮忙写的,其他方法是参考网上的其他博文整理的。

  • 相关阅读:
    数据库删除Push
    MFC Tab Control控件的详细使用
    mysql char和varchar的长度
    PostgreSQL常用脚本
    Linux常用操作命令
    PostgreSQL杀掉会话
    PostgresSQL备份还原
    Grpc客户端添加请求头(Header)
    Polly重试
    名词探疑3:I18N,L10N,A11Y
  • 原文地址:https://www.cnblogs.com/angry-scholar/p/8442191.html
Copyright © 2020-2023  润新知