• JAVA敏感词过滤


    JAVA敏感词过滤

    一、初始化敏感词库

      1 import java.io.BufferedReader;
      2 import java.io.File;
      3 import java.io.FileInputStream;
      4 import java.io.InputStreamReader;
      5 import java.util.HashMap;
      6 import java.util.HashSet;
      7 import java.util.Iterator;
      8 import java.util.Map;
      9 import java.util.Set;
     10 
     11 /**
     12  * 初始化敏感词库,将敏感词加入到HashMap中,构建DFA算法模型
     13  */
     14 public class SensitiveWordInit {
     15     private String ENCODING = "utf-8";    //字符编码
     16     public HashMap sensitiveWordMap;
     17     public SensitiveWordInit(){
     18         super();
     19     }
     20 
     21     /**
     22      * 初始化
     23      */
     24     public Map initKeyWord(){
     25         try {
     26             //读取敏感词库
     27             Set<String> keyWordSet = readSensitiveWordFile();
     28             //将敏感词库加入到HashMap中
     29             addSensitiveWordToHashMap(keyWordSet);
     30             //spring获取application,然后application.setAttribute("sensitiveWordMap",sensitiveWordMap);
     31         } catch (Exception e) {
     32             e.printStackTrace();
     33         }
     34         return sensitiveWordMap;
     35     }
     36 
     37     /**
     38      * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:<br>
     39      * 中 = {
     40      *      isEnd = 0
     41      *      国 = {<br>
     42      *           isEnd = 1
     43      *           人 = {isEnd = 0
     44      *                民 = {isEnd = 1}
     45      *                }
     46      *           男  = {
     47      *                  isEnd = 0
     48      *                   人 = {
     49      *                        isEnd = 1
     50      *                       }
     51      *               }
     52      *           }
     53      *      }
     54      *  五 = {
     55      *      isEnd = 0
     56      *      星 = {
     57      *          isEnd = 0
     58      *          红 = {
     59      *              isEnd = 0
     60      *              旗 = {
     61      *                   isEnd = 1
     62      *                  }
     63      *              }
     64      *          }
     65      *      }
     66      */
     67     private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
     68         sensitiveWordMap = new HashMap(keyWordSet.size());     //初始化敏感词容器,减少扩容操作
     69         String key = null;
     70         Map nowMap = null;
     71         Map<String, String> newWorMap = null;
     72         //迭代keyWordSet
     73         Iterator<String> iterator = keyWordSet.iterator();
     74         while(iterator.hasNext()){
     75             key = iterator.next();    //关键字
     76             nowMap = sensitiveWordMap;
     77             for(int i = 0 ; i < key.length() ; i++){
     78                 char keyChar = key.charAt(i);       //转换成char型
     79                 Object wordMap = nowMap.get(keyChar);       //获取
     80 
     81                 if(wordMap != null){        //如果存在该key,直接赋值
     82                     nowMap = (Map) wordMap;
     83                 }
     84                 else{     //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个
     85                     newWorMap = new HashMap<String,String>();
     86                     newWorMap.put("isEnd", "0");     //不是最后一个
     87                     nowMap.put(keyChar, newWorMap);
     88                     nowMap = newWorMap;
     89                 }
     90 
     91                 if(i == key.length() - 1){
     92                     nowMap.put("isEnd", "1");    //最后一个
     93                 }
     94             }
     95         }
     96     }
     97 
     98     /**
     99      * 读取敏感词库中的内容,将内容添加到set集合中
    100      */
    101     @SuppressWarnings("resource")
    102     private Set<String> readSensitiveWordFile() throws Exception{
    103         Set<String> set = null;
    104         //https://github.com/heqiyoujing/config_file 词库地址
    105         File file = new File("D:\SensitiveWord.txt");    //读取文件
    106         InputStreamReader read = new InputStreamReader(new FileInputStream(file),ENCODING);
    107         try {
    108             if(file.isFile() && file.exists()){      //文件流是否存在
    109                 set = new HashSet<String>();
    110                 BufferedReader bufferedReader = new BufferedReader(read);
    111                 String txt = null;
    112                 while((txt = bufferedReader.readLine()) != null){    //读取文件,将文件内容放入到set中
    113                     set.add(txt);
    114                 }
    115             }
    116             else{         //不存在抛出异常信息
    117                 throw new Exception("敏感词库文件不存在");
    118             }
    119         } catch (Exception e) {
    120             throw e;
    121         }finally{
    122             read.close();     //关闭文件流
    123         }
    124         return set;
    125     }
    126 }
    View Code

    二、检查敏感词并替换

      1 import java.util.HashSet;
      2 import java.util.Iterator;
      3 import java.util.Map;
      4 import java.util.Set;
      5 
      6 /**
      7  * 敏感词过滤
      8  */
      9 public class SensitivewordFilter {
     10     private Map sensitiveWordMap = null;
     11     public static int minMatchTYpe = 1;      //最小匹配规则
     12     public static int maxMatchType = 2;      //最大匹配规则
     13     private static String replaceString = null;
     14     /**例如:敏感词中含有中国人、中国
     15      * 最小匹配规则minMatchTYpe为1时,会匹配出**人,为2时,会匹配出***
     16      * */
     17     public static void main(String[] args) throws Exception{
     18         SensitivewordFilter filter = new SensitivewordFilter();
     19         System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size());
     20         String string = "dfa是面向三级装配的设计(Design for assembly)的英文简称,是指在产品设计阶段设计产品使得产品具有良好" +
     21                 "的可装配性,确保装配工序简单、装配效率高、装配质量高、装配不良率低和装配成本低。面向装配的设计通过一系" +
     22                 "列有利于装配的设计指南例如简化产品设计、减少零件数量等,女女并同装配工程师一起合作,被逼简化产品结构,近親使其便于" +
     23                 "装配,为提高产品质量、缩短产品开发周期和降低产品成本奠定基础";
     24         // ------获取敏感词---------
     25         Set<String> set = filter.getSensitiveWord(string, 1);
     26         System.out.println("含敏感词的个数为:" + set.size() + "。包含:" + set);
     27         // ------------------------替换敏感字begin----------------------
     28         Iterator<String> iterator = set.iterator();
     29         String word = null;
     30         while (iterator.hasNext()) {
     31             word = iterator.next();
     32             /**
     33              * 得到word中敏感关键词被替换后的字符串,例如:***
     34              * */
     35             getReplaceCharsS("*", word.length());
     36             /**
     37              * 将原字符串中的敏感关键词替换成带有replaceChar
     38              * 或全部为replaceChar的关键词
     39              * */
     40             string = string.replaceAll(word, replaceString);
     41         }
     42         // ------------------------替换敏感字end----------------------
     43         System.out.println(string);
     44     }
     45 
     46     /**
     47      * 构造函数,初始化敏感词库
     48      */
     49     public SensitivewordFilter(){
     50         sensitiveWordMap = new SensitiveWordInit().initKeyWord();
     51     }
     52 
     53     /**
     54      * 判断文字是否包含敏感字符
     55      * @param matchType  匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
     56      */
     57     public boolean isContaintSensitiveWord(String txt,int matchType){
     58         boolean flag = false;
     59         for(int i = 0 ; i < txt.length() ; i++){
     60             int matchFlag = this.CheckSensitiveWord(txt, i, matchType); //判断是否包含敏感字符
     61             if(matchFlag > 0){    //大于0存在,返回true
     62                 flag = true;
     63             }
     64         }
     65         return flag;
     66     }
     67 
     68     /**
     69      * 获取文字中的敏感词
     70      * @param matchType 匹配规则&nbsp;1:最小匹配规则,2:最大匹配规则
     71      */
     72     public Set<String> getSensitiveWord(String txt , int matchType){
     73         Set<String> sensitiveWordList = new HashSet<String>();
     74 
     75         for(int i = 0 ; i < txt.length() ; i++){
     76             int length = CheckSensitiveWord(txt, i, matchType);    //判断是否包含敏感字符
     77             if(length > 0){    //存在,加入list中
     78                 sensitiveWordList.add(txt.substring(i, i+length));
     79                 i = i + length - 1;    //减1的原因,是因为for会自增
     80             }
     81         }
     82 
     83         return sensitiveWordList;
     84     }
     85 
     86     /**
     87      * 替换敏感字字符,默认*
     88      */
     89     public String replaceSensitiveWord(String txt,int matchType,String replaceChar){
     90         String resultTxt = txt;
     91         Set<String> set = getSensitiveWord(txt, matchType);     //获取所有的敏感词
     92         Iterator<String> iterator = set.iterator();
     93         String word = null;
     94         String replaceString = null;
     95         while (iterator.hasNext()) {
     96             word = iterator.next();
     97             replaceString = getReplaceChars(replaceChar, word.length());
     98             resultTxt = resultTxt.replaceAll(word, replaceString);
     99         }
    100 
    101         return resultTxt;
    102     }
    103 
    104     /**
    105      * 获取替换字符串
    106      */
    107     private String getReplaceChars(String replaceChar,int length){
    108         String resultReplace = replaceChar;
    109         for(int i = 1 ; i < length ; i++){
    110             resultReplace += replaceChar;
    111         }
    112 
    113         return resultReplace;
    114     }
    115 
    116     /**
    117      * 获取替换字符串,无返回值
    118      */
    119     private static void getReplaceCharsS(String replaceChar,int length){
    120         replaceString = "";
    121         String resultReplace = replaceChar;
    122         for(int i = 1 ; i < length ; i++){
    123             resultReplace += replaceChar;
    124         }
    125         replaceString = resultReplace;
    126     }
    127 
    128     /**
    129      * 检查文字中是否包含敏感字符,检查规则如下:<br>
    130      */
    131     @SuppressWarnings({ "rawtypes"})
    132     public int CheckSensitiveWord(String txt,int beginIndex,int matchType){
    133         boolean  flag = false;    //敏感词结束标识位:用于敏感词只有1位的情况
    134         int matchFlag = 0;     //匹配标识数默认为0
    135         char word = 0;
    136         Map nowMap = sensitiveWordMap;
    137         for(int i = beginIndex; i < txt.length() ; i++){
    138             word = txt.charAt(i);
    139             nowMap = (Map) nowMap.get(word);     //获取指定key
    140             if(nowMap != null){     //存在,则判断是否为最后一个
    141                 matchFlag++;     //找到相应key,匹配标识+1
    142                 if("1".equals(nowMap.get("isEnd"))){       //如果为最后一个匹配规则,结束循环,返回匹配标识数
    143                     flag = true;       //结束标志位为true
    144                     if(SensitivewordFilter.minMatchTYpe == matchType){    //最小规则,直接返回,最大规则还需继续查找
    145                         break;
    146                     }
    147                 }
    148             }
    149             else{     //不存在,直接返回
    150                 break;
    151             }
    152         }
    153         if(matchFlag < 2 || !flag){        //长度必须大于等于1,为词
    154             matchFlag = 0;
    155         }
    156         return matchFlag;
    157     }
    158 
    159 }
    View Code

    三、运行结果

  • 相关阅读:
    appium之模拟坐标方法介绍
    mysql操作数据库常用命令
    appium使用无线连接手机方法
    mysql数据之增删改操作
    mysql之子查询与分组查询
    selenium之多个窗口之间切换
    selenium之内嵌网页iframe切换
    CF103E
    CF724E
    光伏元件
  • 原文地址:https://www.cnblogs.com/heqiyoujing/p/9259777.html
Copyright © 2020-2023  润新知