• 敏感词汇过滤DFA算法


    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace SensitiveWordFilter
    {
        public class SensitiveWord
        {
            // Reserved character used as the dictionary key under which each trie node created for a
            // word character stores its end-of-word flag ("1" = a word ends at this node, "0" = it does not).
            // Because it doubles as that key, it cannot serve as an ordinary word character.
            private static readonly char IsEndChar = '$';
    
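            /**
             * Initializes the sensitive-word library:
             * reads the word list, inserts every word into nested dictionaries,
             * and thereby builds the DFA (trie) model used for matching.
             * 
             * @author dxm
             * 
             */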
            public class SensitiveWordInit
            {
    
                // Name of the character encoding used when reading the word-list file.
                private static readonly  String ENCODING = "UTF-8";
    
                /**
                 * <summary>
                 * Builds the sensitive-word lookup structure: reads the word list from disk
                 * and converts it into the nested-dictionary trie.
                 * </summary>
                 * <returns>The root dictionary of the trie, keyed by the first character of each word.</returns>
                 */
                public Dictionary<char, object> initKeyWord()
                {
                    // Load the words, then hand them straight to the trie builder.
                    return addSensitiveWordToHashMap(readSensitiveWordFile());
                }
    
                /**
                 * <summary>
                 * Inserts every word of the set into a trie made of nested dictionaries (the DFA model).
                 * Each node created for a character is a dictionary that maps the next character to its
                 * child node and additionally maps IsEndChar to the string "1" when a word ends at that
                 * node, or "0" when it does not. The root dictionary carries no end flag.
                 * </summary>
                 * <remarks>
                 * Example for the words "to", "tone" and "tea" (with '$' standing for IsEndChar):
                 * t = {
                 *       $ = "0"
                 *       o = {
                 *             $ = "1"
                 *             n = {
                 *                   $ = "0"
                 *                   e = {
                 *                         $ = "1"
                 *                   }
                 *             }
                 *       }
                 *       e = {
                 *             $ = "0"
                 *             a = {
                 *                   $ = "1"
                 *             }
                 *       }
                 * }
                 * Occurrences of IsEndChar inside a word are dropped, because that character is
                 * reserved as the flag key. Words that are null, empty, or consist only of
                 * IsEndChar contribute nothing.
                 * </remarks>
                 * <param name="wordSet">The sensitive words to insert.</param>
                 * <returns>The root dictionary of the trie.</returns>
                 */
                private Dictionary<char, object> addSensitiveWordToHashMap(HashSet<String> wordSet)
                {
                    // Pre-size the root container to reduce rehashing while it is filled.
                    Dictionary<char, object> wordMap = new Dictionary<char, object>(wordSet.Count);

                    foreach (String word in wordSet)
                    {
                        if (String.IsNullOrEmpty(word))
                        {
                            continue;
                        }

                        Dictionary<char, object> nowMap = wordMap;
                        foreach (char keyChar in word)
                        {
                            // The reserved marker cannot be a word character; drop it.
                            if (keyChar == IsEndChar)
                            {
                                continue;
                            }

                            Object existing;
                            if (nowMap.TryGetValue(keyChar, out existing))
                            {
                                // Shared prefix: descend into the node that is already there.
                                nowMap = (Dictionary<char, object>)existing;
                            }
                            else
                            {
                                // New branch: create the node flagged "not a word end" for now.
                                Dictionary<char, object> newMap = new Dictionary<char, object>();
                                newMap.Add(IsEndChar, "0");

                                nowMap.Add(keyChar, newMap);
                                nowMap = newMap;
                            }
                        }

                        // Flag the node reached by the last real character. Doing this after the loop
                        // (instead of at index word.Length - 1) keeps the flag from being lost when the
                        // word's final character is the skipped marker. The root is never flagged.
                        if (!Object.ReferenceEquals(nowMap, wordMap))
                        {
                            nowMap[IsEndChar] = "1";
                        }
                    }

                    return wordMap;
                }
    
                /**
                 * <summary>
                 * Reads the word list from the file "dic.txt" (resolved against the current working
                 * directory), one word per line, and collects the words in a HashSet.
                 * Surrounding whitespace is trimmed from each line and blank lines are skipped, so
                 * padding in the file cannot produce words that never match.
                 * </summary>
                 * <returns>The distinct, non-empty words found in the file.</returns>
                 */
                private HashSet<String> readSensitiveWordFile()
                {
                    HashSet<String> wordSet = new HashSet<string>();

                    // Stream the file line by line instead of loading it and re-parsing the text.
                    foreach (string line in File.ReadLines("dic.txt", Encoding.GetEncoding(ENCODING)))
                    {
                        string word = line.Trim();
                        if (word.Length > 0)
                        {
                            wordSet.Add(word);
                        }
                    }

                    return wordSet;
                }
            }
    
            public class SensitivewordFilter
            {
    
                // Root dictionary of the sensitive-word trie; assigned by the constructor.
                private Dictionary<char, object> sensitiveWordMap = null;

                // Match-type value that selects the "minimum match" rule.
                public static int minMatchTYpe = 1;

                // Match-type value that selects the "maximum match" rule.
                public static int maxMatchType = 2;

                // Singleton instance, created on first request by getInstance().
                private static SensitivewordFilter inst = null;
    
                /**
                 * <summary>
                 * Private constructor: builds the sensitive-word trie once and keeps its root.
                 * </summary>
                 */
                private SensitivewordFilter()
                {
                    SensitiveWordInit loader = new SensitiveWordInit();
                    this.sensitiveWordMap = loader.initKeyWord();
                }
    
                /**
                 * <summary>
                 * Returns the shared filter instance, creating it on the first call.
                 * </summary>
                 * <remarks>NOTE(review): creation is not synchronized; confirm callers do not race on the first call.</remarks>
                 * <returns>The singleton filter.</returns>
                 */
                public static SensitivewordFilter getInstance()
                {
                    if (inst != null)
                    {
                        return inst;
                    }

                    inst = new SensitivewordFilter();
                    return inst;
                }
    
                /**
                 * <summary>
                 * Tells whether the text contains at least one sensitive word.
                 * Every position of the text is tried as a possible word start; scanning stops at
                 * the first position where a word is found.
                 * </summary>
                 * <param name="txt">The text to inspect.</param>
                 * <param name="matchType">Matching rule, minMatchTYpe (default) or maxMatchType.</param>
                 * <returns>true when a sensitive word is found, otherwise false.</returns>
                 */
                public bool isContaintSensitiveWord(String txt, int matchType = 1)
                {
                    for (int i = 0; i < txt.Length; i++)
                    {
                        // A positive length means a word starts at i; no need to look any further.
                        if (this.CheckSensitiveWord(txt, i, matchType) > 0)
                        {
                            return true;
                        }
                    }

                    return false;
                }
    
                /**
                 * <summary>
                 * Collects the distinct sensitive words that occur in the text.
                 * The text is scanned left to right; after a word is found, scanning resumes
                 * directly behind it.
                 * </summary>
                 * <param name="txt">The text to inspect.</param>
                 * <param name="matchType">Matching rule, minMatchTYpe (default) or maxMatchType.</param>
                 * <returns>The set of matched substrings; empty when nothing matches.</returns>
                 */
                public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
                {
                    HashSet<String> found = new HashSet<String>();

                    int position = 0;
                    while (position < txt.Length)
                    {
                        int hitLength = CheckSensitiveWord(txt, position, matchType);
                        if (hitLength > 0)
                        {
                            // Record the word and jump past it.
                            found.Add(txt.Substring(position, hitLength));
                            position += hitLength;
                        }
                        else
                        {
                            position++;
                        }
                    }

                    return found;
                }
    
                /**
                 * <summary>
                 * Returns a copy of the text in which every sensitive word is masked.
                 * Each matched word is replaced by replaceChar repeated once per matched character;
                 * all other characters are copied unchanged.
                 * </summary>
                 * <remarks>
                 * The result is assembled in a separate buffer rather than edited in place, so the
                 * positions used for matching always refer to the original text. This keeps the
                 * method correct when replaceChar is empty or longer than one character.
                 * </remarks>
                 * <param name="txt">The text to mask.</param>
                 * <param name="replaceChar">The string substituted for each character of a matched word.</param>
                 * <param name="matchType">Matching rule, minMatchTYpe (default) or maxMatchType.</param>
                 * <returns>The masked text.</returns>
                 */
                public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
                {
                    StringBuilder result = new StringBuilder(txt.Length);

                    int i = 0;
                    while (i < txt.Length)
                    {
                        int length = CheckSensitiveWord(txt, i, matchType);
                        if (length > 0)
                        {
                            // Emit the mask for the whole word and continue right behind it.
                            result.Append(getReplaceChars(replaceChar, length));
                            i += length;
                        }
                        else
                        {
                            result.Append(txt[i]);
                            i++;
                        }
                    }

                    return result.ToString();
                }
    
                /**
                 * <summary>
                 * Builds the mask string: replaceChar concatenated length times.
                 * </summary>
                 * <param name="replaceChar">The string to repeat.</param>
                 * <param name="length">How many times to repeat it; values of zero or less yield an empty string.</param>
                 * <returns>The repeated string.</returns>
                 */
                private String getReplaceChars(String replaceChar, int length)
                {
                    StringBuilder mask = new StringBuilder();

                    int remaining = length;
                    while (remaining > 0)
                    {
                        mask.Append(replaceChar);
                        remaining--;
                    }

                    return mask.ToString();
                }
    
                /**
                 * <summary>
                 * Checks whether a sensitive word starts at beginIndex and returns its length in
                 * characters, or 0 when no word starts there.
                 * </summary>
                 * <remarks>
                 * The trie is walked forward from beginIndex in a single pass while the length of
                 * the most recent complete word is remembered, so the walk always terminates and a
                 * shorter word is still reported when a longer candidate fails or the text ends.
                 * <para>
                 * With minMatchTYpe the walk stops at a complete word that has no longer continuation
                 * in the dictionary, or at the second complete word met on the path; a first complete
                 * word that can still be extended is therefore only reported when the extension fails.
                 * With any other matchType the longest complete word is reported.
                 * </para>
                 * <para>
                 * IsEndChar in the text never matches: it is the key of the end-of-word flag rather
                 * than of a child node, so the walk stops there. Matches shorter than two characters
                 * are not reported.
                 * </para>
                 * </remarks>
                 * <param name="txt">The text to inspect.</param>
                 * <param name="beginIndex">Index in txt at which the candidate word must start.</param>
                 * <param name="matchType">Matching rule, minMatchTYpe or maxMatchType.</param>
                 * <returns>The length of the matched word, or 0 when there is none.</returns>
                 */
                public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
                {
                    Dictionary<char, object> node = sensitiveWordMap;

                    // Characters consumed so far, and that count as of the last complete word.
                    int depth = 0;
                    int lastWordLength = 0;

                    for (int i = beginIndex; i < txt.Length; i++)
                    {
                        char current = txt[i];

                        // The marker key maps to a flag string, not to a child node: treat it as a mismatch.
                        if (current == IsEndChar)
                        {
                            break;
                        }

                        object child;
                        if (!node.TryGetValue(current, out child))
                        {
                            // No continuation: fall back to the last complete word, if any.
                            break;
                        }

                        node = (Dictionary<char, object>)child;
                        depth++;

                        object endFlag;
                        if (node.TryGetValue(IsEndChar, out endFlag) && "1".Equals(endFlag as string))
                        {
                            bool hadShorterWord = lastWordLength > 0;
                            lastWordLength = depth;

                            // node.Count == 1 means the node holds only its flag, i.e. no longer word exists.
                            if (SensitivewordFilter.minMatchTYpe == matchType && (hadShorterWord || node.Count == 1))
                            {
                                break;
                            }
                        }
                    }

                    // Only matches of at least two characters count as a word.
                    return lastWordLength < 2 ? 0 : lastWordLength;
                }
            }
        }
    }
    
    
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace SensitiveWordFilter
    {
        /**
         * <summary>
         * Console demo: masks the sensitive words of a sample sentence and prints the text
         * before and after masking.
         * </summary>
         */
        class Program
        {
            static void Main(string[] args)
            {
                const string sample = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";

                var wordFilter = SensitiveWord.SensitivewordFilter.getInstance();
                string masked = wordFilter.replaceSensitiveWord(sample, "*");

                Console.WriteLine("替换前的文字为:" + sample);
                Console.WriteLine("替换后的文字为:" + masked);

                // Keep the console window open until a key is pressed.
                Console.ReadKey();
            }
        }
    }
  • 相关阅读:
    为 HTTP/2 头压缩专门设计的 HPACK
    HTTP2 帧基础知识以及Header、CONTINUATION、DATA帧相关资料:
    扩充巴科斯-瑙尔范式 ABNF简介
    我Win下常用工具清单
    gRPC版本的 Google APIs
    gRPC 的route_guide例子
    proto3 笔记1
    编译gRPC Go版本使用的 ProtoBuffer 文件
    新浪校园招聘2013.10.30浙大玉泉4教301笔试的前端妹纸,像雾像雨又像风
    Android下Notification,样式style,主题theme的功能实现
  • 原文地址:https://www.cnblogs.com/caozhiyuan/p/9425091.html
Copyright © 2020-2023  润新知