• C# 屏蔽词过滤


    参考:https://www.cnblogs.com/kubidemanong/p/10834993.html

    /// <summary>
    /// A single node of the character trie. Each node stores its own character,
    /// an end-of-word flag, the index at which the last recorded match ended,
    /// and a map from child character to child node.
    /// </summary>
    public class TreeNode
    {
        /// <summary>Character stored at this node.</summary>
        public char Char;

        /// <summary>True when a registered word ends at this node.</summary>
        public bool IsEnd;

        /// <summary>Index written by the matcher when a match ends at this node.</summary>
        public int WordEndAt;

        // Children keyed by their character.
        private readonly Dictionary<char, TreeNode> _children = new Dictionary<char, TreeNode>();

        /// <summary>Creates a node for <paramref name="c"/> with no children.</summary>
        /// <param name="c">Character this node represents.</param>
        public TreeNode(char c)
        {
            Char = c;
            IsEnd = false;
        }

        /// <summary>Reports whether this node has a child for <paramref name="ch"/>.</summary>
        public bool ContainChar(char ch) => _children.ContainsKey(ch);

        /// <summary>
        /// Looks up the child for <paramref name="c"/>.
        /// </summary>
        /// <returns>The child node, or <c>null</c> when there is none.</returns>
        public TreeNode GetChild(char c)
        {
            TreeNode found;
            return _children.TryGetValue(c, out found) ? found : null;
        }

        /// <summary>
        /// Returns the child for <paramref name="ch"/>, creating and attaching it
        /// first when it does not exist yet.
        /// </summary>
        public TreeNode AddNode(char ch)
        {
            TreeNode existing;
            if (_children.TryGetValue(ch, out existing))
            {
                return existing;
            }

            TreeNode created = new TreeNode(ch);
            _children.Add(ch, created);
            return created;
        }
    }
    /// <summary>
    /// Banned-word filter backed by a character trie.
    /// Register words with <see cref="AddNode"/> (starting from <see cref="Root"/>)
    /// and mask them in text with <see cref="Filter"/>.
    /// </summary>
    /// <remarks>
    /// <see cref="Filter"/> lower-cases its input before matching, so words should be
    /// registered in lower case. Matching state is kept in instance fields and in
    /// <see cref="TreeNode.WordEndAt"/>, so concurrent <see cref="Filter"/> calls on
    /// the same instance would interfere with each other.
    /// </remarks>
    public class Trie
    {
        /// <summary>Root of the trie; pass it to <see cref="AddNode"/> to register a word.</summary>
        public TreeNode Root { get; }

        // Characters that may be stepped over while a match is in progress.
        private readonly HashSet<char> SkipCharSet;

        // Common separators, used to detect word boundaries.
        private readonly HashSet<char> SeparateCharSet;

        // Source list for both sets above.
        // NOTE(review): the published listing had an unescaped quote and a raw line break
        // here; they are restored as \" and \n. The sequence \' only yields an apostrophe,
        // so a literal backslash is not part of the set - confirm whether one was intended.
        private const string SkipCharList = " `-=[]\',.·/~!@#$%^&*()_+{}|:\"<>?*\n";

        // Trie node reached by the match currently in progress.
        private TreeNode checkNode;

        // Last end-of-word node reached by the current match (Root when none yet).
        private TreeNode backupNode;

        // Previously examined character, used to absorb repeated letters.
        private char duplicateChar;

        /// <summary>Creates an empty filter with the default skip/separator characters.</summary>
        public Trie()
        {
            Root = new TreeNode(' ');
            SkipCharSet = new HashSet<char>(SkipCharList);
            SeparateCharSet = new HashSet<char>(SkipCharList);
        }

        /// <summary>
        /// Inserts <paramref name="word"/> below <paramref name="node"/>, one trie level
        /// per character, and marks the node of the last character as a word end.
        /// </summary>
        /// <param name="node">Node to insert under; normally <see cref="Root"/>.</param>
        /// <param name="word">Word to register. Null or empty is ignored.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="node"/> is null while <paramref name="word"/> is non-empty.
        /// </exception>
        public void AddNode(TreeNode node, string word)
        {
            if (string.IsNullOrEmpty(word))
            {
                return;
            }
            if (node == null)
            {
                throw new ArgumentNullException(nameof(node));
            }

            // Walk down iteratively instead of recursing on word.Remove(0, 1),
            // which allocated a new string for every character.
            TreeNode current = node;
            foreach (char ch in word)
            {
                current = current.AddNode(ch);
            }
            current.IsEnd = true;
        }

        // True when c is one of the characters that may be skipped during matching.
        private bool IsSkipChar(char c)
        {
            return SkipCharSet.Contains(c);
        }

        // True when ch is an ASCII letter (a-z or A-Z).
        private bool IsEnglishChar(char ch)
        {
            return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
        }

        // True when ch is a separator character.
        private bool Separator(char ch)
        {
            return SeparateCharSet.Contains(ch);
        }

        // True when position pos starts a word: either the start of the text, a
        // non-separator following a separator, or an ASCII letter following a non-letter.
        private bool IsWordBegin(string word, int pos)
        {
            if (pos == 0)
            {
                return true;
            }
            if (pos < word.Length)
            {
                char previous = word[pos - 1];
                char current = word[pos];
                return (Separator(previous) && !Separator(current))
                    || (!IsEnglishChar(previous) && IsEnglishChar(current));
            }
            return false;
        }

        // True when position pos ends a word: either the end of the text, a
        // non-separator followed by a separator, or an ASCII letter followed by a non-letter.
        private bool IsWordEnd(string word, int pos)
        {
            if (pos == word.Length - 1)
            {
                return true;
            }
            if (pos < word.Length)
            {
                char current = word[pos];
                char next = word[pos + 1];
                return (!Separator(current) && Separator(next))
                    || (IsEnglishChar(current) && !IsEnglishChar(next));
            }
            return false;
        }

        // Continues the match that started at index begin (whose first character has
        // already been consumed into checkNode). Every time an end-of-word node is
        // reached it is remembered in backupNode together with the index where it ended,
        // and matching keeps going so that longer words win.
        private void CheckWord(string checkWord, int begin)
        {
            for (int index = begin + 1; index < checkWord.Length; ++index)
            {
                char ch = checkWord[index];
                TreeNode next = checkNode.GetChild(ch);

                if (IsSkipChar(ch))
                {
                    // A skip character advances the trie only when it is part of a
                    // registered word; otherwise it is simply stepped over.
                    if (next != null)
                    {
                        checkNode = next;
                    }
                }
                else if (next != null)
                {
                    checkNode = next;
                    if (checkNode.IsEnd)
                    {
                        // Record this match and keep scanning for a longer one.
                        checkNode.WordEndAt = index;
                        backupNode = checkNode;
                    }
                }
                else if (duplicateChar == ch)
                {
                    // Same character repeated: stretch the recorded end so that
                    // spellings such as "fuccccccck" are covered as well.
                    backupNode.WordEndAt = index;
                }
                else
                {
                    break;
                }

                duplicateChar = ch;
            }
        }

        /// <summary>
        /// Returns a copy of <paramref name="filterWord"/> in which every matched banned
        /// word is replaced by '*' characters. Matching is done on a lower-cased copy of
        /// the input; unmatched characters keep their original casing.
        /// </summary>
        /// <param name="filterWord">Text to filter.</param>
        /// <returns>
        /// The masked text; a null or empty input is returned unchanged.
        /// </returns>
        public string Filter(string filterWord)
        {
            if (string.IsNullOrEmpty(filterWord))
            {
                return filterWord;
            }

            // Invariant lower-casing keeps matching independent of the current culture.
            string word = filterWord.ToLowerInvariant();
            StringBuilder result = new StringBuilder(filterWord);

            int begin = 0;
            while (begin < word.Length)
            {
                checkNode = Root;
                backupNode = Root;
                char ch = word[begin];
                duplicateChar = ch;

                // Strict matching treats every character position as a possible start of
                // a banned word; otherwise, as for English text, only whole words are
                // checked. Under strict matching, if "av" is banned then "avoid" would
                // become "**oid".
                bool isStrict = !IsEnglishChar(ch);
                bool isWordBegin = isStrict || IsWordBegin(word, begin);

                if (isWordBegin && !IsSkipChar(ch) && checkNode.ContainChar(ch))
                {
                    checkNode = checkNode.GetChild(ch);
                    if (checkNode.IsEnd)
                    {
                        // A one-character word already matches on its first character.
                        checkNode.WordEndAt = begin;
                        backupNode = checkNode;
                    }

                    CheckWord(word, begin);

                    // backupNode only leaves Root when an end-of-word node was reached
                    // during this pass, so its WordEndAt is valid here.
                    if (!ReferenceEquals(backupNode, Root) && backupNode.IsEnd)
                    {
                        // For whole-word matching the match must also stop at a word end:
                        // "have" contains "av" but not at a word end, so it is not masked.
                        bool isWordEnd = isStrict || IsWordEnd(word, backupNode.WordEndAt);
                        if (isWordEnd)
                        {
                            for (int i = begin; i <= backupNode.WordEndAt; ++i)
                            {
                                result[i] = '*';
                            }
                            begin = backupNode.WordEndAt;
                        }
                    }
                }

                ++begin;
            }

            return result.ToString();
        }
    }

    测试用例:

     /// <summary>Console demo: registers a few banned words and prints filtered samples.</summary>
     class Program
     {
         static void Main(string[] args)
         {
             Trie trie = new Trie();

             // Words to ban, inserted in this order.
             string[] bannedWords = { "fuc", "fuc bitch", "fuck", "bitch", "屠杀" };
             foreach (string banned in bannedWords)
             {
                 trie.AddNode(trie.Root, banned);
             }

             // Sample inputs, printed after filtering.
             string[] samples =
             {
                 "是哦fuckkkkkk山大的撒bi	ch",
                 "have world fuc bitch",
                 "1218fuck1",
             };
             foreach (string sample in samples)
             {
                 Console.WriteLine(trie.Filter(sample));
             }

             // Keep the console window open until a key is pressed.
             Console.ReadKey();
         }
     }

    结果:

  • 相关阅读:
    linux内核启动汇编部分详解
    linux内核zImage详解
    Linux内核zImage怎么来的?
    Linux内核编译make做了什么?
    关于makefile的几点经验
    note
    tmp0000
    tmp
    SSL学习与总结
    C++学习笔记
  • 原文地址:https://www.cnblogs.com/darkif/p/13159604.html
Copyright © 2020-2023  润新知