• AC自动机


    要学会AC自动机,我们必须知道什么是Trie,也就是字典树。最好对KMP算法也有些了解。Trie树和KMP算法我之前博客都有写过,感兴趣的可以看看。


    简单叙述下问题,现在给出
    "hsay";
    "ah";
    "sahe";
    "he";
    "say";
    "herhb";
    "aher";
    "erhs"

    共8个关键词,要问字符串"yasaherhsay"中这8个关键词有几个出现过。

    答案是7:除了"herhb"之外,其余7个关键词都在该字符串中出现过。

    这就是一个多模式匹配问题。


    AC自动机算法分为3步:构造一棵Trie树,构造失败指针和模式匹配过程。

    失败指针和KMP算法中的next函数或称shift函数的功能类似。


                    

    上图(原文配图,此处未能显示)解释了失败指针的作用:当沿着Trie树匹配到某个结点、下一个字符无法继续向下匹配时,不必回到根结点从头再来,而是跳到失败指针所指向的结点继续匹配;该结点对应的字符串是当前已匹配字符串的一个后缀。

    // AC_automachine.cpp : 定义控制台应用程序的入口点。
    //
    
    #include "stdafx.h"
    #include<algorithm>
    #include<cstdlib>
    #include<cstring>
    #include<iostream>
    #include<set>
    #include<string>
    #include<vector>
    
    using namespace std;
    
    #define MAXSIZE 26  // number of child slots per node: one per letter 'a'..'z'
    
    
    
    // One node of the keyword trie used by the multi-pattern matcher.
    //
    // The default constructor puts every member into a well-defined "empty"
    // state, so a node created with `new TrieNode` never carries indeterminate
    // pointers, counters or flags, even before the caller fills it in.
    struct TrieNode
    {
    	TrieNode* next[MAXSIZE];      // child for each letter, indexed by (letter - 'a'); NULL when absent
    	TrieNode*parent;              // node one level closer to the root; NULL for the root itself
    	std::vector<TrieNode*>fail;   // failure targets: nodes whose root path spells a proper suffix of this node's path
    	char p;                       // letter on the edge leading from `parent` to this node
    	int Num;                      // how many inserted words have a path running through this node
    	bool isword;                  // true when an inserted word ends exactly at this node
    
    	TrieNode() : parent(NULL), p('\0'), Num(0), isword(false)
    	{
    		for (int i = 0; i < MAXSIZE; i++)
    			next[i] = NULL;
    	}
    };
    
    // Global collection of the keywords matched so far. AC_automachine inserts
    // every keyword it finds here and checks it first, so each distinct keyword
    // is counted only once.
    set<string>re;// stores the results
    
    // Creates the root node of an empty trie.
    //
    // Returns a node allocated with `new` that has no children, no parent, a
    // word counter of 0 and a cleared word flag. Releasing the node is left to
    // the caller.
    TrieNode*initiate_Trie()
    {
    	TrieNode*root = new TrieNode;
    	for (int i = 0; i < MAXSIZE; i++)
    		root->next[i] = NULL;
    	root->parent = NULL;
    	// The root has no incoming edge; give `p` a defined value anyway so the
    	// node never holds an indeterminate member.
    	root->p = '\0';
    	root->Num = 0;
    	root->isword = false;
    	return root;
    }
    
    bool search(TrieNode*root, char*str)
    {
    	TrieNode*tn;
    	tn = root;
    	int k;
    	while (*str != '')
    	{
    		k = *str - 'a';
    		if (tn->next[k] == NULL)
    			return false;
    		tn = tn->next[k];
    		str++;
    	}
    	if (tn->isword == false)
    		return false;
    	return true;
    }
    
    // Inserts one keyword into the trie.
    //
    //   root - root node of the trie
    //   str  - NUL-terminated keyword made of the letters 'a'..'z' (only read)
    //
    // Returns `root` in every case. The trie is left untouched when the word is
    // already present, or when it contains a character outside 'a'..'z' (such a
    // character has no child slot). Otherwise the `Num` counter of the root and
    // of every node on the word's path is increased by one, missing nodes are
    // created, and the final node is flagged as a word end.
    TrieNode*build_Trie_singleword(TrieNode*root, const char*str)
    {
    	// Validate the whole word first so that a bad character cannot leave a
    	// half-inserted path behind or index outside a `next` array.
    	for (const char*c = str; *c != '\0'; c++)
    	{
    		int slot = *c - 'a';
    		if (slot < 0 || slot >= MAXSIZE)
    			return root;
    	}
    
    	// Skip duplicates so the per-node counters are bumped once per distinct
    	// word. search() only reads the string; the cast keeps this call valid
    	// whether search() is declared with `char*` or `const char*`.
    	if (search(root, const_cast<char*>(str)))
    		return root;
    
    	root->Num = root->Num + 1;
    	TrieNode*tn = root;
    	for (; *str != '\0'; str++)
    	{
    		int k = *str - 'a';
    		TrieNode*child = tn->next[k];
    		if (child == NULL)
    		{
    			// First word to use this edge: create the node and initialise
    			// every member explicitly.
    			child = new TrieNode;
    			for (int i = 0; i < MAXSIZE; i++)
    				child->next[i] = NULL;
    			child->p = *str;
    			child->Num = 1;
    			child->parent = tn;
    			child->isword = false;
    			tn->next[k] = child;
    		}
    		else
    		{
    			child->Num = child->Num + 1;
    		}
    		tn = child;
    	}
    	tn->isword = true;
    	return root;
    }
    
    // Fills the `fail` list of `node` and, recursively, of every node below it.
    //
    //   root - root node of the trie
    //   node - node to process; pass `root` to process the whole trie
    //
    // For a non-root node the list receives every trie node (other than the
    // node itself) whose path from the root spells a suffix of this node's
    // path, shortest suffix first; entries already present are not added again.
    // As a side effect the letter of each visited non-root node is written to
    // standard output in pre-order.
    void initiate_fail_pointer(TrieNode*root, TrieNode*node)
    {
    	if (node != root)
    	{
    		std::cout << node->p;
    
    		// Suffix of length one: the root's child for this node's own letter.
    		TrieNode*single = root->next[node->p - 'a'];
    		if (single != NULL && single != node
    			&& std::find(node->fail.begin(), node->fail.end(), single) == node->fail.end())
    		{
    			node->fail.push_back(single);
    		}
    
    		// `suffix` holds the letters from this node upwards, so suffix[0] is
    		// the last letter of the path; each round prepends one more letter.
    		std::vector<char> suffix(1, node->p);
    		for (TrieNode*cur = node; cur->parent != root; cur = cur->parent)
    		{
    			suffix.push_back(cur->parent->p);
    
    			// Try to spell the current suffix starting from the root.
    			TrieNode*walk = root;
    			int idx = static_cast<int>(suffix.size()) - 1;
    			while (idx >= 0 && walk->next[suffix[idx] - 'a'] != NULL)
    			{
    				walk = walk->next[suffix[idx] - 'a'];
    				idx--;
    			}
    
    			// idx < 0 means the whole suffix exists as a path in the trie.
    			const bool spelledCompletely = (idx < 0);
    			if (spelledCompletely && walk != node
    				&& std::find(node->fail.begin(), node->fail.end(), walk) == node->fail.end())
    			{
    				node->fail.push_back(walk);
    			}
    		}
    	}
    
    	// Descend into the children in alphabetical order.
    	for (int letter = 0; letter < MAXSIZE; letter++)
    	{
    		TrieNode*child = node->next[letter];
    		if (child != NULL)
    			initiate_fail_pointer(root, child);
    	}
    }
    
    
    int AC_automachine(TrieNode*root, char*str)
    {
    	int count = 0;
    	int len = strlen(str);
    	int k = 0;
    	
    
    	while (k < len)
    	{
    		while (root->next[str[k] - 'a'] == NULL)
    		{
    			k++;
    		}
    
    		TrieNode*p,*node = root->next[str[k] - 'a'];
    		p = NULL;
    		while (node != NULL)
    		{
    			if (node->isword == true)
    			{
    				string aa;
    				TrieNode*nn = node;
    				while (nn != root)
    				{
    					aa += nn->p;
    					nn = nn->parent;
    				}
    				std::reverse(aa.begin(), aa.end());
    				if (re.find(aa) == re.end())
    				{
    					re.insert(aa);
    					count++;
    				}
    			}
    			if (!node->fail.empty())
    			{
    				for (int i = 0; i < node->fail.size(); i++)
    					if (node->fail[i]->isword)
    					{
    						string aa;
    						TrieNode*nn = node->fail[i];
    						while (nn != root)
    						{
    							aa += nn->p;
    							nn = nn->parent;
    						}
    						std::reverse(aa.begin(), aa.end());
    						if (re.find(aa) == re.end())
    						{
    							re.insert(aa);
    							count++;
    						}
    					}
    			}
    			k++;
    			p = node;
    			node = node->next[str[k] - 'a'];
    		}
    
    		k--;
    		node = p;
    		_ASSERT(node);
    		if (node->fail.empty())
    		{
    			k++;
    		}
    		else
    		{
    			int max = 0;
    			TrieNode*tn, *tp;
    			tn = NULL;
    			int kk;
    			for (int i = 0; i < node->fail.size(); i++)
    			{
    				kk = 0;
    				tp = node->fail[i];
    				while (tp != NULL)
    				{
    					if (tp->isword)
    					{
    						string aa;
    						TrieNode*nn = tp;
    						while (nn != root)
    						{
    							aa += nn->p;
    							nn = nn->parent;
    						}
    						std::reverse(aa.begin(), aa.end());
    						if (re.find(aa) == re.end())
    						{
    							re.insert(aa);
    							count++;
    						}
    					}
    					if (!tp->fail.empty())
    					{
    						for (int i = 0; i < tp->fail.size(); i++)
    							if (tp->fail[i]->isword)
    							{
    								string aa;
    								TrieNode*nn = tp->fail[i];
    								while (nn != root)
    								{
    									aa += nn->p;
    									nn = nn->parent;
    								}
    								std::reverse(aa.begin(), aa.end());
    								if (re.find(aa) == re.end())
    								{
    									re.insert(aa);
    									count++;
    								}
    							}
    					}
    					kk++;
    					p = tp;
    					tp = tp->next[str[k + kk] - 'a'];
    				}
    				if (kk > max)
    				{
    					max = kk;
    					tn = p;
    					_ASSERT(tn);
    				}
    			}
    			if (!tn->fail.empty())
    			{
    				int maxlen=0;
    				for (int i = 0; i < tn->fail.size(); i++)
    				{
    					TrieNode*mm = tn->fail[i];
    					int kkk = 0;
    					while (mm != root)
    					{
    						mm = mm->parent;
    						kkk++;
    					}
    					if (kkk > maxlen)
    						maxlen = kkk;
    				}
    				k = k + kk - maxlen;
    			}
    			else
    			{
    				k = k + kk;
    			}
    		}//end of else
    	}
    
    	return count;
    
    }
    
    
    int _tmain(int argc, _TCHAR* argv[])
    {
    	TrieNode*root = initiate_Trie();
    	root = build_Trie_singleword(root, "hsay");
    	root = build_Trie_singleword(root, "ah");
    	root = build_Trie_singleword(root, "sahe");
    	root = build_Trie_singleword(root, "he");
    	root = build_Trie_singleword(root, "say");
    	root = build_Trie_singleword(root, "herhb");
    	root = build_Trie_singleword(root, "aher");
    	root = build_Trie_singleword(root, "erhs");
    	
    
    	initiate_fail_pointer(root, root);
    	cout << endl;
    	cout << AC_automachine(root, "yasaherhsay") << endl;
    
    	system("pause");
    	return 0;
    }
    



    版权声明:

  • 相关阅读:
    [CF1042F]Leaf Sets
    [CF1051F]The Shortest Statement
    [洛谷P1792][国家集训队]种树
    [CF484E]Sign on Fence
    [洛谷P2216][HAOI2007]理想的正方形
    [洛谷P4389]付公主的背包
    [洛谷P4726]【模板】多项式指数函数
    服务器上Ubuntu系统安装
    删除ubuntu系统
    Win10下安装Ubuntu16.04双系统
  • 原文地址:https://www.cnblogs.com/walccott/p/4956885.html
Copyright © 2020-2023  润新知