【模版 Luogu P3808/P3796/P5357】AC自动机（简论）

【模版 Luogu P3808/P3796/P5357】AC自动机（简论）
　　浙江集训Day9，没有出任何实质性成果，只好把昨天打完的板子记一下。

　　该博客基于luogu的三道模版题。只有一个大致的讲解，主要提供代码给自己参考。

-----------------------------------------------------------------------

(7.14)

一、AC自动机

　　AC自动机，一个有着令人容易误会的名字的有限状态自动机结构，主要被应用在多模式串的文本匹配问题中。理解AC自动机，首先要熟悉KMP算法和字典树。使用KMP可以分开对每个模式串进行计数，但是对目标串的扫描次数会爆炸。实际上，KMP算法本身也可以从有限状态自动机的角度来理解（简单理解大概就是跳转能够达到的状态是有限个）。AC自动机与KMP都含有类似的fail指针结构。通俗理解fail的意义，我们先建立所有模式串的trie树（KMP则就是模式串本身），然后为每个节点a预处理出fail指针所指向的节点b：b对应“在当前点a匹配不下去了，我要找一个最长的、既是a所代表的串的真后缀、又是字典树中某个前缀的串”（这样的意义是，匹配到a就一定匹配到了b）。这就是构建自动机的过程。平凡的ACAM在匹配时，我们沿着字典树往下走，下一个字符失配就沿着fail边去跳转它的后缀代表的匹配状态，直到找到一个可以匹配文本串的下个字符的后缀状态为止。同时，每找到一个串，我们就要沿着fail边翻出它的后缀串，因为这些后缀也都被匹配到了。这就导致AC自动机的匹配复杂度有了可优化的空间。

　　所谓的trie图优化，就是在建立AC自动机时直接把失配的那个字符对应的边连到目标后缀上，这样可以省去每次失配跳fail边的麻烦。同时，一个串有很多后缀，但是并没有都出现在模式串中，中间空状态的跳转没有意义；那么我们就新开一个数组记录下它的第一个是结束节点的fail目标状态（即这个后缀存在于模式串中）的位置，然后每次沿着这个边跳转即可。

　　代码明天放，顺便安利让我学会AC自动机的dalao的两篇博客，受益匪浅。

　　https://www.cnblogs.com/sclbgw7/p/9260756.html　　（AC自动机的构建）

　　https://www.cnblogs.com/sclbgw7/p/9875671.html　　（AC自动机的两种优化）

---------------------------------------------------

(7.15)

二、代码

　　模版一：统计出现模式串的个数
1. #include <iostream>
2. #include <cstdio>
3. #include <cstring>
4. #include <queue>
// Debug helper: prints a marker line when a code path is reached.
#define BUG puts("findone")
// Capacity used for the text buffer and for every per-node array of the
// automaton. Parenthesised so that the macro expands as one value inside a
// larger expression (`2 * maxn` must mean 2 * 1000010, not 2 * 1000000 + 10).
#define maxn (1000000 + 10)
// Fast integer reader for stdin.
// Skips every character up to the first decimal digit, then parses the run of
// digits that follows into x. The character right after the number is consumed.
//   x - receives the parsed value; it is 0 when the input ends before any
//       digit is seen.
// A '-' met anywhere in the skipped prefix makes the result negative.
template <typename T>
void read(T &x) {
    x = 0;
    bool negative = false;
    // getchar() is kept as an int so EOF stays distinguishable from real
    // characters; without the EOF test the skip loop spins forever once the
    // input is exhausted. Comparing against '0'..'9' directly also avoids
    // calling isdigit() on a possibly negative char.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        if (ch == '-')
            negative = true;
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        x = x * 10 + (ch - '0');
        ch = getchar();
    }
    if (negative)
        x = -x;
}
24. using namespace std;
25. char s[maxn];
26. namespace ACAM {
27. int trie[26][maxn], pi[maxn], cnt[maxn], last[maxn];
28. const int root(1);
29. int tot = 1;
30. void Insert(char *s) {
31. int nd = root, len = strlen(s);
32. for (int i = 0; i < len; ++i) {
33. int c = s[i] - 'a';
34. if (!trie[c][nd])
35. trie[c][nd] = ++tot;
36. nd = trie[c][nd];
37. }
38. ++cnt[nd];
39. }
40. void Build_ACAM() {
41. for (int i = 0; i < 26; ++i)
42. trie[i][0] = root;
43. pi[root] = 0;
44. queue<int> que;
45. que.push(root);
46. while (!que.empty()) {
47. int nd = que.front();
48. que.pop();
49. for (int c = 0; c < 26; ++c) {
50. if (!trie[c][nd]) {
51. // trie[c][nd] = trie[c][pi[nd]]; //这句话就是trie图优化，这题不用它反而跑得更快……
52. continue;
53. }
54. int son = trie[c][nd], nxt = pi[nd];
55. while (nxt && !trie[c][nxt])
56. nxt = pi[nxt];
57. pi[son] = trie[c][nxt];
58. last[son] = cnt[pi[son]] ? pi[son] : last[pi[son]]; //last优化，它在三道题中都很优秀
59. que.push(son);
60. }
61. }
62. }
63. int Match(char *s) {
64. int len = strlen(s), nd = root, ans = 0;
65. for (int i = 0; i < len; ++i) {
66. int c = s[i] - 'a';
67. while (nd && !trie[c][nd]) { //如果加了trie图优化就不用每次跳fail边来找后缀，因为trie图优化直接记录可匹配的下一个后缀
68. nd = pi[nd];
69. }
70. nd = trie[c][nd];
71. for (int t = nd; t && ~cnt[t]; t = last[t])
72. ans += cnt[t], cnt[t] = -1;
73. }
74. return ans;
75. }
77. } using namespace ACAM;
78. int main() {
79. // freopen("testdata.txt", "r", stdin);
80. // freopen("ans.txt", "w", stdout);
81. int n;
82. scanf("%d", &n);
83. for (int i = 1; i <= n; ++i) {
84. scanf("%s", s);
85. Insert(s);
86. }
87. Build_ACAM();
88. scanf("%s", s);
89. cout << Match(s);
90. return 0;
91. }
模版二：AC自动机（加强版）：多组数据，输出出现最多的串的出现次数，按输入顺序输出这些串。
1. #include <iostream>
2. #include <cstdio>
3. #include <cstring>
4. #include <queue>
5. #include <vector>
// Debug helper: prints a marker line when a code path is reached.
#define BUG puts("findone")
// Capacity of the per-node arrays of the automaton. Parenthesised so that the
// macro expands as one value inside a larger expression
// (`2 * maxn` must mean 2 * 10510, not 2 * 70 * 150 + 10).
#define maxn (70 * 150 + 10)
// Fast integer reader for stdin.
// Skips every character up to the first decimal digit, then parses the run of
// digits that follows into x. The character right after the number is consumed.
//   x - receives the parsed value; it is 0 when the input ends before any
//       digit is seen.
// A '-' met anywhere in the skipped prefix makes the result negative.
template <typename T>
void read(T &x) {
    x = 0;
    bool negative = false;
    // getchar() is kept as an int so EOF stays distinguishable from real
    // characters; without the EOF test the skip loop spins forever once the
    // input is exhausted. Comparing against '0'..'9' directly also avoids
    // calling isdigit() on a possibly negative char.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        if (ch == '-')
            negative = true;
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        x = x * 10 + (ch - '0');
        ch = getchar();
    }
    if (negative)
        x = -x;
}
using namespace std;

// s: text buffer. T: patterns of the current test case, stored 1-based so that
// T[i] is the i-th pattern read by main().
char s[1000010], T[151][80];

// Aho-Corasick automaton used by template 2: finds the patterns that occur
// most often in a text.
namespace ACAM {
// Capacity of the per-node arrays (same value the file-level `maxn` macro
// expands to; presumably 150 patterns of at most 70 letters plus slack).
const int kMaxNodes = 70 * 150 + 10;

// trie[c][u] - transition of node u on letter c; after Build_ACAM() every
//              entry of a reachable node is filled in (trie-graph form).
// pi[u]      - fail link of node u.
// cnt[u]     - number of patterns ending at u.
// last[u]    - nearest node on u's fail chain at which a pattern ends (0 if none).
// sum[u]     - how many times the pattern ending at u was matched.
// id[u]      - input index of the pattern ending at u.
int trie[26][kMaxNodes], pi[kMaxNodes], cnt[kMaxNodes], last[kMaxNodes], sum[kMaxNodes], id[kMaxNodes];
const int root(1);
int tot = 1;

// Adds the lowercase pattern s to the trie and remembers that it was the
// pos-th pattern of the input. If the same pattern is inserted twice, only
// the later index is kept.
void Insert(char *s, int pos) {
    int nd = root;
    for (char *p = s; *p != '\0'; ++p) {
        int &child = trie[*p - 'a'][nd];
        if (!child)
            child = ++tot;
        nd = child;
    }
    ++cnt[nd];
    id[nd] = pos;
}

// Computes fail links, the "last" shortcuts and the completed transition
// table by a breadth-first walk. Node 0 is a virtual parent of the root whose
// edges all lead to the root.
void Build_ACAM() {
    for (int c = 0; c < 26; ++c)
        trie[c][0] = root;
    pi[root] = 0;
    queue<int> que;
    que.push(root);
    while (!que.empty()) {
        int nd = que.front();
        que.pop();
        for (int c = 0; c < 26; ++c) {
            // pi[nd] is shallower than nd, so it has already been processed
            // and its transition on c is complete: no fail-chain loop needed.
            int fallback = trie[c][pi[nd]];
            int son = trie[c][nd];
            if (!son) {
                // Trie-graph optimisation: a missing edge goes straight to
                // the state the fail chain would eventually reach.
                trie[c][nd] = fallback;
                continue;
            }
            pi[son] = fallback;
            last[son] = cnt[fallback] ? fallback : last[fallback];
            que.push(son);
        }
    }
}

// Runs the lowercase text s through the automaton, then prints the highest
// occurrence count on one line followed by every pattern reaching that count,
// one per line, in input order.
void Match(char *s) {
    int nd = root;
    for (char *p = s; *p != '\0'; ++p) {
        nd = trie[*p - 'a'][nd];
        for (int t = nd; t; t = last[t])
            if (cnt[t])
                ++sum[t];
    }

    // Only nodes where a pattern ends take part in the ranking; inner trie
    // nodes (and the root) are not patterns and must never be printed.
    int best = 0, max_id = 0;
    for (int u = 1; u <= tot; ++u) {
        if (!cnt[u])
            continue;
        if (sum[u] > best)
            best = sum[u];
        if (id[u] > max_id)
            max_id = id[u];
    }

    // Trie-node numbering does not follow input order (a pattern that is a
    // prefix of an earlier one ends at a smaller node), so map input index ->
    // node and walk the indices in increasing order.
    vector<int> node_of(max_id + 1, 0);
    for (int u = 1; u <= tot; ++u)
        if (cnt[u] && id[u] >= 0)
            node_of[id[u]] = u;

    printf("%d\n", best);
    for (int i = 0; i <= max_id; ++i)
        if (node_of[i] && sum[node_of[i]] == best)
            puts(T[i]);
}

}  // namespace ACAM
using namespace ACAM;
89. int main() {
90. // freopen("testdata.txt", "r", stdin);
91. // freopen("ans.txt", "w", stdout);
92. ios::sync_with_stdio(0); //某种加快iostream的黑科技据称读入字符串飞快
93. cin.tie(0);
94. while (19260817) {
95. int n;
96. cin >> n;
97. if(n == 0) break;
98. tot = 1;
99. memset(trie, 0, sizeof(trie));
100. memset(sum, 0, sizeof(sum));
101. memset(cnt, 0, sizeof(cnt));
102. memset(pi, 0, sizeof(pi));
103. memset(last, 0, sizeof(last));
104. memset(id, 0, sizeof(id));
105. for (int i = 1; i <= n; ++i) {
106. cin >> T[i];
107. Insert(T[i], i);
108. }
109. Build_ACAM();
110. cin >> s;
111. Match(s);
112. }
113. return 0;
114. }
模版三：AC自动机（二次加强版）：统计每个模式串出现的次数。一开始的策略是每到一个位置就暴力跳last边来找后缀，但是时间只有1000ms，T掉了几个点。参看题解给出的解法是：先在匹配时统计每个状态被经过的次数，然后从fail[u]向u连边，构成一棵树。这棵树被称作fail树，满足每个节点的祖先都是它的后缀。这样，每个模式串的出现次数就是它自己的出现次数+以它为后缀的串的出现次数，也就是fail树上以它为根的子树内所有状态的出现次数之和（注意不是子树的节点个数）。trie树上某状态的祖先则是它的前缀。fail树的性质很好，也具有广泛的应用。
1. #include <iostream>
2. #include <cstdio>
3. #include <cstring>
4. #include <queue>
5. #define maxs 2000010
6. #define maxn 200010
using namespace std;

// Capacities (same values the file-level `maxn` / `maxs` macros expand to):
// kMaxNodes bounds the pattern buffer, the trie nodes and the fail-tree edges;
// kMaxText bounds the text buffer.
const int kMaxNodes = 200010;
const int kMaxText = 2000010;

// T: buffer for the pattern currently being read. s: text buffer.
char T[kMaxNodes], s[kMaxText];

// Linked adjacency lists of the fail tree: head[u] is the index of the most
// recently added edge leaving u, edge[i].nxt the previous edge with the same
// origin (0 terminates the list). top is the number of edges stored so far.
int head[kMaxNodes], top;
struct E {
    int to, nxt;
} edge[kMaxNodes];

// Adds the directed edge u -> v at the front of u's list.
void Insert_edge(int u, int v) {
    E &slot = edge[++top];
    slot.to = v;
    slot.nxt = head[u];
    head[u] = top;
}

// Aho-Corasick automaton used by template 3: counts the occurrences of every
// pattern in a text by summing visit counts over the fail tree.
namespace ACAM {
// trie[c][u] - transition of node u on letter c; after Build() every entry of
//              a reachable node is filled in (trie-graph form).
// cnt[u]     - visits of state u during Match(); after Match() the total over
//              u's fail-tree subtree.
// pi[u]      - fail link of node u.
// last[u]    - nearest node on u's fail chain at which a pattern ends; it is
//              computed by Build() but not read anywhere in this block.
// end[u]     - number of patterns ending at u.
// id[k]      - trie node at which the k-th pattern ends.
int trie[26][kMaxNodes], tot = 1, cnt[kMaxNodes], pi[kMaxNodes], last[kMaxNodes], end[kMaxNodes];
int id[kMaxNodes];
const int root(1);

// Scratch buffer for dfs(): nodes of the traversed subtree, parents first.
int visit_order[kMaxNodes];

// Adds the lowercase pattern s to the trie and records the node where the
// k-th pattern ends. Identical patterns simply share one node.
void Insert(char *s, int k) {
    int nd = root;
    for (char *p = s; *p != '\0'; ++p) {
        int &child = trie[*p - 'a'][nd];
        if (!child)
            child = ++tot;
        nd = child;
    }
    ++end[nd];
    id[k] = nd;
}

// Computes fail links, the "last" shortcuts and the completed transition
// table by a breadth-first walk. Node 0 is a virtual parent of the root whose
// edges all lead to the root.
void Build() {
    for (int c = 0; c < 26; ++c)
        trie[c][0] = root;
    queue<int> que;
    que.push(root);
    while (!que.empty()) {
        int nd = que.front();
        que.pop();
        for (int c = 0; c < 26; ++c) {
            // pi[nd] is shallower than nd, so it has already been processed
            // and its transition on c is complete: no fail-chain loop needed.
            int fallback = trie[c][pi[nd]];
            int son = trie[c][nd];
            if (!son) {
                trie[c][nd] = fallback;
                continue;
            }
            pi[son] = fallback;
            last[son] = end[fallback] ? fallback : last[fallback];
            que.push(son);
        }
    }
}

// Accumulates cnt over the subtree of u in the tree stored in head/edge:
// afterwards cnt[x] holds the sum of the original cnt values in x's subtree
// for every x below u. Written iteratively because the fail tree can be a
// single path as long as the total pattern length, which is too deep for
// recursion on a default-sized stack.
void dfs(int u) {
    int size = 0;
    visit_order[size++] = u;
    // Parents are listed before their children.
    for (int i = 0; i < size; ++i)
        for (int e = head[visit_order[i]]; e; e = edge[e].nxt)
            visit_order[size++] = edge[e].to;
    // Walking the list backwards finishes all children before their parent.
    for (int i = size - 1; i >= 0; --i) {
        int x = visit_order[i];
        for (int e = head[x]; e; e = edge[e].nxt)
            cnt[x] += cnt[edge[e].to];
    }
}

// Runs the lowercase text s through the automaton, counting how often each
// state is entered, then builds the fail tree and pushes the counts up it.
// Afterwards cnt[id[k]] is the number of occurrences of the k-th pattern.
// Intended to be called once: it appends the fail-tree edges on every call.
void Match(char *s) {
    int nd = root;
    for (char *p = s; *p != '\0'; ++p) {
        nd = trie[*p - 'a'][nd];
        ++cnt[nd];
    }
    for (int i = 2; i <= tot; ++i)
        Insert_edge(pi[i], i);  // fail tree: edge from fail[i] to i
    dfs(root);                  // subtree sums of the visit counts
}

}  // namespace ACAM
using namespace ACAM;
74. int main() {
75. ios::sync_with_stdio(0);
76. cin.tie(0);
77. int n;
78. cin >> n;
79. for (int i = 1; i <= n; ++i) {
80. cin >> T;
81. Insert(T, i);
82. }
83. Build();
84. cin >> s;
85. Match(s);
86. for (int i = 1; i <= n; ++i)
87. printf("%d ", cnt[id[i]]);
88. return 0;
89. }
相关阅读:
值类型引用类型的区别(转)
聚集索引和非聚集索引(转)
使用docker安装nginx并配置端口转发
 jenkins选择分支构建
 创建好docker后不能apt-get update解决方法
 Docker开启Remote API 访问 2375端口
 Docker加速配置
 Docker下载安装
 进入Docker 容器 docker exec [CONTAINER ID] bin/bash报错问题
 定时删除日志文件---linux定时清理日志
原文地址：https://www.cnblogs.com/TY02/p/11185917.html