#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:ShidongDu time:2020/6/3
"""Keyword matching over text lines with several string-search algorithms.

A keyword dictionary is loaded from an Excel sheet (one category per column)
and every line of a text file is scanned for those keywords using brute
force, KMP, regular expressions, a dict-based trie, or an Aho-Corasick
automaton.
"""
import re
import time
from collections import deque

import pandas as pd


class node:
    """A single node of the Aho-Corasick trie."""

    def __init__(self, ch):
        self.ch = ch          # character stored at this node
        self.fail = None      # failure link, filled in by ac_automation()
        self.tail = 0         # end marker: i > 0 means the i-th pattern ends here
        self.child = []       # child nodes
        self.childvalue = []  # characters of the children, parallel to `child`


class Aho_Corasick:
    """Aho-Corasick automaton for matching many patterns in one pass.

    Usage: call insert() for every pattern, then ac_automation() once to
    build the failure links, then runkmp() for each text to scan.
    """

    def __init__(self):
        self.root = node("")  # root of the pattern trie
        self.count = 0        # number of patterns inserted so far

    def insert(self, strkey):
        """Step 1: add the pattern `strkey` to the trie.

        The node where the pattern ends is tagged with the pattern's 1-based
        insertion number.
        """
        self.count += 1
        p = self.root
        for ch in strkey:
            if ch not in p.childvalue:
                # Character not present yet: create a new child node.
                child = node(ch)
                p.child.append(child)
                p.childvalue.append(ch)
                p = child
            else:
                # Otherwise descend into the existing child.
                p = p.child[p.childvalue.index(ch)]
        p.tail = self.count

    def ac_automation(self):
        """Step 2: compute the failure link of every node with a BFS."""
        queue = deque([self.root])  # deque gives O(1) pops from the left
        while queue:
            current = queue.popleft()
            for child in current.child:
                if current is self.root:
                    # Children of the root always fail back to the root.
                    child.fail = self.root
                else:
                    # Follow the parent's failure chain until a node with a
                    # matching child is found.
                    p = current.fail
                    while p is not None:
                        if child.ch in p.childvalue:
                            child.fail = p.child[p.childvalue.index(child.ch)]
                            break
                        p = p.fail
                    if p is None:
                        # Chain exhausted without a match: fail to the root.
                        child.fail = self.root
                queue.append(child)

    def runkmp(self, strmode):
        """Step 3: scan `strmode` and count the occurrences of each pattern.

        ac_automation() must have been called first; the scan follows the
        failure links it sets.

        Returns a dict mapping a pattern's 1-based insertion number to the
        number of times it occurs in `strmode`. Use ``bool(result)`` to test
        for any match and ``len(result)`` for the number of distinct
        patterns matched.
        """
        p = self.root
        cnt = {}
        for ch in strmode:
            # On a mismatch fall back along the failure links.
            while ch not in p.childvalue and p is not self.root:
                p = p.fail
            if ch in p.childvalue:
                p = p.child[p.childvalue.index(ch)]
            else:
                p = self.root
            # Every node on the failure chain is a suffix of the current
            # match; report each one that ends a pattern.
            temp = p
            while temp is not self.root:
                if temp.tail:  # a tail of 0 means no pattern ends here
                    cnt[temp.tail] = cnt.get(temp.tail, 0) + 1
                temp = temp.fail
        return cnt


class Trie:
    """Prefix tree built from nested dicts keyed by character."""

    def __init__(self):
        """Initialize an empty trie."""
        self.name = 'Tire'
        self.root = {}
        self.word_end = -1  # sentinel key marking the end of a word

    def insert(self, word):
        """Insert `word` into the trie."""
        cur = self.root
        for c in word:
            cur = cur.setdefault(c, {})
        cur[self.word_end] = True

    def search(self, word):
        """Return True if `word` was inserted into the trie as a whole word."""
        cur = self.root
        for c in word:
            if c not in cur:
                return False
            cur = cur[c]
        # The path exists; it is a word only if the end marker is present.
        return self.word_end in cur

    def startsWith(self, prefix):
        """Return True if any inserted word starts with `prefix`."""
        cur = self.root
        for c in prefix:
            if c not in cur:
                return False
            cur = cur[c]
        return True


class Solution:
    """Load a keyword dictionary and match it against the lines of a file."""

    def __init__(self, dict_file: str, besides=None):
        """Read the keyword sheet `dict_file`.

        `besides` is an optional index object (a Trie or an Aho_Corasick)
        that receives every keyword through its insert() method.
        """
        self.besides = besides
        self.key_word_list = []
        self.a_dict = self.read_xls(dict_file)

    def read_xls(self, file) -> dict:
        """Read the Excel sheet `file` into ``{column name: [keywords]}``.

        Columns whose name contains 'Unnamed' are skipped. Within a column,
        reading stops at the first cell that is not a string. Each keyword
        is also appended to ``self.key_word_list`` and, when an index was
        supplied, inserted into ``self.besides``.
        """
        sheet = pd.read_excel(file)
        a_dict = {}
        for key in sheet.keys():
            if 'Unnamed' in key:
                continue
            a_dict[key] = []
            for word in sheet[key]:
                if not isinstance(word, str):
                    break
                self.key_word_list.append(word)
                a_dict[key].append(word)
                if self.besides is not None:
                    self.besides.insert(word)
        return a_dict

    ####################################################################
    def BF(self, word, doc):
        """Brute-force search: count occurrences of `word` in `doc`.

        Overlapping occurrences are counted. Returns ``(word, count)`` or
        None when there is no occurrence or `word` is empty.
        """
        if not word:
            return None
        length = len(word)
        # "+ 1" so the last candidate position (a match ending exactly at
        # the end of doc) is examined too.
        hits = sum(
            1
            for i in range(len(doc) - length + 1)
            if doc.startswith(word, i)
        )
        return (word, hits) if hits else None

    ####################################################################
    def KMP(self, word: str, doc: str):
        """Knuth-Morris-Pratt search: count occurrences of `word` in `doc`.

        Overlapping occurrences are counted. Returns ``(word, count)`` or
        None when there is no occurrence or `word` is empty.
        """
        if not word:
            # The failure table below needs at least one character.
            return None

        def get_next(pattern: str):
            """Build the failure table: table[i] is the index of the last
            character of the longest proper border of pattern[:i + 1], or
            -1 if there is none."""
            size = len(pattern)
            table = [0] * size
            table[0] = -1
            k = -1
            for i in range(1, size):
                while k != -1 and pattern[i] != pattern[k + 1]:
                    k = table[k]
                if pattern[i] == pattern[k + 1]:
                    k += 1
                table[i] = k
            return table

        fallback = get_next(word)
        last = len(word) - 1
        hits = 0
        j = -1  # index of the last matched character of `word`
        for ch in doc:
            while j != -1 and ch != word[j + 1]:
                j = fallback[j]
            if ch == word[j + 1]:
                j += 1
            if j == last:
                hits += 1
                j = fallback[j]  # continue so overlapping matches are found
        return (word, hits) if hits else None

    ####################################################################
    def Re(self, word: str, doc: str):
        """Regex search: return ``(word, 1)`` if `word` is found in `doc`.

        NOTE: `word` is passed to ``re.search`` as a pattern, so regex
        metacharacters in a keyword are interpreted rather than matched
        literally. The count is always 1 regardless of how many matches
        exist.
        """
        res = re.search(word, doc)
        return (word, 1) if res else None

    ####################################################################
    def Tire_Tree(self, doc: str):
        """Trie search: list the keywords found in `doc`.

        ``self.besides`` must be a Trie. For every start position the
        shortest keyword of at least two characters beginning there is
        reported. Returns the list of matched substrings, or None when
        nothing matched.

        NOTE(review): single-character keywords are never reported, and a
        longer keyword is hidden by a shorter one sharing its prefix.
        """
        trie_root = self.besides.root
        word_end = self.besides.word_end
        res = []
        for i, first in enumerate(doc):
            if first not in trie_root:
                continue
            tmp = trie_root[first]
            j = i + 1
            while j < len(doc) and doc[j] in tmp:
                tmp = tmp[doc[j]]
                if word_end in tmp:
                    res.append(doc[i: j + 1])
                    break
                j += 1
        return res or None

    ####################################################################
    def Aho_Corasick(self, doc: str):
        """Aho-Corasick search: list ``(keyword, count)`` pairs for `doc`.

        ``self.besides`` must be an Aho_Corasick automaton whose failure
        links have been built, and which contains exactly the keywords that
        read_xls() inserted (the mapping back to keywords relies on that).
        """
        matches = self.besides.runkmp(doc)
        # Tail markers are 1-based insertion numbers, while key_word_list
        # is 0-based, hence the "- 1".
        return [
            (self.key_word_list[tail - 1], hits)
            for tail, hits in matches.items()
        ]

    ####################################################################
    def operation(self, algorithm: str, file_name: str):
        """Run `algorithm` over every line of the UTF-8 file `file_name`.

        `algorithm` is one of 'BF', 'KMP', 'Re', 'Tire' or 'Aho_Corasick'.

        Returns one entry per line:
          * 'BF' / 'KMP' / 'Re': ``([((word, count), category), ...], line)``
          * 'Tire' / 'Aho_Corasick': ``([(matches, line)], line)``

        Raises ValueError for an unknown algorithm name.
        """
        per_word = {'BF': self.BF, 'KMP': self.KMP, 'Re': self.Re}
        whole_line = {
            'Tire': self.Tire_Tree,
            'Aho_Corasick': self.Aho_Corasick,
        }
        if algorithm not in per_word and algorithm not in whole_line:
            raise ValueError('unknown algorithm: %r' % (algorithm,))

        if algorithm == 'Aho_Corasick':
            # Failure links must exist before the automaton can scan text.
            self.besides.ac_automation()

        with open(file_name, 'r', encoding='utf-8') as f:
            textlines = f.readlines()

        res = []
        if algorithm in whole_line:
            # These algorithms search for all keywords in a single pass.
            algo = whole_line[algorithm]
            for text in textlines:
                res.append(([(algo(text), text)], text))
            return res

        # These algorithms search for one keyword at a time.
        algo = per_word[algorithm]
        for text in textlines:
            word_pos = []
            for key, words in self.a_dict.items():
                for word in words:
                    tmp = algo(word, text)
                    if tmp:
                        word_pos.append((tmp, key))
            res.append((word_pos, text))
        return res


if __name__ == '__main__':
    time1 = time.time()
    tire = Trie()  # unused below; pass it to Solution to run 'Tire' instead
    aho_corasick = Aho_Corasick()
    solution = Solution('key_word-update.xlsx', aho_corasick)
    res = solution.operation('Aho_Corasick', 'all.txt')
    # Alternative without an index:
    # solution = Solution('key_word-update.xlsx')
    # res = solution.operation('Re', 'all.txt')
    with open('res.txt', 'w', encoding='utf-8') as f:
        for _ in res:
            f.write(str(_) + ' ')
    time2 = time.time()
    print(time2 - time1)