• 不同关键词查找方法性能比较


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # author:ShidongDu time:2020/6/3
    import re
    import time
    from collections import deque

    import pandas as pd
    
    # 结点类
    class node:
        """A single node of the pattern trie used by the Aho-Corasick automaton.

        Attributes:
            ch: the character stored in this node.
            fail: failure link; ``None`` until it is assigned by the automaton.
            tail: ``0`` when no pattern ends here, otherwise the id ``i`` of the
                i-th pattern that ends at this node.
            child: child nodes, kept parallel to ``childvalue``.
            childvalue: characters of the child nodes, in the same order as
                ``child``.
        """

        def __init__(self, ch):
            # Each instance gets its own fresh pair of (parallel) child lists.
            self.child, self.childvalue = [], []
            self.tail = 0
            self.fail = None
            self.ch = ch
    
    # AC自动机类
    class Aho_Corasick:
        """Aho-Corasick automaton that counts occurrences of many patterns at once.

        Usage: call ``insert`` once per pattern, then ``ac_automation`` once to
        compute the failure links, then ``runkmp`` for every text to scan.
        """

        def __init__(self):
            self.root = node("")  # root of the pattern trie
            self.count = 0  # number of patterns inserted so far

        # Step 1: build the trie from the patterns
        def insert(self, strkey):
            """Add the pattern ``strkey`` to the trie.

            The pattern is given the id ``self.count`` (after incrementing, so
            ids start at 1), stored in ``tail`` of the node that ends it.
            Inserting the same pattern again overwrites the id with the newer
            one.
            """
            self.count += 1
            p = self.root
            for ch in strkey:
                if ch in p.childvalue:  # character already present: descend
                    p = p.child[p.childvalue.index(ch)]
                else:  # otherwise create the missing child and descend into it
                    child = node(ch)
                    p.child.append(child)
                    p.childvalue.append(ch)
                    p = child
            p.tail = self.count  # mark the end of this pattern

        # Step 2: compute the failure links
        def ac_automation(self):
            """Assign the ``fail`` link of every node with a breadth-first walk.

            Call this after the last ``insert`` and before ``runkmp``: nodes
            created afterwards keep ``fail = None`` until it is called again.
            """
            # A deque gives O(1) pops from the front; popping the head of a
            # list shifts every remaining element.
            queue = deque([self.root])
            while queue:
                parent = queue.popleft()
                for child in parent.child:
                    if parent is self.root:
                        # Children of the root always fall back to the root.
                        child.fail = self.root
                    else:
                        # Follow the parent's failure chain until some node has
                        # a child carrying the same character.
                        p = parent.fail
                        while p is not None:
                            if child.ch in p.childvalue:
                                child.fail = p.child[p.childvalue.index(child.ch)]
                                break
                            p = p.fail
                        if p is None:
                            # The chain ran past the root without a hit.
                            child.fail = self.root
                    queue.append(child)

        # Step 3: pattern matching
        def runkmp(self, strmode):
            """Scan ``strmode`` and count how often each pattern occurs.

            :param strmode: the text to scan.
            :return: dict mapping pattern id (the ``tail`` value assigned by
                ``insert``) to its number of occurrences; patterns that never
                occur are absent.
            """
            p = self.root
            cnt = {}
            for ch in strmode:
                # Fall back along failure links until ch can be consumed or the
                # root is reached.
                while ch not in p.childvalue and p is not self.root:
                    p = p.fail
                if ch in p.childvalue:
                    p = p.child[p.childvalue.index(ch)]
                else:
                    p = self.root
                # Every node on the failure chain of p is a suffix of the text
                # read so far; report those that end a pattern.
                temp = p
                while temp is not self.root:
                    if temp.tail:  # tail == 0 means no pattern ends here
                        cnt[temp.tail] = cnt.get(temp.tail, 0) + 1
                    temp = temp.fail
            return cnt
            # To know only whether anything matched, use bool(cnt);
            # for the number of distinct patterns matched, use len(cnt).
    
    class Trie:
        """Prefix tree stored as nested dicts keyed by character.

        A node that ends an inserted word additionally holds the key
        ``self.word_end`` (mapped to ``True``).
        """

        def __init__(self):
            """Create an empty trie."""
            self.word_end = -1
            self.root = {}
            self.name = 'Tire'

        def _descend(self, chars):
            """Follow ``chars`` from the root; return the node reached or ``None``."""
            current = self.root
            for ch in chars:
                if ch not in current:
                    return None
                current = current[ch]
            return current

        def insert(self, word):
            """
            Insert ``word`` into the trie, creating missing nodes on the way.
            :type word: str
            :rtype: None
            """
            current = self.root
            for ch in word:
                current = current.setdefault(ch, {})
            current[self.word_end] = True

        def search(self, word):
            """
            Return whether exactly ``word`` was inserted into the trie.
            :type word: str
            :rtype: bool
            """
            reached = self._descend(word)
            # The path may exist only as a prefix of a longer word.
            return reached is not None and self.word_end in reached

        def startsWith(self, prefix):
            """
            Return whether any inserted word starts with ``prefix``.
            :type prefix: str
            :rtype: bool
            """
            return self._descend(prefix) is not None
    
    
    class Solution:
        def __init__(self, dict_file: str, besides=None):
    
            self.besides = besides
            self.key_word_list = []
            self.a_dict = self.read_xls(dict_file)
    
        def read_xls(self, file) -> dict:
            """Read the keyword dictionary from the Excel sheet ``file``.

            Every column whose header does not contain 'Unnamed' is treated as
            a category. Its cells are read top to bottom and reading stops at
            the first cell that is not a string (presumably the NaN padding of
            a short column -- verify against the actual sheet). Each keyword is
            appended to ``self.key_word_list`` and, when a helper structure was
            supplied, passed to ``self.besides.insert``.

            :param file: anything ``pandas.read_excel`` accepts.
            :return: dict mapping category name to its list of keywords.
            """
            a_dict = {}
            sheet = pd.read_excel(file)
            for key in sheet.keys():
                if 'Unnamed' in key:
                    continue
                words = []
                a_dict[key] = words
                for word in sheet[key]:
                    if not isinstance(word, str):
                        break
                    self.key_word_list.append(word)
                    words.append(word)
                    if self.besides:
                        self.besides.insert(word)
            return a_dict
    
    ####################################################################
        def BF(self, word, doc):
            """Count occurrences of ``word`` in ``doc`` by brute force.

            Every start position is compared against ``word``, so overlapping
            occurrences are all counted.

            :param word: keyword to look for; an empty keyword never matches.
            :param doc: text to search.
            :return: ``(word, count)`` when the keyword occurs, else ``None``.
            """
            length = len(word)
            if not length:
                return None
            # The last valid start is len(doc) - length, hence the "+ 1";
            # without it a match ending at the last character is missed.
            count = sum(
                1
                for start in range(len(doc) - length + 1)
                if doc[start:start + length] == word
            )
            return (word, count) if count else None
    
    ####################################################################
        def KMP(self, word: str, doc: str):
            """Count occurrences of ``word`` in ``doc`` with Knuth-Morris-Pratt.

            Overlapping occurrences are all counted.

            :param word: keyword to look for; an empty keyword never matches.
            :param doc: text to search.
            :return: ``(word, count)`` when the keyword occurs, else ``None``.
            """
            if not word:
                # The failure table below needs at least one character.
                return None

            n = len(word)
            # fail[i] is the index of the last character of the longest proper
            # prefix of word[:i + 1] that is also its suffix, or -1 if none.
            fail = [-1] * n
            j = -1
            for i in range(1, n):
                while j != -1 and word[i] != word[j + 1]:
                    j = fail[j]
                if word[i] == word[j + 1]:
                    j += 1
                fail[i] = j

            count = 0
            j = -1  # index of the last keyword character matched so far
            for ch in doc:
                while j != -1 and ch != word[j + 1]:
                    j = fail[j]
                if ch == word[j + 1]:
                    j += 1
                if j == n - 1:
                    count += 1
                    # Fall back so that an overlapping occurrence is still found.
                    j = fail[j]
            return (word, count) if count else None
    
    ####################################################################
        def Re(self, word: str, doc: str):
            """Report whether ``word`` occurs in ``doc`` using the ``re`` module.

            The keyword is escaped first so that it is matched literally;
            otherwise regex metacharacters in it would change the match or
            raise ``re.error``.

            :param word: keyword to look for.
            :param doc: text to search.
            :return: ``(word, 1)`` when the keyword occurs (occurrences are not
                counted), else ``None``.
            """
            res = re.search(re.escape(word), doc)
            return (word, 1) if res else None
    
    ####################################################################
        def Tire_Tree(self, doc: str):
            res = []
            for i in range(len(doc)):
                if doc[i] not in self.besides.root:
                    continue
                else:
                    tmp = self.besides.root[doc[i]]
                    j = i+1
                    while j <= len(doc)-1:
                        if doc[j] in tmp:
                            if -1 in tmp[doc[j]] :
                                res.append(doc[i: j+1])
                                break
                            else:
                                tmp = tmp[doc[j]]
                                j += 1
                        else:
                            break
            if res:
                return res
    
    ####################################################################
        def Aho_Corasick(self, doc: str):
            res = []
            d = self.besides.runkmp(doc)
            for key in d.keys():
                res.append( (self.key_word_list[key], d[key]) )
            return res
    
    ####################################################################
    
    
        def operation(self, algorithm: str, file_name: str):
            res = []
            if algorithm == 'BF':
                algo = self.BF
            if algorithm == 'KMP':
                algo = self.KMP
            if algorithm == 'Re':
                algo = self.Re
            if algorithm == 'Tire':
                algo = self.Tire_Tree
                with open(file_name, 'r', encoding='utf-8') as f:
                    textlines = f.readlines()
                    for text in textlines:
                        word_pos = []
                        word_pos.append( (algo(text), text) )
                        res.append((word_pos, text))
                return res
    
            if algorithm == 'Aho_Corasick':
                algo = self.Aho_Corasick
                self.besides.ac_automation()
                with open(file_name, 'r', encoding='utf-8') as f:
                    textlines = f.readlines()
                    for text in textlines:
                        word_nums = []
                        word_nums.append( (algo(text), text) )
                        res.append((word_nums, text))
                return res
    
            with open(file_name, 'r', encoding='utf-8') as f:
                textlines = f.readlines()
                for text in textlines:
                    word_pos = []
                    for key in self.a_dict.keys():
                        for word in self.a_dict[key]:
                            tmp = algo(word, text)
                            if tmp:
                                word_pos.append((tmp, key))
                    res.append( (word_pos, text) )
            return res
    
    
    
    if __name__ == '__main__':
        # Time the whole run: loading the dictionary, building the automaton,
        # matching every line of the input and writing the results.
        time1 = time.time()
        aho_corasick = Aho_Corasick()
        solution = Solution('key_word-update.xlsx', aho_corasick)
        res = solution.operation('Aho_Corasick', 'all.txt')
        # To time a per-keyword matcher instead, build the solution without a
        # helper structure and pick 'BF', 'KMP' or 'Re':
        #     solution = Solution('key_word-update.xlsx')
        #     res = solution.operation('Re', 'all.txt')
        with open('res.txt', 'w', encoding='utf-8') as f:
            for item in res:
                # One result tuple per output line.
                f.write(str(item) + '\n')
        time2 = time.time()
        print(time2 - time1)
  • 相关阅读:
    Ubuntu18.04下的音频录制和编辑软件Ardour及QjackCtl(jackd gui)
    Adobe After Effects CS6 操作记录
    编译安装和apt安装Nginx1.14.0
    Centos7.4和Ubuntu18.04安装PHP7.2
    Ubuntu与Windows7双系统下, 系统时间不一致的问题
    关于Thinkpad的立体声麦克风输入
    坑人的小米净水器: 漏水, 废水堵塞, 费用陷阱
    Photoshop CS6 操作记录
    Win7 64bit下值得推荐的免费看图软件
    Redis C客户端Hiredis代码分析
  • 原文地址:https://www.cnblogs.com/demo-deng/p/13093190.html
Copyright © 2020-2023  润新知