• Python 遍历 Hadoop 数据，与指定列表对比，取出包含列表中值的记录。


    import sys
    import tstree
    
    fname = 'high_freq_site.list'
    tree = tstree.TernarySearchTrie()
    tree.loadData(fname)


    def emit(token, counter, first_post):
        """Print one result row for a finished url group.

        Nothing is printed unless ``tree.maxMatch(token)`` returns a truthy
        match.  The row is tab separated: match, url, summed count, smallest
        posttime seen for the url.
        """
        ret = tree.maxMatch(token)
        if ret:
            print('%s\t%s\t%s\t%s' % (ret, token, counter, first_post))


    token = ''         # url of the group being accumulated ('' = none yet)
    counter = 0        # sum of the count column for the current group
    first_post = None  # smallest posttime seen for the current group

    # Each input line: url, count, posttime (whitespace separated).
    # NOTE(review): grouping relies on equal urls arriving on consecutive
    # lines -- presumably the input is sorted by url; confirm with the job setup.
    for line in sys.stdin:
        arr = line.split()
        if len(arr) != 3:
            # Skip lines that do not have exactly three fields.
            continue

        url = arr[0]
        num = int(arr[1])
        posttime = int(arr[2])

        if url == token:
            counter += num
            if posttime < first_post:
                first_post = posttime
            continue

        # A new url starts here: flush the previous group, if there was one.
        if token != '':
            emit(token, counter, first_post)

        # Start the new group with this line's values.  The posttime must be
        # recorded here too, otherwise a url that occurs on a single line would
        # have no posttime at all and its minimum could be wrong.
        token = url
        counter = num
        first_post = posttime

    # Flush the last group; token is still '' when no valid line was read.
    if token != '':
        emit(token, counter, first_post)
    
    
    
    class TSTNode(object):
        """A single node of a ternary search trie, keyed on one character."""

        def __init__(self, splitchar):
            # Character this node is compared against.
            self.splitchar = splitchar
            # Payload slot; starts empty and is filled in by whoever owns the node.
            self.data = None
            # Child links: lower branch, equal (next character) branch, higher branch.
            self.loNode = self.eqNode = self.hiNode = None
    
    
    class TernarySearchTrie(object):
        """Ternary search trie supporting longest-stored-prefix lookups.

        ``addWord`` builds the node path for a word, ``loadData`` fills the
        trie from a file and tags each word's final node with the word itself,
        and ``maxMatch`` returns the longest tagged word that is a prefix of a
        query string.
        """

        def __init__(self):
            # Root node; stays None until the first word is added.
            self.rootNode = None

        def loadData(self, fname):
            """Add every line of the file *fname* to the trie.

            Each line is stripped of surrounding whitespace.  Lines that are
            empty after stripping are ignored (``addWord`` returns None for
            them).  For every other line the word is stored in the ``data``
            attribute of its final node.
            """
            # The context manager closes the file even when an insertion
            # raises, which a manual open()/close() pair does not.
            with open(fname) as f:
                for line in f:
                    line = line.strip()
                    node = self.addWord(line)
                    if node:
                        node.data = line

        def addWord(self, word):
            """Insert *word* and return the node of its last character.

            Returns None when *word* is empty.  The returned node's ``data`` is
            not modified here; callers set it themselves.
            """
            if not word:
                return None

            if not self.rootNode:
                self.rootNode = TSTNode(word[0])

            charIndex = 0
            currentNode = self.rootNode

            while True:
                charComp = ord(word[charIndex]) - ord(currentNode.splitchar)
                if charComp == 0:
                    # Character matched: advance in the word and follow the
                    # "equal" link, creating it when missing.
                    charIndex += 1
                    if charIndex == len(word):
                        return currentNode
                    if not currentNode.eqNode:
                        currentNode.eqNode = TSTNode(word[charIndex])
                    currentNode = currentNode.eqNode
                elif charComp < 0:
                    # Same character, smaller than this node's: go low.
                    if not currentNode.loNode:
                        currentNode.loNode = TSTNode(word[charIndex])
                    currentNode = currentNode.loNode
                else:
                    # Same character, larger than this node's: go high.
                    if not currentNode.hiNode:
                        currentNode.hiNode = TSTNode(word[charIndex])
                    currentNode = currentNode.hiNode

        def maxMatch(self, url):
            """Return the ``data`` of the longest stored prefix of *url*.

            Walks the trie along the characters of *url* and remembers the most
            recent truthy ``data`` found on a matched node.  Returns None when
            no such node is found, including for an empty trie or an empty
            *url*.
            """
            ret = None
            currentNode = self.rootNode
            charIndex = 0
            # Stop when the trie path ends or the whole url has been consumed.
            while currentNode is not None and charIndex < len(url):
                charComp = ord(url[charIndex]) - ord(currentNode.splitchar)
                if charComp == 0:
                    charIndex += 1
                    if currentNode.data:
                        # Deeper matches overwrite shallower ones, so the
                        # longest prefix wins.
                        ret = currentNode.data
                    currentNode = currentNode.eqNode
                elif charComp < 0:
                    currentNode = currentNode.loNode
                else:
                    currentNode = currentNode.hiNode
            return ret
    
    
    if __name__ == '__main__':
        # Command-line check: load the site list, then print the longest
        # matching entry (or None) for every line read from standard input.
        import sys

        trie = TernarySearchTrie()
        trie.loadData('high_freq_site.list')

        for raw in sys.stdin:
            print(trie.maxMatch(raw.strip()))
  • 相关阅读:
    2.15 STL复习
    20190214Test(栈与队列)
    STL列表链式前向星
    链式前向星(邻接表)
    Priority_queue详解
    List详解
    NOIP2019计划
    第二章笔记
    第一章笔记
    本地文件上传GitHub
  • 原文地址:https://www.cnblogs.com/i80386/p/5058584.html
Copyright © 2020-2023  润新知