Python 遍历 Hadoop 数据，与指定列表对比，取出包含列表中值的记录。

import sys
import tstree

# Aggregates "url count posttime" records read from stdin, one group per
# run of identical urls, and prints a line for every group whose url
# matches an entry of the site list loaded into the trie.
#
# NOTE(review): grouping only merges *consecutive* identical urls, so the
# input is presumably pre-sorted by url (e.g. by a Hadoop streaming
# shuffle) -- confirm against the job configuration.

fname = 'high_freq_site.list'
tree = tstree.TernarySearchTrie()
tree.loadData(fname)


def emit_group(url, total, posttimes):
    """Print one result line for a finished url group.

    Nothing is printed when the trie has no match for *url* or when the
    group collected no posttimes (which is the case before any record has
    been read).

    Output columns (tab separated): matched site, url, summed count,
    smallest posttime.
    """
    site = tree.maxMatch(url)
    if site and posttimes:
        # Single-argument print(...) behaves the same as the Python 2
        # print statement used elsewhere in this file.
        print('%s	%s	%s	%s' % (site, url, total, min(posttimes)))


token = ''      # url of the group currently being accumulated
counter = 0     # summed count of the current group
post = []       # posttimes seen for the current group

# url, count, posttime
for line in sys.stdin:
    arr = line.split()
    if len(arr) != 3:
        # Skip malformed records instead of aborting the whole run.
        continue

    url = arr[0]
    num = int(arr[1])
    posttime = int(arr[2])

    if token != url:
        # A new group starts: flush the previous one (a no-op for the
        # initial empty token) and reset the accumulators.
        emit_group(token, counter, post)
        token = url
        counter = 0
        post = []

    # Every record -- including the first record of a new group -- must
    # contribute both its count and its posttime.  The original code forgot
    # to record the posttime of the first record after a url change, which
    # dropped that value and silently suppressed single-record groups.
    counter += num
    post.append(posttime)

# Flush the last group once the input is exhausted.
emit_group(token, counter, post)



class TSTNode(object):
    """One node of a ternary search trie.

    A node carries a single split character, an optional payload and three
    child links, all of which start out unset.
    """

    def __init__(self, splitchar):
        # Character this node compares against.
        self.splitchar = splitchar
        # Payload; None until something is stored on the node.
        self.data = None
        # Child links: lower, equal and higher branch.
        self.loNode = self.eqNode = self.hiNode = None


class TernarySearchTrie(object):
    """Ternary search trie used for longest-prefix matching of strings.

    Words are inserted character by character; the node that ends a word
    can carry a payload in its ``data`` attribute.  ``maxMatch`` then
    returns the payload of the longest stored word that is a prefix of the
    queried string.
    """

    def __init__(self):
        # Root of the trie; created lazily by the first addWord call.
        self.rootNode = None

    def loadData(self, fname):
        """Insert every non-blank line of the file *fname* into the trie.

        Each line is stripped of surrounding whitespace and the stripped
        text is stored as the payload of the node that ends the word.
        Lines that are empty after stripping are ignored.
        """
        # The with-block guarantees the file is closed even when an
        # insertion raises; the original only closed it on success.
        with open(fname) as f:
            for line in f:
                line = line.strip()
                node = self.addWord(line)
                if node:
                    node.data = line

    def addWord(self, word):
        """Insert *word* and return the node holding its last character.

        Returns None for an empty word.  Inserting a word that is already
        present creates no new nodes and returns the existing end node.
        The caller decides what to store in the returned node's ``data``.
        """
        if not word:
            return None

        if not self.rootNode:
            self.rootNode = TSTNode(word[0])

        currentNode = self.rootNode
        charIndex = 0

        while True:
            charComp = ord(word[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                # Current character consumed; stop if it was the last one.
                charIndex += 1
                if charIndex == len(word):
                    return currentNode
                if not currentNode.eqNode:
                    currentNode.eqNode = TSTNode(word[charIndex])
                currentNode = currentNode.eqNode
            elif charComp < 0:
                if not currentNode.loNode:
                    currentNode.loNode = TSTNode(word[charIndex])
                currentNode = currentNode.loNode
            else:
                if not currentNode.hiNode:
                    currentNode.hiNode = TSTNode(word[charIndex])
                currentNode = currentNode.hiNode

    def maxMatch(self, url):
        """Return the payload of the longest stored word prefixing *url*.

        Returns None when no stored word with a truthy payload is a prefix
        of *url* (including when the trie or *url* is empty).
        """
        ret = None
        currentNode = self.rootNode
        charIndex = 0
        # Walk until we fall off the trie or run out of characters.
        while currentNode and charIndex < len(url):
            charComp = ord(url[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                charIndex += 1
                # Remember the deepest word end seen so far.
                if currentNode.data:
                    ret = currentNode.data
                currentNode = currentNode.eqNode
            elif charComp < 0:
                currentNode = currentNode.loNode
            else:
                currentNode = currentNode.hiNode
        return ret


# Command-line driver: load the site list, then print the longest matching
# site (or None) for every line read from stdin.
if __name__ == '__main__':
    import sys

    fname = 'high_freq_site.list'
    tree = TernarySearchTrie()
    tree.loadData(fname)

    for raw in sys.stdin:
        # Single-argument print(...) prints the same text as the
        # Python 2 print statement.
        print(tree.maxMatch(raw.strip()))

相关阅读:
2.15 STL复习
 20190214Test（栈与队列）
STL列表链式前向星
 链式前向星（邻接表）
Priority_queue详解
 List详解
 NOIP2019计划
 第二章笔记
 第一章笔记
 本地文件上传GitHub
原文地址：https://www.cnblogs.com/i80386/p/5058584.html

最新文章
树
 BT
队列
 栈
 背包问题
 动态规划的基本模型
 广搜
 贪心
 分治
 深搜

热门文章
递归
 递推
 排序
 高精
 mod程序编写
 对拍程序编写
 emacs的基本配置
 贪心
 数据排序
 高精度计算

python 遍历hadoop， 跟指定列表对比 包含列表中值的取出。

python 遍历hadoop，跟指定列表对比包含列表中值的取出。