以下代码用python编写。
import os import os.path import operator import heapq """ sort users' queries by frequency 1. hashing queries and dividing into 10 files. (hash(query)%10) 2. counting the number queries and sorting in each file using hashtable. 3. merging files using heap queue algorithm. """ datadir = "d:/querysort/data/" tempdir = "d:/querysort/temp/" destfile = "d:/querysort/sorted.txt" def hashfiles(): fs = [] if not os.path.exists(tempdir): os.makedirs(tempdir) for f in range(0,10): fs.append(open(tempdir + str(f), 'w')) for parent, dirnames, filenames in os.walk(datadir): for filename in filenames: f = open(os.path.join(parent, filename),'r') for query in f: fs[hash(query)%10].write(query) f.close() for f in fs: f.close() def sortqueryinfile(): fs = [] if not os.path.exists(tempdir): return for f in range(0,10): fs.append(open(tempdir + str(f), 'r+')) for f in fs: D = {} for query in f: if query in D: D[query] += 1 else: D[query] = 1 sorted_D = sorted(D.iteritems(), key=operator.itemgetter(1), reverse=True) f.seek(0,0) f.truncate() for item in sorted_D: f.write(str(item[1]) + "\t" + item[0]) f.close() def decorated_file(f): """ Yields an easily sortable tuple. """ for line in f: count, query = line.split('\t',2) yield (-int(count), query) def mergefiles(): fs = [] if not os.path.exists(tempdir): return for f in range(0,10): fs.append(open(tempdir + str(f), 'r+')) f_dest = open(destfile,"w") lines_written = 0 for line in heapq.merge(*[decorated_file(f) for f in fs]): f_dest.write(line[1]) lines_written += 1 return lines_written if __name__ == '__main__': hashfiles() sortqueryinfile() print "sorting completed, total queries: ", mergefiles()