Part I:词频统计并返回topN
统计的文本数据:
what do you do how do you do how do you do how are you
from operator import add

from pyspark import SparkContext


def sort_t():
    """Count word frequencies in a small in-memory corpus and print the top 3.

    Each input string is split on single spaces, every word is paired with a
    count of 1, the counts are summed per word, and the (word, count) pairs
    are sorted by count in descending order.  Only the first three pairs are
    collected and printed, one ``word count`` pair per line.
    """
    sc = SparkContext(appName="testWC")
    try:
        data = sc.parallelize(
            ["what do you do", "how do you do", "how do you do", "how are you"]
        )
        result = (
            data.flatMap(lambda x: x.split(" "))
            .map(lambda x: (x, 1))
            .reduceByKey(add)
            # Second positional argument is ``ascending``; False puts the
            # most frequent words first.
            .sortBy(lambda x: x[1], False)
            .take(3)
        )
        for k, v in result:
            # A single pre-formatted argument prints "word count" under both
            # the Python 2 print statement and the Python 3 print function.
            print("%s %s" % (k, v))
    finally:
        # Release the Spark context even when the job raises.
        sc.stop()


if __name__ == '__main__':
    sort_t()
Part II:调用排序算法并返回topN
样本数据 numbers_data.txt:
15561 112 -40 51467112 234 8561 112 -34 53467111 121 2345 789 34 14561 -21 12112 101 100 -4 23 51467111 2434 15567 132 -14 51467111 237
from pyspark import SparkContext


def solve():
    """Print the three largest integers found in ``../input/numbers_data.txt``.

    The file is read as text, each line is split on whitespace, every token
    is converted to ``int`` and the values are sorted in descending order.
    Only the first three values are collected and printed, one per line.
    Values are not de-duplicated, so a number that occurs more than once in
    the input can be printed more than once.

    Raises:
        ValueError: propagated from ``int()`` (via the Spark job) if a token
            is not a valid integer literal.
    """
    sc = SparkContext(appName="Sort_test_example")
    try:
        lines = sc.textFile("../input/numbers_data.txt")
        results = (
            # split() with no argument ignores repeated, leading and trailing
            # whitespace, so no empty token ever reaches int().
            lines.flatMap(lambda x: x.split())
            .map(lambda x: (int(x), 1))
            .sortByKey(ascending=False)
            .take(3)
        )
        # The paired value is only a placeholder needed by sortByKey.
        for key, _ in results:
            # print(key) behaves identically on Python 2 and Python 3.
            print(key)
    finally:
        # Release the Spark context even when the job raises.
        sc.stop()


if __name__ == '__main__':
    solve()
注:若出现并列时,返回多个并列的数