# Explore the MovieLens 100k ratings file with hand-rolled RDD reductions.
# Relies on names defined outside this snippet: `sc` (SparkContext), `PATH`
# (dataset root directory), `np` (numpy), `num_users` and `num_movies`.
rating_data_raw = sc.textFile("%s/ml-100k/u.data" % PATH)
print(rating_data_raw.first())
num_ratings = rating_data_raw.count()
print("Ratings: %d" % num_ratings)

# In[35]:

# u.data is documented as tab-separated; splitting on any whitespace run
# handles tabs as well as a space-converted copy of the file. Splitting on a
# literal " " leaves a tab-separated record as one field, so fields[2] fails.
rating_data = rating_data_raw.map(lambda line: line.split())
# The rating is the third field of every record.
ratings = rating_data.map(lambda fields: int(fields[2]))

max_rating = ratings.reduce(lambda x, y: max(x, y))
min_rating = ratings.reduce(lambda x, y: min(x, y))
mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
# RDDs have no median(), so the values are pulled to the driver for numpy.
median_rating = np.median(ratings.collect())

# Force true division: with two ints Python 2 would truncate the average
# before it is formatted with %2.2f below.
ratings_per_user = num_ratings / float(num_users)
ratings_per_movie = num_ratings / float(num_movies)

print("Min rating: %d" % min_rating)
print("Max rating: %d" % max_rating)
print("Average rating: %2.2f" % mean_rating)
print("Median rating: %d" % median_rating)
print("Average # of ratings per user: %2.2f" % ratings_per_user)
print("Average # of ratings per movie: %2.2f" % ratings_per_movie)

# In[36]:

# we can also use the stats function to get some similar information to the above
ratings.stats()
上面是比较繁琐的做法:用 reduce 手动逐项计算最大值、最小值和平均值。
更简单的做法是直接调用 RDD 自带的统计方法(注意 RDD 没有 median 方法):
>>> all_data = sc.parallelize([1,2,3,4,5,6,7,8,100])
>>> all_data.mean()
15.11111111111111
>>> all_data.max()
100
>>> all_data.min()
1
>>> all_data.median()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'RDD' object has no attribute 'median'
>>> all_data.stats()
(count: 9, mean: 15.1111111111, stdev: 30.0903987804, max: 100.0, min: 1.0)