一、基于排序机制的wordcount程序
1、要求
(1)对文本文件内的每个单词,统计其出现的次数。
(2)按照每个单词的出现次数,降序排序。
2、代码实现
------java实现-------
package cn.spark.study.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Word count whose result is printed in descending order of occurrences.
 *
 * Steps: split lines into words, count each word with reduceByKey, swap the
 * pairs to (count, word) so that sortByKey can order by count, then swap back
 * to (word, count) and print.
 */
public class SortWordCount {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortWordCount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Backslashes in a Windows path must be escaped; an unescaped "\t" would be a tab.
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\spark.txt");

        // Split every line into words on single spaces.
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String t) throws Exception {
                return Arrays.asList(t.split(" "));
            }

        });

        // Map every word to (word, 1).
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }

        });

        // Sum the ones per word to obtain (word, count).
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }

        });

        // At this point we have the number of occurrences of every word.
        // The new requirement is to order the words by that number, descending.
        // wordCounts holds elements such as (hello, 3) (you, 2); sortByKey sorts by key,
        // so the pairs are first turned into (3, hello) (2, you).

        // Swap key and value: (word, count) -> (count, word).
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(
                new PairFunction<Tuple2<String, Integer>, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }

        });

        // Sort by key (the count); false requests descending order.
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);

        // Swap back: (count, word) -> (word, count).
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(
                new PairFunction<Tuple2<Integer, String>, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);
            }

        });

        // The word counts are now ordered by count; print them.
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + " appears " + t._2 + " times.");
            }

        });

        sc.close();
    }

}

---------scala实现---------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Word count whose result is printed in descending order of occurrences.
 *
 * @author Administrator
 */
object SortWordCount {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SortWordCount")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Backslashes in a Windows path must be escaped; "\t" would be a tab and "\s" does not compile.
    val lines = sc.textFile("D:\\test-file\\spark.txt", 1)
    val words = lines.flatMap(_.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)

    // sortByKey orders by key, so swap to (count, word), sort descending, then swap back.
    val countWords = wordCounts.map(_.swap)
    val sortedCountWords = countWords.sortByKey(ascending = false)
    val sortedWordCounts = sortedCountWords.map(_.swap)

    sortedWordCounts.foreach { case (word, count) =>
      println(word + " appear " + count + " times.")
    }

    // Release the context, mirroring sc.close() in the Java version.
    sc.stop()
  }

}
二、二次排序
1、要求
(1)按照文件中的第一列排序。
(2)如果第一列相同,则按照第二列排序。
2、java代码
###SecondarySortKey
package cn.spark.study.core;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom key for secondary sorting: orders by {@code first}, and by
 * {@code second} when the first components are equal.
 *
 * @author Administrator
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

    private static final long serialVersionUID = -2366006422945129991L;

    // The columns that take part in the ordering.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // The relational operators all delegate to compare() so that the ordering
    // is defined in exactly one place.

    @Override
    public boolean $greater(SecondarySortKey other) {
        return compare(other) > 0;
    }

    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return compare(other) >= 0;
    }

    @Override
    public boolean $less(SecondarySortKey other) {
        return compare(other) < 0;
    }

    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return compare(other) <= 0;
    }

    /**
     * Compares by {@code first}, then by {@code second}.
     *
     * Integer.compare is used instead of subtraction: a difference of two ints
     * can overflow and report the wrong sign for operands that are far apart.
     *
     * @return a negative value, zero, or a positive value when this key is
     *         less than, equal to, or greater than {@code other}
     */
    @Override
    public int compare(SecondarySortKey other) {
        int byFirst = Integer.compare(this.first, other.getFirst());
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.getSecond());
    }

    @Override
    public int compareTo(SecondarySortKey other) {
        return compare(other);
    }

    // Getters and setters for the sort columns, plus hashCode and equals.

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        SecondarySortKey other = (SecondarySortKey) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;
    }

}

###SecondarySort
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Secondary sort.
 * 1. Implement a custom key that implements Ordered and Serializable and
 *    defines the ordering over several columns.
 * 2. Map the RDD of text lines to a JavaPairRDD whose key is the custom key
 *    and whose value is the line.
 * 3. Sort with sortByKey, which uses the custom key's ordering.
 * 4. Map again to drop the custom key and keep only the text line.
 *
 * @author Administrator
 */
public class SecondarySort {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SecondarySort")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Backslashes in a Windows path must be escaped; an unescaped "\t" would be a tab.
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\sort.txt");

        // Key every line by its first two space-separated integer columns.
        JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(
                new PairFunction<String, SecondarySortKey, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                        String[] lineSplited = line.split(" ");
                        SecondarySortKey key = new SecondarySortKey(
                                Integer.parseInt(lineSplited[0]),
                                Integer.parseInt(lineSplited[1]));
                        return new Tuple2<SecondarySortKey, String>(key, line);
                    }

                });

        JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();

        // Drop the key; only the original line is needed for output.
        JavaRDD<String> sortedLines = sortedPairs.map(
                new Function<Tuple2<SecondarySortKey, String>, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                        return v1._2;
                    }

                });

        sortedLines.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String t) throws Exception {
                System.out.println(t);
            }

        });

        sc.close();
    }

}
3、scala代码
###SecondSortKey
package cn.spark.study.core

/**
 * Key for secondary sorting: orders by `first`, and by `second` when the
 * first components are equal.
 *
 * @author Administrator
 */
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {

  /**
   * Compares by `first`, then by `second`.
   *
   * Ordering.Int.compare is used instead of subtraction: a difference of two
   * Ints can overflow and report the wrong sign for operands that are far apart.
   */
  def compare(that: SecondSortKey): Int = {
    val byFirst = Ordering.Int.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Ordering.Int.compare(this.second, that.second)
  }

}

###SecondSort
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Secondary sort: prints the lines of the input file ordered by their first
 * column and, for equal first columns, by their second column.
 *
 * @author Administrator
 */
object SecondSort {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SecondSort")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Backslashes in a Windows path must be escaped; "\t" would be a tab and "\s" does not compile.
    val lines = sc.textFile("D:\\test-file\\sort.txt", 1)

    // Key every line by its first two space-separated integer columns (split once per line).
    val pairs = lines.map { line =>
      val fields = line.split(" ")
      (new SecondSortKey(fields(0).toInt, fields(1).toInt), line)
    }

    val sortedPairs = pairs.sortByKey()
    val sortedLines = sortedPairs.map(_._2)

    sortedLines.foreach(sortedLine => println(sortedLine))

    // Release the context, mirroring sc.close() in the Java version.
    sc.stop()
  }

}
三、topn
1、要求
(1)对文本文件内的数字,取最大的前3个。
(2)对每个班级内的学生成绩,取出前3名。(分组取topn)
(3)课后作业:用Scala来实现分组取topn。
2、获取文本内最大的前三个数
---------java实现----------
package cn.spark.study.core;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Prints the three largest numbers of a text file that holds one integer per line.
 */
public class Top3 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3Java").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Backslashes in a Windows path must be escaped; unescaped, both "\t" sequences
        // of this path would silently turn into tab characters.
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\top.txt");

        // Key every line by its numeric value so that sortByKey can order the lines.
        JavaPairRDD<Integer, String> pairs = lines.mapToPair(new PairFunction<String, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(String t) throws Exception {
                return new Tuple2<Integer, String>(Integer.valueOf(t), t);
            }

        });

        // false requests descending order, so the largest numbers come first.
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);

        JavaRDD<Integer> sortedNumbers = sortedPairs.map(new Function<Tuple2<Integer, String>, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Tuple2<Integer, String> v1) throws Exception {
                return v1._1;
            }

        });

        // The first three elements of the descending sequence, e.g. [9, 7, 6].
        List<Integer> sortedNumberList = sortedNumbers.take(3);

        for (Integer num : sortedNumberList) {
            System.out.println(num);
        }

        sc.close();
    }

}

---------scala实现----------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Prints the three largest numbers of a text file that holds one integer per line.
 *
 * @author Administrator
 */
object Top3 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Backslashes in a Windows path must be escaped; unescaped, both "\t" sequences
    // of this path would silently turn into tab characters.
    val lines = sc.textFile("D:\\test-file\\top.txt", 1)

    // Key every line by its numeric value, sort descending, keep only the numbers.
    val pairs = lines.map(line => (line.toInt, line))
    val sortedPairs = pairs.sortByKey(ascending = false)
    val sortedNumbers = sortedPairs.map(_._1)

    val top3Number = sortedNumbers.take(3)
    top3Number.foreach(num => println(num))

    // Release the context, mirroring sc.close() in the Java version.
    sc.stop()
  }

}
3、对每个班级内的学生成绩,取出前3名。(分组取topn)
----java实现-----
package cn.spark.study.core;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Top 3 per group: prints the three highest scores of every class.
 * Each input line is expected to be "className score", separated by one space.
 *
 * @author Administrator
 */
public class GroupTop3 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Backslashes in a Windows path must be escaped; an unescaped "\t" would be a tab.
        JavaRDD<String> lines = sc.textFile("D:\\test-file\\score.txt");

        JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                new PairFunction<String, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(String line) throws Exception {
                        String[] lineSplited = line.split(" ");
                        // Integer.valueOf boxes an int or parses a String into an Integer;
                        // a null or empty String makes it throw NumberFormatException.
                        return new Tuple2<String, Integer>(lineSplited[0],
                                Integer.valueOf(lineSplited[1]));
                    }

                });

        JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

        JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(
                new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Iterable<Integer>> call(
                            Tuple2<String, Iterable<Integer>> classScores) throws Exception {
                        String className = classScores._1;

                        // top3 is kept in descending order; null marks a slot not filled yet.
                        Integer[] top3 = new Integer[3];
                        for (Integer score : classScores._2) {
                            for (int i = 0; i < top3.length; i++) {
                                if (top3[i] == null) {
                                    top3[i] = score;
                                    break;
                                }
                                if (score > top3[i]) {
                                    // Shift the lower entries down by one to make room at i.
                                    for (int j = top3.length - 1; j > i; j--) {
                                        top3[j] = top3[j - 1];
                                    }
                                    top3[i] = score;
                                    break;
                                }
                            }
                        }

                        // A class with fewer than three scores leaves trailing nulls;
                        // drop them so that "null" is never printed as a score.
                        List<Integer> ranked = new ArrayList<Integer>(top3.length);
                        for (Integer score : top3) {
                            if (score != null) {
                                ranked.add(score);
                            }
                        }

                        return new Tuple2<String, Iterable<Integer>>(className, ranked);
                    }

                });

        top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println("class: " + t._1);
                Iterator<Integer> scoreIterator = t._2.iterator();
                while (scoreIterator.hasNext()) {
                    Integer score = scoreIterator.next();
                    System.out.println(score);
                }
                System.out.println("=======================================");
            }

        });

        sc.close();
    }

}

-----scala实现------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Top 3 per group: prints the three highest scores of every class.
 * Each input line is expected to be "className score", separated by one space.
 */
object GroupTop3 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GroupTop3Scala").setMaster("local")
    val context = new SparkContext(conf)

    // Backslashes in a Windows path must be escaped; "\t" would be a tab and "\s" does not compile.
    val linesRDD = context.textFile("D:\\test-file\\score.txt")

    // Split each line once into (className, score).
    val studentScores = linesRDD.map { line =>
      val fields = line.split(" ")
      (fields(0), fields(1).toInt)
    }
    val groupStudentScores = studentScores.groupByKey()

    // Sort each group's scores descending and keep at most three. Unlike a
    // zero-filled Array[Int](3), this handles zero or negative scores and
    // yields fewer than three entries for small groups instead of padding with 0.
    val result = groupStudentScores.map { case (className, scores) =>
      (className, scores.toList.sorted(Ordering[Int].reverse).take(3))
    }

    result.foreach { case (className, top3) =>
      println(className + "班级前三明成绩为:" + top3.mkString(","))
    }

    // Release the context, mirroring sc.close() in the Java version.
    context.stop()
  }

}