1. Developing a Spark WordCount program in Scala (local mode)
package com.zy.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

//todo: implement Spark's wordcount program in Scala
object WordCount {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf object and set appName and master; local[2] runs the job locally with 2 threads
    val sparkConf: SparkConf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
    //2. Create the SparkContext; it is the entry point of every Spark program and creates the DAGScheduler and TaskScheduler
    val sc = new SparkContext(sparkConf)
    //Set the log output level
    sc.setLogLevel("WARN")
    //3. Read the data file (note the escaped backslash in the Windows path)
    val data: RDD[String] = sc.textFile("D:\\words.txt")
    //4. Split each line into words
    val words: RDD[String] = data.flatMap(_.split(" "))
    //5. Map each word to a count of 1
    val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
    //6. Sum all the 1s for the same word
    val result: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    //Sort by word count in descending order
    val sortRDD: RDD[(String, Int)] = result.sortBy(x => x._2, false)
    //7. Collect the data and print it
    val finalResult: Array[(String, Int)] = sortRDD.collect()
    finalResult.foreach(println)
    //8. Stop the SparkContext
    sc.stop()
  }
}
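To sanity-check the job, here is a hypothetical run (the input below is an assumption, not from the original). If D:\words.txt contains:

hello spark
hello scala
hello spark

the program prints the (word, count) tuples in descending order of count:

(hello,3)
(spark,2)
(scala,1)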
2. Developing a Spark WordCount program in Scala (cluster mode)
package com.zy.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

//todo: develop Spark's wordcount program in Scala (cluster mode)
object WordCount_Online {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf object and set appName (the master is supplied by spark-submit)
    val sparkConf: SparkConf = new SparkConf().setAppName("WordCount_Online")
    //2. Create the SparkContext; it is the entry point of every Spark program and creates the DAGScheduler and TaskScheduler
    val sc = new SparkContext(sparkConf)
    //Set the log output level
    sc.setLogLevel("WARN")
    //3. Read the data file; args(0) is the input file path
    val data: RDD[String] = sc.textFile(args(0))
    //4. Split each line into words
    val words: RDD[String] = data.flatMap(_.split(" "))
    //5. Map each word to a count of 1
    val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
    //6. Sum all the 1s for the same word
    val result: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    //7. Save the result to HDFS; args(1) is the output directory on HDFS
    result.saveAsTextFile(args(1))
    //8. Stop the SparkContext
    sc.stop()
  }
}
Finally, package the program into a jar and submit it to the cluster:
spark-submit --master spark://node1:7077 \
  --class com.zy.spark.WordCount_Online \
  --executor-memory 1g \
  --total-executor-cores 2 \
  original-spark_xxx-1.0-SNAPSHOT.jar /words.txt /out
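Two things are easy to trip over here: the input path must already exist on HDFS, and the output directory must not, since saveAsTextFile throws an exception if /out is already present. A typical sequence around the submit, assuming a configured Hadoop client on the submitting node:

hdfs dfs -put words.txt /words.txt   # upload the input file
hdfs dfs -rm -r /out                 # remove any previous output directory
# (run the spark-submit command shown above)
hdfs dfs -cat /out/part*             # inspect the result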
3. Developing a Spark WordCount program in Java (local mode)
package com.zy.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

//todo: develop Spark's wordcount program in Java (local mode)
public class WordCount_Java {
    public static void main(String[] args) {
        //1. Create the SparkConf object
        SparkConf sparkConf = new SparkConf().setAppName("WordCount_Java").setMaster("local[2]");
        //2. Create the JavaSparkContext object
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        //3. Read the data file (note the escaped backslash in the Windows path)
        JavaRDD<String> data = jsc.textFile("D:\\words.txt");
        //4. Split each line into words
        JavaRDD<String> words = data.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String line) throws Exception {
                String[] words = line.split(" ");
                return Arrays.asList(words).iterator();
            }
        });
        //5. Map each word to a count of 1
        JavaPairRDD<String, Integer> wordAndOne = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });
        //6. Sum all the 1s for the same word
        JavaPairRDD<String, Integer> result = wordAndOne.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        //Sort by count in descending order: (word, count) ---> (count, word).sortByKey ---> (word, count)
        JavaPairRDD<Integer, String> reverseRDD = result.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });
        JavaPairRDD<String, Integer> sortedRDD = reverseRDD.sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);
            }
        });
        //7. Collect the data and print it
        List<Tuple2<String, Integer>> finalResult = sortedRDD.collect();
        for (Tuple2<String, Integer> tuple : finalResult) {
            System.out.println("word: " + tuple._1 + " count: " + tuple._2);
        }
        //8. Stop the JavaSparkContext
        jsc.stop();
    }
}
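The anonymous inner classes above are the pre-Java-8 style. On Java 8 and Spark 2.x (where FlatMapFunction returns an Iterator, as in the code above), the same pipeline can be written much more compactly with lambdas and method references. A minimal sketch; WordCount_Java8 is a hypothetical class name and the input path is the same assumed D:\words.txt:

package com.zy.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class WordCount_Java8 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("WordCount_Java8").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> counts = jsc.textFile("D:\\words.txt")  // assumed input path
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator())  // split lines into words
                .mapToPair(word -> new Tuple2<>(word, 1))                    // (word, 1)
                .reduceByKey(Integer::sum);                                  // sum the 1s per word

        // Swap to (count, word), sort descending, swap back, then print
        counts.mapToPair(Tuple2::swap)
                .sortByKey(false)
                .mapToPair(Tuple2::swap)
                .collect()
                .forEach(t -> System.out.println("word: " + t._1 + " count: " + t._2));

        jsc.stop();
    }
}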