• spark 基于key排序的wordcount


    java

     1 /**
     2  * 根据单词次数排序的wordcount
     3  * @author Tele
     4  *
     5  */
     6 public class SortWordCount {
     7     private static SparkConf conf = new SparkConf().setMaster("local").setAppName("sortwordcount");
     8     private static JavaSparkContext jsc = new JavaSparkContext(conf);
     9     private static String path = "D:\inputword\result.txt";
    10 
    11     public static <U> void main(String[] args) {
    12         JavaRDD<String> rdd = jsc.textFile(path);
    13 
    14         /*
    15          * JavaRDD<String> lines = rdd.flatMap(new FlatMapFunction<String,String>() {
    16          * 
    17          * private static final long serialVersionUID = 1L;
    18          * 
    19          * @Override public Iterator<String> call(String t) throws Exception { return
    20          * Arrays.asList(t.split(" ")).iterator(); } });
    21          * 
    22          * JavaPairRDD<String, Integer> tuples = lines.mapToPair(new
    23          * PairFunction<String,String,Integer>() {
    24          * 
    25          * private static final long serialVersionUID = 1L;
    26          * 
    27          * @Override public Tuple2<String,Integer> call(String t) throws Exception {
    28          * return new Tuple2<String,Integer>(t,1); } });
    29          */
    30 
    31         JavaPairRDD<String, Integer> tuples = rdd.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
    32 
    33             private static final long serialVersionUID = 1L;
    34 
    35             @Override
    36             public Iterator<Tuple2<String, Integer>> call(String t) throws Exception {
    37                 Stream<Tuple2<String, Integer>> stream = Arrays.asList(t.split(" ")).stream()
    38                         .map(i -> new Tuple2<>(i, 1));
    39                 return stream.iterator();
    40             }
    41         });
    42 
    43         JavaPairRDD<String, Integer> wc = tuples.reduceByKey(new Function2<Integer, Integer, Integer>() {
    44 
    45             private static final long serialVersionUID = 1L;
    46 
    47             @Override
    48             public Integer call(Integer v1, Integer v2) throws Exception {
    49                 return v1 + v2;
    50             }
    51         });
    52 
    53         // 将词频与单词互换位置
    54         JavaPairRDD<Integer, String> cw = wc.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
    55 
    56             private static final long serialVersionUID = 1L;
    57 
    58             @Override
    59             public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
    60                 return new Tuple2<Integer, String>(t._2, t._1);
    61             }
    62         });
    63 
    64         JavaPairRDD<Integer, String> result = cw.sortByKey(false);
    65         result.foreach(new VoidFunction<Tuple2<Integer, String>>() {
    66 
    67             private static final long serialVersionUID = 1L;
    68 
    69             @Override
    70             public void call(Tuple2<Integer, String> t) throws Exception {
    71                 System.out.println(t._2 + "----" + t._1);
    72             }
    73         });
    74 
    75         // 也可以在排序完毕后换成单词-词频的形式
    76         /*
    77          * JavaPairRDD<String, Integer> result = cw.sortByKey(false).mapToPair(new
    78          * PairFunction<Tuple2<Integer,String>,String,Integer>() {
    79          * 
    80          * private static final long serialVersionUID = 1L;
    81          * 
    82          * @Override public Tuple2<String,Integer> call(Tuple2<Integer, String> t)
    83          * throws Exception { return new Tuple2<String,Integer>(t._2,t._1); } });
    84          * 
    85          * result.foreach(new VoidFunction<Tuple2<String,Integer>>() {
    86          * 
    87          * private static final long serialVersionUID = 1L;
    88          * 
    89          * @Override public void call(Tuple2<String, Integer> t) throws Exception {
    90          * System.out.println(t._1 + "-------" + t._2); } });
    91          */
    92 
    93         jsc.close();
    94     }
    95 }

    scala

     /**
      * Word count whose output is ordered by descending word frequency.
      *
      * Counts words with `reduceByKey`, swaps each (word, count) pair to
      * (count, word) so `sortByKey` can order by frequency, swaps back and
      * prints `word-----count`.
      */
     object SortWordCount {
       /** Input file used when no path is supplied on the command line. */
       private val DefaultPath = "D:\\inputword\\result.txt"

       /**
        * Runs the sorted word count.
        *
        * @param args optional; `args(0)` overrides the default input path
        */
       def main(args: Array[String]): Unit = {
         val conf = new SparkConf().setMaster("local").setAppName("sortwordcount")
         val sc = new SparkContext(conf)

         try {
           val inputPath = if (args != null && args.nonEmpty) args(0) else DefaultPath
           val rdd = sc.textFile(inputPath, 1)

           // Split on runs of whitespace and drop empty tokens so blank lines and
           // repeated spaces are not counted as a word.
           val wordcount = rdd
             .flatMap(_.trim.split("\\s+"))
             .filter(_.nonEmpty)
             .map((_, 1))
             .reduceByKey(_ + _)

           wordcount
             .map(t => (t._2, t._1))   // (count, word): the count becomes the sort key
             .sortByKey(false, 1)      // descending, single partition
             .map(t => (t._2, t._1))   // back to (word, count) for printing
             .foreach(t => println(t._1 + "-----" + t._2))
         } finally {
           // Release the context even when a job fails.
           sc.stop()
         }
       }
     }
  • 相关阅读:
    Java中的CopyOnWrite
    Collection和Collections的区别
    java中值类型与引用类型的关系
    Xml的用途
    js弹框的3种方法
    文件夹重定向失败解决方案
    网络管理人员经常遇到的十个问题(转载)
    QTP之下拉列表单选框…
    Windows脚本宿主对象模型
    QTP报错“缺少对象WScript”
  • 原文地址:https://www.cnblogs.com/tele-share/p/10282082.html
Copyright © 2020-2023  润新知