• 根据PV统计出前三的热门板块,并统计出热门板块下的用户数--方式二



    根据PV统计出前三的热门板块,并统计出热门板块下的用户数--方式二

    测试数据
    java代码
      1 package com.hzf.spark.study;
      2 
      3 import java.util.ArrayList;
      4 import java.util.Collections;
      5 import java.util.Comparator;
      6 import java.util.HashMap;
      7 import java.util.Iterator;
      8 import java.util.List;
      9 import java.util.Map;
     10 import java.util.Set;
     11 
     12 import org.apache.spark.SparkConf;
     13 import org.apache.spark.api.java.JavaPairRDD;
     14 import org.apache.spark.api.java.JavaRDD;
     15 import org.apache.spark.api.java.JavaSparkContext;
     16 import org.apache.spark.api.java.function.Function;
     17 import org.apache.spark.api.java.function.PairFlatMapFunction;
     18 import org.apache.spark.api.java.function.PairFunction;
     19 import org.apache.spark.api.java.function.VoidFunction;
     20 import org.apache.spark.broadcast.Broadcast;
     21 
     22 import scala.Tuple2;
     23 
     24 public class HotChannel02 {
     25     public static void main(String[] args) {
     26         SparkConf conf = new SparkConf()
     27                 .setAppName("HotChannel")
     28                 .setMaster("local")
     29                 .set("spark.testing.memory", "2147480000");
     30         JavaSparkContext sc = new JavaSparkContext(conf);
     31         JavaRDD<String> logRDD = sc.textFile("f:/userLog");
     32         String str = "View";
     33         final Broadcast<String> broadcast = sc.broadcast(str);
     34         hotChannel(sc, logRDD, broadcast);
     35     }
     36     private static void hotChannel(JavaSparkContext sc, JavaRDD<String> logRDD, final Broadcast<String> broadcast) {
     37         JavaRDD<String> filteredLogRDD = logRDD.filter(new Function<String, Boolean>() {
     38             
     39             private static final long serialVersionUID = 1L;
     40 
     41             @Override
     42             public Boolean call(String v1) throws Exception {
     43                 String actionParam = broadcast.value();
     44                 String action = v1.split("	")[5];
     45                 return actionParam.equals(action);
     46             }
     47         });
     48         
     49         JavaPairRDD<String, String> channel2nullRDD = filteredLogRDD.mapToPair(new PairFunction<String, String,String>() {
     50             
     51             private static final long serialVersionUID = 1L;
     52 
     53             @Override
     54             public Tuple2<String, String> call(String val) throws Exception {
     55                 String channel = val.split("	")[4];
     56                 
     57                 return new Tuple2<String, String>(channel,null);
     58             }
     59         });
     60         Map<String, Object> channelPVMap = channel2nullRDD.countByKey();
     61         Set<String> keySet = channelPVMap.keySet();
     62         List<SortObj> channels  = new ArrayList<>();
     63         for(String channel : keySet){ 
     64             channels.add(new SortObj(channel, Integer.valueOf(channelPVMap.get(channel)+"")));
     65         }
     66         Collections.sort(channels, new Comparator<SortObj>() {
     67 
     68             @Override
     69             public int compare(SortObj o1, SortObj o2) {
     70                 return o2.getValue() - o1.getValue();
     71             }
     72         });
     73         
     74         List<String> hotChannelList = new ArrayList<>();
     75         for (int i = 0; i < 3; i++) {
     76             hotChannelList.add(channels.get(i).getKey());
     77         }
     78         
     79         
     80         final Broadcast<List<String>> hotChannelListBroadcast = sc.broadcast(hotChannelList);
     81         
     82          
     83         JavaRDD<String> filtedRDD = logRDD.filter(new Function<String, Boolean>() {
     84 
     85             @Override
     86             public Boolean call(String v1) throws Exception {
     87                 List<String> hostChannels = hotChannelListBroadcast.value();
     88                 String channel = v1.split("	")[4];
     89                  String userId = v1.split("	")[2];
     90                 return hostChannels.contains(channel) && !"null".equals(userId);
     91             }
     92         });
     93         
     94         JavaPairRDD<String, String> user2ChannelRDD = filtedRDD.mapToPair(new PairFunction<String, String,String>() {
     95 
     96             private static final long serialVersionUID = 1L;
     97 
     98             @Override
     99             public Tuple2<String, String> call(String val) throws Exception {
    100                 String[] splited = val.split("	");
    101                 String userId = splited[2];
    102                 String channel = splited[4];
    103                 return new Tuple2<String, String>(userId,channel);
    104             }
    105         });
    106         
    107         JavaPairRDD<String, String> userVistChannelsRDD = user2ChannelRDD.groupByKey().flatMapToPair(new PairFlatMapFunction<Tuple2<String,Iterable<String>>, String, String>() {
    108 
    109             private static final long serialVersionUID = 1L;
    110 
    111             @Override
    112             public Iterable<Tuple2<String, String>> call(Tuple2<String, Iterable<String>> tuple) throws Exception {
    113                 String userId = tuple._1;
    114                 Iterator<String> iterator = tuple._2.iterator();
    115                 Map<String, Integer> channelMap = new HashMap<>();
    116                 while (iterator.hasNext()) {
    117                     String channel = iterator.next();
    118                     Integer count = channelMap.get(channel);
    119                     if(count == null)
    120                         count = 1;
    121                     else
    122                         count++;
    123                     channelMap.put(channel, count);
    124                 }
    125                 
    126                 List<Tuple2<String, String>> list = new ArrayList<>();
    127                 Set<String> keys = channelMap.keySet();
    128                 for(String channel : keys){
    129                      Integer channelNum  = channelMap.get(channel);
    130                      list.add(new Tuple2<String, String>(channel, userId + "_" + channelNum));
    131                 }
    132                 return list;
    133             }
    134         });
    135         
    136         
    137         userVistChannelsRDD.groupByKey().foreach(new VoidFunction<Tuple2<String,Iterable<String>>>() {
    138 
    139             private static final long serialVersionUID = 1L;
    140 
    141             @Override
    142             public void call(Tuple2<String, Iterable<String>> tuple) throws Exception {
    143                 String channel = tuple._1;
    144                 Iterator<String> iterator = tuple._2.iterator();
    145                 List<SortObj> list = new ArrayList<>();
    146                 while (iterator.hasNext()) {
    147                     String ucs = iterator.next();
    148                     String[] splited = ucs.split("_");
    149                     String userId = splited[0];
    150                     Integer num = Integer.valueOf(splited[1]);
    151                     list.add(new SortObj(userId, num));
    152                 }
    153                 
    154                 Collections.sort(list,new Comparator<SortObj>() {
    155 
    156                     @Override
    157                     public int compare(SortObj o1, SortObj o2) {
    158                         return o2.getValue() - o1.getValue();
    159                     }
    160                 });
    161                 
    162                 System.out.println("HOT_CHANNLE:"+channel);
    163                 for(int i = 0 ; i < 3 ; i++){
    164                     SortObj sortObj = list.get(i);
    165                     System.out.println(sortObj.getKey() + "===" + sortObj.getValue());
    166                 }
    167             }
    168         });
    169     }
    170 }
    View Code
    result

     

     
  • 相关阅读:
    wingIDE Pro6 破解教程
    C++中的访问权限
    解决wine中文字体方块或乱码
    linux下目录的作用
    linux下查看系统信息
    Windows Eclipse Maven 安装
    Centos SVN 搭建
    Mysql MyISAM 与 InnoDB 效率
    Linux删除除指定后缀外的所有文件
    mysql 多个timestamp设置自动更新 错误:there can be only one TIMESTAMP column with CURRENT_TIMESTAMP
  • 原文地址:https://www.cnblogs.com/haozhengfei/p/5cae4b2ac7fbd25c4e11c4bd1849c1a0.html
Copyright © 2020-2023  润新知