• Spark项目之电商用户行为分析大数据平台之(十二)Spark上下文构建及模拟数据生成


    一、模拟生成数据

      1 package com.bw.test;
      2 
      3 import java.util.ArrayList;
      4 import java.util.Arrays;
      5 import java.util.List;
      6 import java.util.Random;
      7 import java.util.UUID;
      8 
      9 import com.bw.util.DateUtils;
     10 import com.bw.util.StringUtils;
     11 import org.apache.spark.api.java.JavaRDD;
     12 import org.apache.spark.api.java.JavaSparkContext;
     13 import org.apache.spark.sql.DataFrame;
     14 import org.apache.spark.sql.Row;
     15 import org.apache.spark.sql.RowFactory;
     16 import org.apache.spark.sql.SQLContext;
     17 import org.apache.spark.sql.types.DataTypes;
     18 import org.apache.spark.sql.types.StructType;
     19 
     20 
     21 /**
     22  * 模拟数据程序
     23  * @author Administrator
     24  *
     25  */
     26 public class MockData {
     27 
     28     /**
     29      * 模拟数据
     30      * @param sc
     31      * @param sqlContext
     32      */
     33     public static void mock(JavaSparkContext sc,
     34             SQLContext sqlContext) {
     35         List<Row> rows = new ArrayList<Row>();
     36         
     37         String[] searchKeywords = new String[] {"火锅", "蛋糕", "重庆辣子鸡", "重庆小面",
     38                 "呷哺呷哺", "新辣道鱼火锅", "国贸大厦", "太古商场", "日本料理", "温泉"};
     39         String date = DateUtils.getTodayDate();
     40         String[] actions = new String[]{"search", "click", "order", "pay"};
     41         Random random = new Random();
     42         
     43         for(int i = 0; i < 100; i++) {
     44             //生产100个userID
     45             long userid = random.nextInt(100);    
     46             
     47             for(int j = 0; j < 10; j++) {
     48                 //每个userID有10个sessionID
     49                 String sessionid = UUID.randomUUID().toString().replace("-", "");  
     50                 String baseActionTime = date + " " + random.nextInt(23);
     51                 
     52                 Long clickCategoryId = null;
     53                 //每个sessionID可能会做0-100之间的action操作
     54                 for(int k = 0; k < random.nextInt(100); k++) {
     55                     long pageid = random.nextInt(10);    
     56                     String actionTime = baseActionTime + ":" + StringUtils.fulfuill(String.valueOf(random.nextInt(59))) + ":" + StringUtils.fulfuill(String.valueOf(random.nextInt(59)));
     57                     String searchKeyword = null;
     58                     Long clickProductId = null;
     59                     String orderCategoryIds = null;
     60                     String orderProductIds = null;
     61                     String payCategoryIds = null;
     62                     String payProductIds = null;
     63                     
     64                     String action = actions[random.nextInt(4)];
     65                     if("search".equals(action)) {
     66                         searchKeyword = searchKeywords[random.nextInt(10)];   
     67                     } else if("click".equals(action)) {
     68                         if(clickCategoryId == null) {
     69                             clickCategoryId = Long.valueOf(String.valueOf(random.nextInt(100)));    
     70                         }
     71                         clickProductId = Long.valueOf(String.valueOf(random.nextInt(100)));  
     72                     } else if("order".equals(action)) {
     73                         orderCategoryIds = String.valueOf(random.nextInt(100));  
     74                         orderProductIds = String.valueOf(random.nextInt(100));
     75                     } else if("pay".equals(action)) {
     76                         payCategoryIds = String.valueOf(random.nextInt(100));  
     77                         payProductIds = String.valueOf(random.nextInt(100));
     78                     }
     79                     
     80                     Row row = RowFactory.create(date, userid, sessionid, 
     81                             pageid, actionTime, searchKeyword,
     82                             clickCategoryId, clickProductId,
     83                             orderCategoryIds, orderProductIds,
     84                             payCategoryIds, payProductIds, 
     85                             Long.valueOf(String.valueOf(random.nextInt(10))));    
     86                     rows.add(row);
     87                 }
     88             }
     89         }
     90         
     91         JavaRDD<Row> rowsRDD = sc.parallelize(rows);
     92         
     93         StructType schema = DataTypes.createStructType(Arrays.asList(
     94                 DataTypes.createStructField("date", DataTypes.StringType, true),
     95                 DataTypes.createStructField("user_id", DataTypes.LongType, true),
     96                 DataTypes.createStructField("session_id", DataTypes.StringType, true),
     97                 DataTypes.createStructField("page_id", DataTypes.LongType, true),
     98                 DataTypes.createStructField("action_time", DataTypes.StringType, true),
     99                 DataTypes.createStructField("search_keyword", DataTypes.StringType, true),
    100                 DataTypes.createStructField("click_category_id", DataTypes.LongType, true),
    101                 DataTypes.createStructField("click_product_id", DataTypes.LongType, true),
    102                 DataTypes.createStructField("order_category_ids", DataTypes.StringType, true),
    103                 DataTypes.createStructField("order_product_ids", DataTypes.StringType, true),
    104                 DataTypes.createStructField("pay_category_ids", DataTypes.StringType, true),
    105                 DataTypes.createStructField("pay_product_ids", DataTypes.StringType, true),
    106                 DataTypes.createStructField("city_id", DataTypes.LongType, true)));
    107 
    108         DataFrame df = sqlContext.createDataFrame(rowsRDD, schema);
    109 
    110         df.registerTempTable("user_visit_action");  
    111         for(Row _row : df.take(1)) {
    112             System.out.println(_row);  
    113         }
    114         
    115         /**
    116          * ==================================================================
    117          */
    118         
    119         rows.clear();
    120         String[] sexes = new String[]{"male", "female"};
    121         for(int i = 0; i < 100; i ++) {
    122             long userid = i;
    123             String username = "user" + i;
    124             String name = "name" + i;
    125             int age = random.nextInt(60);
    126             String professional = "professional" + random.nextInt(100);
    127             String city = "city" + random.nextInt(100);
    128             String sex = sexes[random.nextInt(2)];
    129             
    130             Row row = RowFactory.create(userid, username, name, age, 
    131                     professional, city, sex);
    132             rows.add(row);
    133         }
    134         
    135         rowsRDD = sc.parallelize(rows);
    136         
    137         StructType schema2 = DataTypes.createStructType(Arrays.asList(
    138                 DataTypes.createStructField("user_id", DataTypes.LongType, true),
    139                 DataTypes.createStructField("username", DataTypes.StringType, true),
    140                 DataTypes.createStructField("name", DataTypes.StringType, true),
    141                 DataTypes.createStructField("age", DataTypes.IntegerType, true),
    142                 DataTypes.createStructField("professional", DataTypes.StringType, true),
    143                 DataTypes.createStructField("city", DataTypes.StringType, true),
    144                 DataTypes.createStructField("sex", DataTypes.StringType, true)));
    145         
    146         DataFrame df2 = sqlContext.createDataFrame(rowsRDD, schema2);
    147         for(Row _row : df2.take(1)) {
    148             System.out.println(_row);  
    149         }
    150         
    151         df2.registerTempTable("user_info");  
    152         
    153         /**
    154          * ==================================================================
    155          */
    156         rows.clear();
    157         
    158         int[] productStatus = new int[]{0, 1};
    159         
    160         for(int i = 0; i < 100; i ++) {
    161             long productId = i;
    162             String productName = "product" + i;
    163             String extendInfo = "{"product_status": " + productStatus[random.nextInt(2)] + "}";    
    164             
    165             Row row = RowFactory.create(productId, productName, extendInfo);
    166             rows.add(row);
    167         }
    168         
    169         rowsRDD = sc.parallelize(rows);
    170         
    171         StructType schema3 = DataTypes.createStructType(Arrays.asList(
    172                 DataTypes.createStructField("product_id", DataTypes.LongType, true),
    173                 DataTypes.createStructField("product_name", DataTypes.StringType, true),
    174                 DataTypes.createStructField("extend_info", DataTypes.StringType, true)));
    175         
    176         DataFrame df3 = sqlContext.createDataFrame(rowsRDD, schema3);
    177         for(Row _row : df3.take(1)) {
    178             System.out.println(_row);  
    179         }
    180         
    181         df3.registerTempTable("product_info"); 
    182     }
    183     
    184 }
    View Code

    二、构建Spark上下文

     1 import com.bw.conf.ConfigurationManager;
     2 import com.bw.constant.Constants;
     3 import com.bw.test.MockData;
     4 import org.apache.spark.SparkConf;
     5 import org.apache.spark.api.java.JavaSparkContext;
     6 import org.apache.spark.sql.SQLContext;
     7 
     8 
     9 /**
    10  * 用户访问session分析Spark作业
    11  *
    12  * */
    13 public class UserVisitSessionAnalyzeSpark {
    14 
    15     public static void main(String[] args) {
    16         //构建Spark上下文
    17         SparkConf sparkConf = new SparkConf();
    18         //Spark作业本地运行
    19         sparkConf.setMaster("local");
    20         //为了符合大型企业的开发需求,不能出现硬编码,创建一个Constants接口类,定义一些常量
    21         sparkConf.setAppName(Constants.SPARK_APP_NAME_SESSION);
    22 
    23         JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    24         SQLContext sqlContext = new SQLContext(jsc);
    25 
    26         mockData(jsc,sqlContext);
    27         jsc.stop();
    28     }
    29 
    30 
    31     /**
    32      * 生成模拟数据(只有本地模式,才会去生成模拟数据)
    33      * @param sc
    34      * @param sqlContext
    35      */
    36     private static void mockData(JavaSparkContext sc, SQLContext sqlContext) {
    37         boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    38         if(local) {
    39             MockData.mock(sc, sqlContext);
    40         }
    41     }
    42 }
    View Code

    三、打印的测试数据

    3.1 user_visit_action

    用户下的订单

    [2018-05-23,34,4ad62c0824194e5687467bb84b9beeb9,3,2018-05-23 18:27:37,null,null,null,null,null,8,64,8]

    3.2 user_info

    [0,user0,name0,26,professional11,city4,male]

    3.3 product_info

    [0,product0,{"product_status": 1}]
  • 相关阅读:
    11C++11通用为本,专用为末_2
    10C++11通用为本,专用为末_1
    09C++11保证稳定性和兼容性
    21变量名的力量_2
    NOIP2018 游记
    CF767C 经典的树形DP
    CF1A Theatre Square
    洛谷P1720 月落乌啼算钱
    洛谷P3388 缩点
    NOIP2017D2T1 奶酪 洛谷P3958
  • 原文地址:https://www.cnblogs.com/qingyunzong/p/9079144.html
Copyright © 2020-2023  润新知