• Java Spark SQL: computing the top-3 ads by click count in each province


    The same requirement as in the post below, implemented here with Spark SQL instead (which is comparatively simpler):

    https://www.cnblogs.com/7749ha/p/12909115.html
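
    The code below expects a space-separated log file with one click record per line, in the field order ts province city user ad. A purely hypothetical sample of D:/tmp/rizhi.txt, made up here only to illustrate the format, might look like:

        1589527000 Hebei Shijiazhuang user1 ad3
        1589527001 Hebei Shijiazhuang user2 ad1
        1589527002 Henan Zhengzhou user1 ad3
        1589527003 Henan Zhengzhou user2 ad3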

    package sparksql;
    import org.apache.spark.SparkContext;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;
    import org.apache.spark.api.java.function.Function;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Author: xiaoshubiao
     * Time: 2020/5/15 16:44
     **/
    public class sparksql_test {
        public static void main(String[] args) throws Exception{
            SparkSession spark = SparkSession
                    .builder()
                    .appName("Java Spark SQL basic example")
                    .master("local[*]") // assumed local run; drop this when submitting via spark-submit
                    .getOrCreate();
            SparkContext sc = spark.sparkContext();
            JavaRDD<String> stringJavaRDD = sc.textFile("D:/tmp/rizhi.txt", 1).toJavaRDD();
            // Build the schema: one nullable string field per column name
            String schemaString = "ts province city user ad";
            List<StructField> fields = new ArrayList<>();
            for (String fieldName : schemaString.split(" ")) {
                fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
            }
            // Assemble the schema and convert each log line into a Row
            StructType structType = DataTypes.createStructType(fields);
            JavaRDD<Row> rowRDD = stringJavaRDD.map((Function<String, Row>) record -> {
                String[] attributes = record.split(" ");
                return RowFactory.create(attributes[0], attributes[1].trim(), attributes[2], attributes[3], attributes[4]);
            });
            Dataset<Row> dataFrame = spark.createDataFrame(rowRDD, structType);
            // Inspect the DataFrame
            dataFrame.show();
            // Register a temporary view so it can be queried with SQL
            dataFrame.createTempView("people");
            // Count clicks per (province, ad), rank ads within each province by
            // click count descending, and keep the top 3 ranks per province
            String sql = "select * from ("
                    + "select province, ad, c, row_number() over (partition by province order by c desc) as ind "
                    + "from (select province, ad, count(*) as c from people group by province, ad) t"
                    + ") t where ind <= 3";
            spark.sql(sql).show();

            // Release resources
            spark.stop();
        }
    }
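
    For comparison, the same top-3 logic can be written with the DataFrame API instead of a SQL string. This is a minimal sketch, not part of the original post: it assumes the dataFrame variable built above, and uses Spark's built-in Window and row_number from org.apache.spark.sql.expressions and org.apache.spark.sql.functions:

    import org.apache.spark.sql.expressions.Window;
    import org.apache.spark.sql.expressions.WindowSpec;
    import static org.apache.spark.sql.functions.col;
    import static org.apache.spark.sql.functions.row_number;

    // Count clicks per (province, ad); groupBy().count() produces a "count" column
    Dataset<Row> counts = dataFrame.groupBy("province", "ad").count();
    // Rank ads within each province by click count, descending
    WindowSpec byProvince = Window.partitionBy("province").orderBy(col("count").desc());
    // Keep only the three highest-ranked ads per province
    counts.withColumn("ind", row_number().over(byProvince))
            .filter(col("ind").leq(3))
            .show();

    Both forms express the same query; the DataFrame version just replaces the SQL string with method calls the compiler can check for typos.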
• Original post: https://www.cnblogs.com/7749ha/p/12910407.html