• SparkSQL DSL开发(Old)


    import org.apache.spark.sql.SQLContext
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.{SparkConf, SparkContext}


    case class Person2(name: String, age: Int, sex: String, salary: Int, deptNo: Int)

    case class Dept(deptNo: Int, deptName: String)

    object SparkSQLDSLDemo {
    def main(args: Array[String]): Unit = {
    // 1. 上下文创建
    val conf = new SparkConf()
    .setAppName("demo")
    .setMaster("local[*]")
    val sc = SparkContext.getOrCreate(conf)
    // 当使用HiveContext的时候需要给定jvm的参数:-XX:PermSize=128M -XX:MaxPermSize=256M
    val sqlContext = new HiveContext(sc)
    import sqlContext.implicits._
    import org.apache.spark.sql.functions._

    sqlContext.udf.register("sexToNum", (sex: String) => {
    sex.toUpperCase match {
    case "M" => 0
    case "F" => 1
    case _ => -1
    }
    })
    sqlContext.udf.register("self_avg", SelfAvgUDAF)

    // 2. 直接创建模拟数据
    val rdd1 = sc.parallelize(Array(
    Person2("张三", 21, "M", 1235, 1),
    Person2("李四", 20, "F", 1235, 1),
    Person2("王五", 26, "M", 1235, 1),
    Person2("小明", 25, "F", 1225, 1),
    Person2("小花", 24, "F", 1425, 1),
    Person2("小华", 23, "M", 1215, 1),
    Person2("gerry", 22, "F", 1415, 2),
    Person2("tom", 21, "F", 1855, 2),
    Person2("lili", 20, "F", 1455, 2),
    Person2("莉莉", 18, "M", 1635, 2)
    ))
    val rdd2 = sc.parallelize(Array(
    Dept(1, "部门1"),
    Dept(2, "部门2")
    ))

    val personDataFrame = rdd1.toDF()
    val deptDataFrame = rdd2.toDF()

    // ====DSL==================================
    // cache 多次使用的DataFrame
    personDataFrame.cache()
    deptDataFrame.cache()

    // select语法
    println("----select-----")
    personDataFrame.select("name", "age", "sex").show()
    personDataFrame.select($"name", $"age", $"sex".as("sex1")).show()
    personDataFrame.select(col("name").as("name1"), col("age").as("age1"), col("sex")).show()
    personDataFrame.selectExpr("name", "age", "sex", "sexToNum(sex) as sex_num").show()


    // where/filter
    println("------where/filter-------")
    personDataFrame.where("age > 22").where("sex = 'M'").where("deptNo = 1").show()
    personDataFrame.where("age > 20 AND sex='M' AND deptNo = 1").show()
    personDataFrame.where($"age" > 20 && $"sex" === "M" && $"deptNo" === 1).show()
    personDataFrame.where($"age" > 20 && $"deptNo" === 1 && ($"sex" !== "F")).show()

    // sort
    println("-----------sort--------------")
    // 全局排序
    personDataFrame.sort("salary").select("name", "salary").show()
    personDataFrame.sort($"salary".desc).select("name", "salary", "age").show()
    personDataFrame.sort($"salary".desc, $"age".asc).select("name", "salary", "age").show()
    personDataFrame.orderBy($"salary".desc, $"age".asc).select("name", "salary", "age").show()
    personDataFrame
    .repartition(5)
    .orderBy($"salary".desc, $"age")
    .select("name", "salary", "age").show()
    // 局部排序(按照分区进行排序)
    personDataFrame
    .repartition(5)
    .sortWithinPartitions($"salary".desc, $"age".asc)
    .select("name", "salary", "age")
    .show()

    // group by
    personDataFrame
    .groupBy("sex")
    .agg(
    "salary" -> "avg",
    "salary" -> "sum"
    )
    .show()
    personDataFrame
    .groupBy("sex")
    .agg(
    avg("salary").as("avg_salary"),
    min("salary").as("min_salary"),
    count(lit(1)).as("cnt")
    )
    .show()
    personDataFrame
    .groupBy("sex")
    .agg(
    "salary" -> "self_avg"
    )
    .show()

    // limit
    personDataFrame.limit(2).show()

    // ==join===============
    println("----------join--------------------")
    personDataFrame.join(deptDataFrame).show()
    // 无法判断deptNo属于哪个DataFrame的会报错
    // personDataFrame.join(deptDataFrame, $"deptNo" === $"deptNo")
    personDataFrame.join(deptDataFrame.toDF("col1", "deptName"), $"deptNo" === $"col1", "inner").show()
    personDataFrame.join(deptDataFrame, "deptNo").show()
    personDataFrame
    .join(deptDataFrame.toDF("deptNo", "name"), Seq("deptNo"), "left_outer")
    .toDF("no", "name", "age", "sex", "sal", "dname")
    .show()

    // ===窗口分析函数=======必须要是是使用HiveContext对象
    /** *
    * 按照deptNo分组,组内按照salary进行排序,获取每个部门前3个销售额的用户信息
    * select *
    * from
    * (select *, ROW_NUMBER() OVER (Partition by deptNo Order by salary desc) as rnk
    * from person) as tmp
    * where tmp.rnk <= 3
    */
    val w = Window.partitionBy("deptNo").orderBy($"salary".desc, $"age".asc)
    personDataFrame
    .select(
    $"name", $"age", $"deptNo", $"salary",
    row_number().over(w).as("rnk")
    )
    .where("rnk <= 3")
    .show()

    // 清除缓存
    personDataFrame.unpersist()
    personDataFrame.unpersist()
    }
    }
    ————————————————
    版权声明:本文为CSDN博主「weixin_40652340」的原创文章,遵循CC 4.0 by-sa版权协议,转载请附上原文出处链接及本声明。
    原文链接:https://blog.csdn.net/weixin_40652340/article/details/79207455

  • 相关阅读:
    Bzoj1027 [JSOI2007]合金
    Bzoj4318 OSU!
    Bzoj3931 [CQOI2015]网络吞吐量
    Bzoj3551 [ONTAK2010]Peaks加强版
    Bzoj3545 [ONTAK2010]Peaks
    Bzoj4031 [HEOI2015]小Z的房间
    Bzoj3613 [Heoi2014]南园满地堆轻絮
    Bzoj4516 [Sdoi2016]生成魔咒
    HDU1847 Good Luck in CET-4 Everybody!
    HDU1846 Brave Game
  • 原文地址:https://www.cnblogs.com/Diyo/p/11410839.html
Copyright © 2020-2023  润新知