• Learning from the Spark SQL examples (3)


    UserDefinedTypedAggregation.scala (a user-defined typed aggregation)

    The example defines MyAverage as an Aggregator[Employee, Average, Double]: it consumes Employee rows, accumulates a running sum and count in an Average buffer, and emits the mean salary as a Double.

    
    import org.apache.spark.sql.expressions.Aggregator
    import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
    
    object UserDefinedTypedAggregation {
    
     case class Employee(name: String, salary: Long)
     case class Average(var sum: Long, var count: Long)
    
    
     object MyAverage extends Aggregator[Employee, Average, Double] {
    
      //A zero value for this aggregation. Should satisfy the property that any b + zero = b
      def zero: Average = Average(0L, 0L)
    
      //Combine two values to produce a new value. For performance, the function may modify `buffer`
      //and return it instead of constructing a new object
      def reduce(buffer: Average, employee: Employee): Average = {
       buffer.sum += employee.salary
       buffer.count += 1
       buffer
      }
    
      //Merge two intermediate values
      def merge(b1: Average, b2: Average): Average = {
       b1.sum += b2.sum
       b1.count += b2.count
       b1
      }
    
      //Transform the output of the reduction
      def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count
    
      //Specifies the Encoder for the intermediate value type
      def bufferEncoder: Encoder[Average] = Encoders.product
    
      //Specifies the Encoder for the final output value type
      def outputEncoder: Encoder[Double] = Encoders.scalaDouble
     }
    
    
    
     def main(args: Array[String]): Unit = {
      val spark = SparkSession
        .builder()
        .appName("Spark SQL user-defined Datasets aggregation example")
        .master("local")
        .getOrCreate()
    
      import spark.implicits._
    
      val ds = spark.read.json("/Users/hadoop/app/spark/examples/src/main/resources/employees.json").as[Employee]
      ds.show()
    
      val averageSalary = MyAverage.toColumn.name("average_salary")
      val result = ds.select(averageSalary)
      result.show()
    
    
    
      spark.stop()
     }
    
    }
    
    

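    Assuming the file at the path above matches the stock employees.json that ships in the Spark source tree (Michael/3000, Andy/4500, Justin/3500, Berta/4000), the two show() calls print:

    +-------+------+
    |   name|salary|
    +-------+------+
    |Michael|  3000|
    |   Andy|  4500|
    | Justin|  3500|
    |  Berta|  4000|
    +-------+------+

    +--------------+
    |average_salary|
    +--------------+
    |        3750.0|
    +--------------+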
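    The aggregator is not tied to that JSON file. Below is a minimal sketch (the two sample employees are invented for illustration) that applies the same TypedColumn to an in-memory Dataset, and then per key via groupByKey; it belongs inside main, after `import spark.implicits._`:

    // A minimal sketch -- the sample rows are made up for illustration.
    val inMemory = Seq(Employee("Alice", 2000L), Employee("Bob", 4000L)).toDS()
    inMemory.select(MyAverage.toColumn.name("average_salary")).show() // prints 3000.0

    // The same TypedColumn also works per group:
    inMemory.groupByKey(_.name).agg(MyAverage.toColumn.name("avg_salary")).show()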
