Reference: http://spark.apache.org/docs/latest/sql-programming-guide.html
1) Build the Scala project with Maven.
1.1) Add the following dependencies to pom.xml (a sketch of the Scala build plugin follows the dependency list):
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>2.2.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>2.2.2</version>
    <!--<scope>runtime</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>2.2.2</version>
</dependency>
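The dependencies alone will not compile Scala sources under Maven; a Scala compiler plugin is also needed in the <build> section. Below is a minimal sketch using the scala-maven-plugin; the version number is only an example, pick whichever release matches your Scala toolchain:
<build>
    <plugins>
        <!-- Compiles src/main/scala and src/test/scala; version shown is an example -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>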
1.2) Create a new Scala class. The code, with a description of each step, is as follows:
package com.fishjar.sparksql
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
/**
* Created by fishjar on 2018/8/22.
*/
object SparkSqlDemo {
//A case class must be defined outside the function that references it; since this demo only has a main function, Person is defined at the object level
case class Person(name: String, age: Long)
def main(args: Array[String]): Unit = {
val path1="D:\test\people.json"
val path2="D:\test\people.txt"
//1)创建SparkSession
System.setProperty("hadoop.home.dir","F:\hadp-dev\hadoop-2.7.1");
val spark = SparkSession
.builder()
.appName("Spark SQL basic example")
.master("local[2]")
.config("spark.some.config.option", "some-value")
.getOrCreate()
// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._
//2) Create a DataFrame
val df = spark.read.json(path1)
//2.1) Show the contents of the DataFrame
df.show()
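// If path1 points at the people.json sample that ships with the Spark distribution
// (see the note on sample data files at the end of this post), the call above prints:
// +----+-------+
// | age|   name|
// +----+-------+
// |null|Michael|
// |  30|   Andy|
// |  19| Justin|
// +----+-------+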
//2.2) Print the schema in a tree format
df.printSchema()
//2.3) Select only the "name" column
df.select("name").show()
//2.4) Select everybody, but increment the age by 1
df.select($"name", $"age" + 1).show()
//2.5) Select people older than 21
df.filter($"age" > 21).show()
//2.6) Count people by age
df.groupBy("age").count().show()
//3) Register the DataFrame as a view and run SQL queries against it
//3.1) Register the DataFrame as a temporary view
df.createOrReplaceTempView("people")
//3.2) Query the view
val sqlDF = spark.sql("SELECT * FROM people")
println("*******createOrReplaceTempView*******")
sqlDF.show()
//3.3) Register the DataFrame as a global temporary view
df.createGlobalTempView("people")
println("*******createGlobalTempView*****")
spark.sql("SELECT * FROM global_temp.people").show()
spark.newSession().sql("SELECT * FROM global_temp.people").show()
//4) Create Datasets
println("*******4) Create Datasets*****")
// Encoders are created for case classes
val caseClassDS = Seq(Person("Andy", 32)).toDS()
caseClassDS.show()
// Encoders for most common types are automatically provided by importing spark.implicits._
val primitiveDS = Seq(1, 2, 3).toDS()
primitiveDS.map(_ + 1).collect() // Returns: Array(2, 3, 4)
for(arg <- primitiveDS ) println(arg)
// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name
val peopleDS = spark.read.json(path1).as[Person]
println("********peopleDS*******")
peopleDS.show()
//5) Inferring the schema using reflection
println("5) Inferring the schema using reflection")
// For implicit conversions from RDDs to DataFrames
import spark.implicits._
// Create an RDD of Person objects from a text file, convert it to a Dataframe
val peopleDF = spark.sparkContext
.textFile("D:\test\aaa.txt")
.map(_.split(","))
.map(attributes => Person(attributes(1), attributes(0).trim.toInt))
.toDF()
// Register the DataFrame as a temporary view
peopleDF.createOrReplaceTempView("people")
// SQL statements can be run by using the sql methods provided by Spark
val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
teenagersDF.show(1)
// The columns of a row in the result can be accessed by field index
teenagersDF.map(teenager => "Name: " + teenager(0)).show()
// or by field name
teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
// No pre-defined encoders for Dataset[Map[K,V]], define explicitly
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
// Primitive types and case classes can be also defined as
// implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()
// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
println("encoders")
teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
//6 Programmatically Specifying the Schema
println("**********Programmatically Specifying the Schema")
import org.apache.spark.sql.types._
// Create an RDD
val peopleRDD = spark.sparkContext.textFile("D:\\test\\people.txt")
// The schema is encoded in a string
val schemaString = "name age"
// Generate the schema based on the string of schema
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
// Convert records of the RDD (people) to Rows
val rowRDD = peopleRDD
.map(_.split(","))
.map(attributes => Row(attributes(0), attributes(1).trim))
// Apply the schema to the RDD
val peopleDF2 = spark.createDataFrame(rowRDD, schema)
// Creates a temporary view using the DataFrame
peopleDF2.createOrReplaceTempView("people")
// SQL can be run over a temporary view created using DataFrames
val results = spark.sql("SELECT name FROM people")
results.map(attributes => "Name: " + attributes(0)).show()
results.show()
peopleDF2.show()
//7) Untyped user-defined aggregate function
println("*********7) Untyped user-defined aggregate function************")
// Register the function to access it
spark.udf.register("myAverage", MyAverage)
val df2 = spark.read.json("D:\\test\\employees.json")
df2.createOrReplaceTempView("employees")
df2.show()
val result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees")
result.show()
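// With the employees.json sample from the Spark distribution (salaries 3000, 4500, 3500 and 4000),
// this prints a single row with average_salary = 3750.0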
//8) Type-safe user-defined aggregate function
println("**********8) Type-safe user-defined aggregate function**********")
val ds = spark.read.json("D:\\test\\employees.json").as[Employee]
ds.show()
// Convert the function to a `TypedColumn` and give it a name
val averageSalary = MyAverage2.toColumn.name("average_salary")
val result2 = ds.select(averageSalary)
result2.show()
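// Stop the SparkSession to release local resources once the demo has finished
// (not strictly required for a short local run, but good practice)
spark.stop()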
}
object MyAverage extends UserDefinedAggregateFunction {
// Data types of input arguments of this aggregate function
def inputSchema: StructType = StructType(StructField("inputColumn", LongType) :: Nil)
// Data types of values in the aggregation buffer
def bufferSchema: StructType = {
StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
}
// The data type of the returned value
def dataType: DataType = DoubleType
// Whether this function always returns the same output on the identical input
def deterministic: Boolean = true
// Initializes the given aggregation buffer. The buffer itself is a `Row` that in addition to
// standard methods like retrieving a value at an index (e.g., get(), getBoolean()), provides
// the opportunity to update its values. Note that arrays and maps inside the buffer are still
// immutable.
def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer(0) = 0L
buffer(1) = 0L
}
// Updates the given aggregation buffer `buffer` with new input data from `input`
def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buffer(0) = buffer.getLong(0) + input.getLong(0)
buffer(1) = buffer.getLong(1) + 1
}
}
// Merges two aggregation buffers and stores the updated buffer values back to `buffer1`
def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
}
// Calculates the final result
def evaluate(buffer: Row): Double = buffer.getLong(0).toDouble / buffer.getLong(1)
}
case class Employee(name: String, salary: Long)
case class Average(var sum: Long, var count: Long)
object MyAverage2 extends Aggregator[Employee, Average, Double] {
// A zero value for this aggregation. Should satisfy the property that any b + zero = b
def zero: Average = Average(0L, 0L)
// Combine two values to produce a new value. For performance, the function may modify `buffer`
// and return it instead of constructing a new object
def reduce(buffer: Average, employee: Employee): Average = {
buffer.sum += employee.salary
buffer.count += 1
buffer
}
// Merge two intermediate values
def merge(b1: Average, b2: Average): Average = {
b1.sum += b2.sum
b1.count += b2.count
b1
}
// Transform the output of the reduction
def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count
// Specifies the Encoder for the intermediate value type
def bufferEncoder: Encoder[Average] = Encoders.product
// Specifies the Encoder for the final output value type
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
}
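Note on sample data: the demo assumes a few small files under D:\test. The samples that ship with the Spark distribution under examples/src/main/resources can be copied there as-is (adjust the paths if yours differ); their contents are roughly:

people.json
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

people.txt
Michael, 29
Andy, 30
Justin, 19

employees.json
{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}

aaa.txt is a custom file; given how the reflection example parses it (age first, then name), each line should look like "29,Michael".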