// 一、方式1:通过反射的方式将 RDD 转成 DataFrame。生产上不建议使用,因为 case class 最多只能定义 22 个字段,有所限制。
package com.spark.sql
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
/**
 * Demonstrates two ways of turning an RDD into a DataFrame with Spark SQL:
 * schema inference by reflection over a case class, and a schema that is
 * assembled programmatically at runtime.
 */
object CovertRdd {

  /** Record type whose fields supply the schema for the reflection-based example. */
  case class Person(name: String, age: Long)

  // Input file read by both examples. The reflection example's input path is
  // not present in the original listing, so it reuses this one.
  // NOTE(review): both examples assume one comma-separated `name, age` record
  // per line -- confirm against the real file.
  private val PeopleFile = "file:///D:/ruoze/people.txt"

  /**
   * Entry point: creates a local two-thread Spark session, runs both
   * conversion examples, and stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("RDD Covert DF").getOrCreate()
    try {
      runInferSchemaExample(spark)
      runProgrammaticSchemaExample(spark)
    } finally {
      // Stop the session even if one of the examples throws.
      spark.stop()
    }
  }

  /**
   * Method 1: converts an RDD into a DataFrame by reflection, letting Spark
   * derive the schema from the [[Person]] case class.
   *
   * @param spark the active Spark session
   */
  private def runInferSchemaExample(spark: SparkSession): Unit = {
    // Provides toDF() on RDDs and the String encoder used by map() below.
    import spark.implicits._

    val peopleDF = spark.sparkContext
      .textFile(PeopleFile)
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()

    // The SQL below refers to the DataFrame by this view name.
    peopleDF.createOrReplaceTempView("people")

    val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
    teenagersDF.map(teenager => "Name: " + teenager(0) + ",Age:" + teenager(1)).show()
  }

  /**
   * Method 2: converts an RDD into a DataFrame programmatically, by building
   * a StructType schema and applying it to an RDD of Rows.
   *
   * @param spark the active Spark session
   */
  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    // Provides the String encoder used by map() on the query result.
    import spark.implicits._

    // Create an RDD of raw text lines.
    val peopleRDD = spark.sparkContext.textFile(PeopleFile)

    // The schema is encoded in a string: one column name per token.
    val schemaString = "name age"

    // Generate the schema from that string; every column is a nullable string.
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // Split each line into its fields before building Rows; indexing the raw
    // line would yield single characters instead of column values.
    val rowRDD = peopleRDD
      .map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))

    // Apply the schema to the RDD.
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    // Register a temporary view so the DataFrame can be queried with SQL.
    peopleDF.createOrReplaceTempView("people")

    val results = spark.sql("SELECT name FROM people")

    // Columns of a result row are accessed here by field index.
    results.map(attributes => "Name: " + attributes(0)).show()
  }
}