scala> import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession scala> val spark=SparkSession.builder().getOrCreate() spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2bdab835 //使支持RDDs转换为DataFrames及后续sql操作 scala> import spark.implicits._ import spark.implicits._ scala> val df = spark.read.json("file:///usr/local/spark/examples/src/main/resources/people.json") df: org.apache.spark.sql.DataFrame = [age: bigint, name: string] scala> df.show() +----+-------+ | age| name| +----+-------+ |null|Michael| | 30| Andy| | 19| Justin| +----+-------+ // 打印模式信息 scala> df.printSchema() root |-- age: long (nullable = true) |-- name: string (nullable = true) // 选择多列 scala> df.select(df("name"),df("age")+1).show() +-------+---------+ | name|(age + 1)| +-------+---------+ |Michael| null| | Andy| 31| | Justin| 20| +-------+---------+ // 条件过滤 scala> df.filter(df("age") > 20 ).show() +---+----+ |age|name| +---+----+ | 30|Andy| +---+----+ // 分组聚合 scala> df.groupBy("age").count().show() +----+-----+ | age|count| +----+-----+ | 19| 1| |null| 1| | 30| 1| +----+-----+ // 排序 scala> df.sort(df("age").desc).show() +----+-------+ | age| name| +----+-------+ | 30| Andy| | 19| Justin| |null|Michael| +----+-------+ //多列排序 scala> df.sort(df("age").desc, df("name").asc).show() +----+-------+ | age| name| +----+-------+ | 30| Andy| | 19| Justin| |null|Michael| +----+-------+ //对列进行重命名 scala> df.select(df("name").as("username"),df("age")).show() +--------+----+ |username| age| +--------+----+ | Michael|null| | Andy| 30| | Justin| 19| +--------+----+ //使用spark sql语句 scala>df.createTempView("table1") scala> spark.sql("select * from table1 limit 10")
以上是我们常用的dataframe的基础操作
具体见一下博客
https://blog.csdn.net/dabokele/article/details/52802150
SparkSQL官网
http://spark.apache.org/docs/1.6.2/api/scala/index.html#org.apache.spark.sql.DataFrame