• Spark DataFrame常用API


    Spark DataFrame常用API

    package com.imooc.bigdata.chapter04
    
    import org.apache.spark.sql.{DataFrame, SparkSession}
    
    /**
      * Walk-through of commonly used Spark DataFrame operations.
      *
      * Loads two JSON data sets (`people.json` and `zips.json`) and demonstrates
      * projection, filtering, grouping, derived columns, column renaming, ordering
      * and the equivalent SQL queries issued through temporary views.
      *
      * Usage: `DataFrameAPIApp [dataDir]` -- when `dataDir` is omitted the
      * original hard-coded sample directory is used.
      */
    object DataFrameAPIApp {

      /**
        * Directory holding the sample JSON files when no argument is supplied.
        *
        * Written with forward slashes: in an ordinary Scala string literal a
        * single backslash starts an escape sequence, and sequences such as
        * `\S`, `\s`, `\d` or `\p` are not valid escapes, so the Windows-style
        * path with single backslashes does not compile.
        */
      private val DefaultDataDir: String =
        "E:/06-work/03-java/01-JavaCodeDome/SparkSqlCode/sparksql-train/data"

      /**
        * Runs every demo query and prints the results to stdout.
        *
        * @param args optional; `args(0)` overrides the directory that contains
        *             `people.json` and `zips.json`
        */
      def main(args: Array[String]): Unit = {

        val dataDir: String = args.headOption.getOrElse(DefaultDataDir)

        val spark = SparkSession.builder().master("local").appName("DataFrameAPIApp").getOrCreate()
        import spark.implicits._

        // Stop the session even when a read or a query fails, so the local
        // Spark context is not leaked on the error path.
        try {
          val people: DataFrame = spark.read.json(s"$dataDir/people.json")

          // Inspect the DataFrame structure: column names, data types, nullability.
          people.printSchema()

          // Display the data held by the DataFrame.
          people.show()

          // Keep only the name column ==> select name from people
          people.select("name").show()
          people.select($"name").show()

          // select * from people where age > 21
          people.filter($"age" > 21).show()
          people.filter("age > 21").show()

          // select age, count(1) from people group by age
          people.groupBy("age").count().show()

          // select name, age + 10 from people
          people.select($"name", ($"age" + 10).as("new_age")).show()

          // Same kind of query expressed in SQL through a temporary view.
          people.createOrReplaceTempView("people")
          spark.sql("select name from people where age > 21").show()

          val zips: DataFrame = spark.read.json(s"$dataDir/zips.json")
          zips.printSchema() // inspect the schema

          /**
            * With the defaults:
            * 1) long values (e.g. loc) are cut off and shown with "..."
            * 2) only the first 20 rows are displayed
            * show() ==> show(20) ==> show(numRows, truncate = true)
            */
          zips.show(10, truncate = false)

          zips.head(3).foreach(println)
          // The results of the next two calls are discarded; they are kept
          // only to demonstrate the API.
          zips.first()
          zips.take(5)

          val count: Long = zips.count()
          println(s"Total Counts: $count")

          // Keep rows with pop > 40000; withColumnRenamed renames a column.
          zips.filter(zips.col("pop") > 40000)
            .withColumnRenamed("_id", "new_id")
            .show(10, truncate = false)

          import org.apache.spark.sql.functions._
          // Ten most populous CA entries with their IDs; desc is a built-in function.
          zips.select("_id", "city", "pop", "state")
            .filter(zips.col("state") === "CA")
            .orderBy(desc("pop"))
            .show(10, truncate = false)

          zips.createOrReplaceTempView("zips")
          spark.sql("select _id,city,pop,state from zips where state='CA' order by pop desc limit 10").show()
        } finally {
          spark.stop()
        }
      }
    }
    

      

  • 相关阅读:
    prototype.js超强的javascript类库
    MySQL Server Architecture
    Know more about RBA redo block address
    MySQL无处不在
    利用Oracle Enterprise Manager Cloud Control 12c创建DataGuard Standby
    LAMP Stack
    9i中DG remote archive可能导致Primary Database挂起
    Oracle数据库升级与补丁
    Oracle为何会发生归档日志archivelog大小远小于联机重做日志online redo log size的情况?
    Oracle Ksplice如何工作?How does Ksplice work?
  • 原文地址:https://www.cnblogs.com/yoyo1216/p/13533892.html
Copyright © 2020-2023  润新知