• Spark SQL Operations


    DSL-style syntax

    1. View the contents of the DataFrame

    scala> df1.show
    +---+--------+---+
    | id|    name|age|
    +---+--------+---+
    |  1|zhansgan| 16|
    |  2|    lisi| 18|
    |  3|  wangwu| 21|
    |  4|xiaofang| 22|
    +---+--------+---+

    2. View data from selected columns of the DataFrame

    scala> df1.select(df1.col("name")).show
    +--------+
    |    name|
    +--------+
    |zhansgan|
    |    lisi|
    |  wangwu|
    |xiaofang|
    +--------+
    

      

    scala> df1.select(col("name"), col("age")).show
    +--------+---+
    |    name|age|
    +--------+---+
    |zhansgan| 16|
    |    lisi| 18|
    |  wangwu| 21|
    |xiaofang| 22|
    +--------+---+

    scala> df1.select("name").show
    +--------+
    |    name|
    +--------+
    |zhansgan|
    |    lisi|
    |  wangwu|
    |xiaofang|
    +--------+


    3. View the DataFrame's schema

    scala> df1.printSchema
    root
     |-- id: integer (nullable = false)
     |-- name: string (nullable = true)
     |-- age: integer (nullable = false)

    id and age are not nullable because they come from the Int fields of the case class the DataFrame was built from (a Scala Int can never be null), while the String column name is nullable.
    

    4. Select name and age, adding 1 to age

    scala> df1.select(col("name"), col("age") + 1).show
    +--------+---------+
    |    name|(age + 1)|
    +--------+---------+
    |zhansgan|       17|
    |    lisi|       19|
    |  wangwu|       22|
    |xiaofang|       23|
    +--------+---------+
    

      

    scala> df1.select(df1("name"), df1("age") + 1).show
    +--------+---------+
    |    name|(age + 1)|
    +--------+---------+
    |zhansgan|       17|
    |    lisi|       19|
    |  wangwu|       22|
    |xiaofang|       23|
    +--------+---------+


    5. Filter for people older than 20

    scala> df1.filter(col("age") > 20).show
    +---+--------+---+
    | id|    name|age|
    +---+--------+---+
    |  3|  wangwu| 21|
    |  4|xiaofang| 22|
    +---+--------+---+
    

      

    6. Group by age and count the people of each age

    scala> df1.groupBy("age").count().show
    +---+-----+
    |age|count|
    +---+-----+
    | 16|    1|
    | 18|    1|
    | 21|    1|
    | 22|    1|
    +---+-----+
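
    For comparison, the same aggregation can be written in SQL style; a minimal sketch, assuming the t_person registration shown in the next section has already been done:

    scala> sqlContext.sql("select age, count(*) as cnt from t_person group by age").show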
    

      

    SQL style

    Before using the SQL style, the DataFrame must first be registered as a table:

    df1.registerTempTable("t_person")
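
    Note: registerTempTable is the Spark 1.x API; since Spark 2.0 it is deprecated in favor of the equivalent:

    df1.createOrReplaceTempView("t_person")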
    

    1. Query the two oldest people

    scala> sqlContext.sql("select * from t_person order by age desc limit 2").show
    +---+--------+---+
    | id| name|age|
    +---+--------+---+
    | 4|xiaofang| 22|
    | 3| wangwu| 21|
    +---+--------+---+
    

      

    2. Show the table's schema

    scala> sqlContext.sql("desc t_person").show
    +--------+---------+-------+
    |col_name|data_type|comment|
    +--------+---------+-------+
    | id| int| |
    | name| string| |
    | age| int| |
    +--------+---------+-------+
    

      

    DataFrame API operations

    package bigdata.spark.sql
    
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.{SparkContext, SparkConf}
    
    /**
      * Created by Administrator on 2017/4/27.
      */
    object SparkSqlDemo {
    
      def main(args: Array[String]) {
        val conf = new SparkConf()
        conf.setAppName("SparkSqlDemo")
        conf.setMaster("local")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)
        val rdd1 = sc.textFile("hdfs://m1:9000/persons.txt").map(_.split(" "))
        val rdd2 = rdd1.map(x => Person(x(0).toInt, x(1), x(2).toInt))
    
        // Import the implicit conversions, which include the method that
        // converts an RDD of case-class instances to a DataFrame
        import sqlContext.implicits._
        // df1 is now a DataFrame
        val df1 = rdd2.toDF
        df1.show
    
        // Equivalent ways of selecting a column
        df1.select("age").show()
        // (the same call, written with a named argument for select's parameter)
        df1.select(col = "age").show()
        df1.select(df1.col("age")).show()
    
        // import df1._ brings the DataFrame's members (including the col
        // method) into scope, so the df1 prefix can be dropped
        import df1._
        df1.select(col("age")).show()
    
        // Selects a single boolean column (age > 20); it does not filter rows
        df1.select(col("age") > 20).show()
    
        df1.select(col("age") + 1).show()
    
        // filter keeps only the rows whose age is greater than 20
        df1.filter(col("age") > 20).show()
    
        df1.registerTempTable("t_person")
    
        sqlContext.sql("select * from t_person").show()
    
        sqlContext.sql("select * from t_person order by age desc limit 2").show()
    
        sc.stop()
    
      }
    
      // The case class must be defined outside the main method; defining it
      // inside fails because Spark cannot obtain a TypeTag for a method-local class
      case class Person(id: Int, name: String, age: Int)
    
    }
    

      

    Specifying the schema with StructType

    package bigdata.spark.sql
    
    import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.{SparkContext, SparkConf}
    
    /**
      * Created by Administrator on 2017/4/27.
      */
    object SparkSqlDemo {
    
      def main(args: Array[String]) {
        val conf = new SparkConf()
        conf.setAppName("SparkSqlDemo")
        conf.setMaster("local")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)
        val rdd1 = sc.textFile("hdfs://m1:9000/persons.txt").map(_.split(" "))
        // Build an RDD[Row] instead of an RDD of case-class instances
        val rdd2 = rdd1.map(x => Row(x(0).toInt, x(1), x(2).toInt))
        // Define the schema explicitly
        val schema = StructType(
          List(
            // field name, data type, nullable
            StructField("id", IntegerType, false),
            StructField("name", StringType, false),
            StructField("age", IntegerType, false)
          )
        )
    
        // Create the DataFrame from the RDD[Row] and the schema
        val df1 = sqlContext.createDataFrame(rdd2, schema)
    
        df1.registerTempTable("t_person")
    
        sqlContext.sql("select * from t_person").show()
    
        sc.stop()
    
      }
    
    }
    

      

    Spark SQL and relational databases

    Spark SQL can read data from a relational database to create a DataFrame, and it can also write data back to a relational database.

    1. Create the database

    CREATE DATABASE spark DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;

    2. Create the person table

    create table person(id int, name varchar(200), age int);
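
    Note: the MySQL JDBC driver (Connector/J) must be on the classpath for the code below to run; the exact version is an assumption and should match your MySQL server. A minimal build.sbt line:

    // build.sbt -- MySQL Connector/J (version is illustrative)
    libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.38"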

    3. Spark reading from and writing to the relational database

    package bigdata.spark.sql
    
    import java.util.Properties
    
    import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
    import org.apache.spark.sql.{SaveMode, Row, SQLContext}
    import org.apache.spark.{SparkContext, SparkConf}
    
    /**
      * Created by Administrator on 2017/4/27.
      */
    object SparkSqlDemo {
    
      def main(args: Array[String]) {
        val conf = new SparkConf()
        conf.setAppName("SparkSqlDemo")
        conf.setMaster("local")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)
        val rdd1 = sc.textFile("hdfs://m1:9000/persons.txt").map(_.split(" "))
        val rdd2 = rdd1.map(x => Row(x(0).toInt, x(1), x(2).toInt))
        // Define the schema explicitly
        val schema = StructType(
          List(
            // field name, data type, nullable
            StructField("id", IntegerType, false),
            StructField("name", StringType, false),
            StructField("age", IntegerType, false)
          )
        )
    
        // JDBC connection properties
        val props = new Properties()
        props.put("user", "root")
        props.put("password", "root")
    
        // Create the DataFrame
        val df1 = sqlContext.createDataFrame(rdd2, schema)
    
        // Write to the person table in append mode
        df1.write.mode(SaveMode.Append).jdbc("jdbc:mysql://m1:3306/spark", "person", props)
    
        // Read the data back from the database
        sqlContext.read.jdbc("jdbc:mysql://m1:3306/spark", "person", props).show()
    
        sc.stop()
    
      }
    
    }
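
    If the driver class is not picked up automatically, it can be named explicitly through the documented "driver" connection property; the class name below is the one used by MySQL Connector/J 5.x:

    props.put("driver", "com.mysql.jdbc.Driver")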
    

      

  • Original article: https://www.cnblogs.com/heml/p/6774193.html