• spark 练习


    scala> import org.apache.spark.SparkContext

    import org.apache.spark.SparkContext

    scala> import org.apache.spark.SparkConf

    import org.apache.spark.SparkConf

    scala> import org.apache.spark.sql.SQLContext

    import org.apache.spark.sql.SQLContext

    scala> import spark.implicits._

    import spark.implicits._

    scala> val mysqlcon=new SQLContext(sc)
    warning: there was one deprecation warning; re-run with -deprecation for details
    mysqlcon: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@3ac76ad9

    scala> val mysqldf=mysqlcon.read.format("jdbc").options(Map("url"->"jdbc:mysql://localhost:3306/test","user"->"root","password"->"root","dbtable"->"Account_3004")).load()
    mysqldf: org.apache.spark.sql.DataFrame = [AccountName: string, Accid: bigint ... 30 more fields]

    scala> mysqldf.printSchema
    root
     |-- AccountName: string (nullable = false)
     |-- Accid: long (nullable = false)
     |-- platid: integer (nullable = false)
     |-- DateID: integer (nullable = false)
     |-- CreateTime: timestamp (nullable = false)
     |-- Retention1: integer (nullable = false)
     |-- Retention2: integer (nullable = true)
     |-- Retention3: integer (nullable = true)
     |-- Retention4: integer (nullable = true)
     |-- Retention5: integer (nullable = true)
     |-- Retention6: integer (nullable = true)
     |-- Retention7: integer (nullable = true)
     |-- Retention10: integer (nullable = true)
     |-- Retention14: integer (nullable = true)
     |-- Retention21: integer (nullable = true)
     |-- Retention30: integer (nullable = true)
     |-- GameID: integer (nullable = false)
     |-- id: long (nullable = false)
     |-- adcode: string (nullable = true)
     |-- AddRMB1: double (nullable = true)
     |-- AddRMB2: double (nullable = true)
     |-- AddRMB3: double (nullable = true)
     |-- AddRMB4: double (nullable = true)
     |-- AddRMB5: double (nullable = true)
     |-- AddRMB6: double (nullable = true)
     |-- AddRMB7: double (nullable = true)
     |-- AddRMB10: double (nullable = true)
     |-- AddRMB14: double (nullable = true)
     |-- AddRMB21: double (nullable = true)
     |-- AddRMB30: double (nullable = true)
     |-- LoginTimes: integer (nullable = true)
     |-- LoginMinutes: integer (nullable = true)


    scala> mysqldf.count()
    res2: Long = 76813

    scala> mysqldf.show(2)
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |AccountName|   Accid|platid|  DateID|         CreateTime|Retention1|Retention2|Retention3|Retention4|Retention5|Retention6|Retention7|Retention10|Retention14|Retention21|Retention30|GameID|   id|adcode|AddRMB1|AddRMB2|AddRMB3|AddRMB4|AddRMB5|AddRMB6|AddRMB7|AddRMB10|AddRMB14|AddRMB21|AddRMB30|LoginTimes|LoginMinutes|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |           | 1004210|     6|20180116|2018-01-16 10:39:50|         1|         0|         0|         0|         0|         0|         0|          0|          0|          0|          0|  3004|22438|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         1|           7|
    |           |20946754|     0|20170913|2017-09-13 10:02:37|         1|         0|         0|         1|         0|         0|         0|          0|          0|          0|          0|  3004|  167|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         3|         219|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    only showing top 2 rows

    scala> mysqldf.select("accid").show(2)
    +--------+
    |   accid|
    +--------+
    |20964769|
    |22235886|
    +--------+
    only showing top 2 rows


    scala> mysqldf.select("accid","platid").show(2)
    +--------+------+
    |   accid|platid|
    +--------+------+
    | 1004210|     6|
    |20946754|     0|
    +--------+------+
    only showing top 2 rows


    scala> mysqldf.filter($"dateid">20180510).count
    res9: Long = 5101

    scala> mysqldf.select($"accid",$"platid"+1000).show(2)
    +--------+---------------+
    |   accid|(platid + 1000)|
    +--------+---------------+
    | 1004210|           1006|
    |20946754|           1000|
    +--------+---------------+
    only showing top 2 rows

    scala> mysqldf.groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27| 7157|
    |    93|   44|
    |   291|   10|
    |     1| 8503|
    |    13|  290|
    |     6| 4765|
    |     3| 3281|
    |   295|    2|
    | 10000|    1|
    |   191|  758|
    | 24294|    9|
    |    19| 1549|
    |    15| 8838|
    |    17|    6|
    |     9|  365|
    |   286|    1|
    |    35| 4075|
    |     4|10395|
    |   247|    1|
    |   277|  453|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.filter($"dateid">20180520).groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27|  131|
    |    93|   14|
    |   291|    2|
    |     1|  333|
    |    13|   25|
    |     6|  116|
    |     3|   36|
    |   191|  136|
    | 24294|    2|
    |    19|   39|
    |    15|  978|
    |     9|    2|
    |    35|   72|
    |     4|  161|
    |   277|   11|
    |    50|    8|
    |    38|    4|
    |   289|   12|
    |    21|   24|
    |    60|   75|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.createOrReplaceTempView("account")


    scala> val sqldf=spark.sql("select platid,accid,dateid  from account where dateid>=20180601" )
    sqldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]


    scala> sqldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    ------------------------------------

    Spark SQL中的临时视图是会话范围的:如果创建它的会话终止,该视图也随之消失。如果希望拥有一个在所有会话间共享、并在Spark应用程序终止之前一直保持活动状态的临时视图,可以创建全局临时视图。全局临时视图绑定在系统保留的数据库global_temp中,引用时必须使用限定名,例如:SELECT * FROM global_temp.view1。

    --------------------------------------

    scala> mysqldf.createOrReplaceGlobalTempView("tb_acc")

    scala> val globaldf=spark.sql("select platid,accid,dateid  from global_temp.tb_acc where dateid>=20180601" )
    globaldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]

    scala> globaldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    --------------------------

    Datasets are similar to RDDs, however, instead of using Java serialization or Kryo they use a specialized Encoder to serialize the objects for processing or transmitting over the network. While both encoders and standard serialization are responsible for turning an object into bytes, encoders are code generated dynamically and use a format that allows Spark to perform many operations like filtering, sorting and hashing without deserializing the bytes back into an object.

    数据集(Dataset)类似于RDD,但它不使用Java序列化或Kryo,而是使用专门的编码器(Encoder)来序列化对象,以便进行处理或在网络上传输。虽然编码器和标准序列化都负责把对象转换成字节,但编码器是动态生成的代码,其使用的格式允许Spark在不把字节反序列化回对象的情况下,直接执行过滤、排序和散列等许多操作。

    ----------------------------------------------------------------

    scala> val df1=spark.sql("select distinct platid,dateid  from account where dateid>=20180601" )
    df1: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]


    scala> val ds=df1.toDF
    ds: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]

    scala> mysqldf.where("dateid>20180601").count()

    res36: Long = 2249

    scala> mysqldf.filter("dateid>20180601").count()

    res37: Long = 2249

    scala> mysqldf.apply("accid")
    res38: org.apache.spark.sql.Column = accid

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid")).show 顺序

    scala> mysqldf.filter("dateid>20180601").orderBy(-mysqldf("dateid")).show  逆序

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid").desc).show 逆序

    scala> mysqldf.groupBy("platid").agg(max("dateid"),min("dateid")).show(2)
    +------+-----------+-----------+
    |platid|max(dateid)|min(dateid)|
    +------+-----------+-----------+
    |    27|   20180619|   20170906|
    |    93|   20180615|   20180314|
    +------+-----------+-----------+
    only showing top 2 rows

  • 相关阅读:
    gitlab的数据库磁盘坏了已经没有办法恢复情况下如何恢复git上的代码
    psql: FATAL: the database system is in recovery mode
    k8s 下 jenkins 分布式部署:利用pipeline动态增加slave节点
    pipeline 流水线:持续部署(docker)-企业微信群通知消息
    查看私有仓库镜像的版本列表
    MyBatis与Hibernate比较
    MyBatis与JDBC的对比
    Java_Ant详解(转载)
    IntelliJ Idea 常用快捷键列表
    隔行换色
  • 原文地址:https://www.cnblogs.com/playforever/p/9204045.html
Copyright © 2020-2023  润新知