• spark 练习


    scala> import org.apache.spark.SparkContext

    import org.apache.spark.SparkContext

    scala> import org.apache.spark.SparkConf

    import org.apache.spark.SparkConf

    scala> import org.apache.spark.sql.SQLContext

    import org.apache.spark.sql.SQLContext

    scala> import spark.implicits._

    import spark.implicits._

    scala> val mysqlcon=new SQLContext(sc)
    warning: there was one deprecation warning; re-run with -deprecation for details
    mysqlcon: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@3ac76ad9

    scala> val mysqldf=mysqlcon.read.format("jdbc").options(Map("url"->"jdbc:mysql://localhost:3306/test","user"->"root","password"->"root","dbtable"->"Account_3004")).load()
    mysqldf: org.apache.spark.sql.DataFrame = [AccountName: string, Accid: bigint ... 30 more fields]

    scala> mysqldf.printSchema
    root
     |-- AccountName: string (nullable = false)
     |-- Accid: long (nullable = false)
     |-- platid: integer (nullable = false)
     |-- DateID: integer (nullable = false)
     |-- CreateTime: timestamp (nullable = false)
     |-- Retention1: integer (nullable = false)
     |-- Retention2: integer (nullable = true)
     |-- Retention3: integer (nullable = true)
     |-- Retention4: integer (nullable = true)
     |-- Retention5: integer (nullable = true)
     |-- Retention6: integer (nullable = true)
     |-- Retention7: integer (nullable = true)
     |-- Retention10: integer (nullable = true)
     |-- Retention14: integer (nullable = true)
     |-- Retention21: integer (nullable = true)
     |-- Retention30: integer (nullable = true)
     |-- GameID: integer (nullable = false)
     |-- id: long (nullable = false)
     |-- adcode: string (nullable = true)
     |-- AddRMB1: double (nullable = true)
     |-- AddRMB2: double (nullable = true)
     |-- AddRMB3: double (nullable = true)
     |-- AddRMB4: double (nullable = true)
     |-- AddRMB5: double (nullable = true)
     |-- AddRMB6: double (nullable = true)
     |-- AddRMB7: double (nullable = true)
     |-- AddRMB10: double (nullable = true)
     |-- AddRMB14: double (nullable = true)
     |-- AddRMB21: double (nullable = true)
     |-- AddRMB30: double (nullable = true)
     |-- LoginTimes: integer (nullable = true)
     |-- LoginMinutes: integer (nullable = true)


    scala> mysqldf.count()
    res2: Long = 76813

    scala> mysqldf.show(2)
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |AccountName|   Accid|platid|  DateID|         CreateTime|Retention1|Retention2|Retention3|Retention4|Retention5|Retention6|Retention7|Retention10|Retention14|Retention21|Retention30|GameID|   id|adcode|AddRMB1|AddRMB2|AddRMB3|AddRMB4|AddRMB5|AddRMB6|AddRMB7|AddRMB10|AddRMB14|AddRMB21|AddRMB30|LoginTimes|LoginMinutes|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |           | 1004210|     6|20180116|2018-01-16 10:39:50|         1|         0|         0|         0|         0|         0|         0|          0|          0|          0|          0|  3004|22438|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         1|           7|
    |           |20946754|     0|20170913|2017-09-13 10:02:37|         1|         0|         0|         1|         0|         0|         0|          0|          0|          0|          0|  3004|  167|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         3|         219|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    only showing top 2 rows

    scala> mysqldf.select("accid").show(2)
    +--------+
    |   accid|
    +--------+
    |20964769|
    |22235886|
    +--------+
    only showing top 2 rows


    scala> mysqldf.select("accid","platid").show(2)
    +--------+------+
    |   accid|platid|
    +--------+------+
    | 1004210|     6|
    |20946754|     0|
    +--------+------+
    only showing top 2 rows


    scala> mysqldf.filter($"dateid">20180510).count
    res9: Long = 5101

    scala> mysqldf.select($"accid",$"platid"+1000).show(2)
    +--------+---------------+
    |   accid|(platid + 1000)|
    +--------+---------------+
    | 1004210|           1006|
    |20946754|           1000|
    +--------+---------------+
    only showing top 2 rows

    scala> mysqldf.groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27| 7157|
    |    93|   44|
    |   291|   10|
    |     1| 8503|
    |    13|  290|
    |     6| 4765|
    |     3| 3281|
    |   295|    2|
    | 10000|    1|
    |   191|  758|
    | 24294|    9|
    |    19| 1549|
    |    15| 8838|
    |    17|    6|
    |     9|  365|
    |   286|    1|
    |    35| 4075|
    |     4|10395|
    |   247|    1|
    |   277|  453|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.filter($"dateid">20180520).groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27|  131|
    |    93|   14|
    |   291|    2|
    |     1|  333|
    |    13|   25|
    |     6|  116|
    |     3|   36|
    |   191|  136|
    | 24294|    2|
    |    19|   39|
    |    15|  978|
    |     9|    2|
    |    35|   72|
    |     4|  161|
    |   277|   11|
    |    50|    8|
    |    38|    4|
    |   289|   12|
    |    21|   24|
    |    60|   75|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.createOrReplaceTempView("account")


    scala> val sqldf=spark.sql("select platid,accid,dateid  from account where dateid>=20180601" )
    sqldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]


    scala> sqldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    ------------------------------------

    Spark SQL 中的临时视图是会话级别的,如果创建它的会话终止,视图就会消失。如果希望拥有一个在所有会话之间共享、并在 Spark 应用程序终止之前一直保持活动状态的临时视图,可以创建全局临时视图。全局临时视图与系统保留的数据库 global_temp 绑定,必须使用限定名称来引用它,例如 SELECT * FROM global_temp.view1。

    --------------------------------------

    scala> mysqldf.createOrReplaceGlobalTempView("tb_acc")

    scala> val globaldf=spark.sql("select platid,accid,dateid  from global_temp.tb_acc where dateid>=20180601" )
    globaldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]

    scala> globaldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    --------------------------

    Datasets are similar to RDDs, however, instead of using Java serialization or Kryo they use a specialized Encoder to serialize the objects for processing or transmitting over the network. While both encoders and standard serialization are responsible for turning an object into bytes, encoders are code generated dynamically and use a format that allows Spark to perform many operations like filtering, sorting and hashing without deserializing the bytes back into an object.

    数据集(Dataset)与 RDD 类似,但它不使用 Java 序列化或 Kryo,而是使用专门的编码器(Encoder)来序列化对象,以便进行处理或在网络上传输。虽然编码器和标准序列化都负责把对象转换成字节,但编码器是动态生成的代码,其使用的格式允许 Spark 在不将字节反序列化回对象的情况下执行许多操作,例如过滤、排序和散列。

    ----------------------------------------------------------------

    scala> val df1=spark.sql("select distinct platid,dateid  from account where dateid>=20180601" )
    df1: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]


    scala> val ds=df1.toDF
    ds: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]

    scala> mysqldf.where("dateid>20180601").count()

    res36: Long = 2249

    scala> mysqldf.filter("dateid>20180601").count()

    res37: Long = 2249

    scala> mysqldf.apply("accid")
    res38: org.apache.spark.sql.Column = accid

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid")).show 顺序

     scala> mysqldf.filter("dateid>20180601").orderBy(-mysqldf("dateid")).show  逆序

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid").desc).show 逆序

    scala> mysqldf.groupBy("platid").agg(max("dateid"),min("dateid")).show(2)
    +------+-----------+-----------+
    |platid|max(dateid)|min(dateid)|
    +------+-----------+-----------+
    |    27|   20180619|   20170906|
    |    93|   20180615|   20180314|
    +------+-----------+-----------+
    only showing top 2 rows

  • 相关阅读:
    noip2011 总结
    noip2010 总结
    noip2009 总结
    noip2008 总结
    noip2006总结
    noip2007 总结
    noip2006 总结
    【模板】线段树模板
    【数学】8.30题解-count数页码
    【数论】8.30题解-prime素数密度 洛谷p1835
  • 原文地址:https://www.cnblogs.com/playforever/p/9204045.html
Copyright © 2020-2023  润新知