• spark 练习


    scala> import org.apache.spark.SparkContext

    import org.apache.spark.SparkContext

    scala> import org.apache.spark.SparkConf

    import org.apache.spark.SparkConf

    scala> import org.apache.spark.sql.SQLContext

    import org.apache.spark.sql.SQLContext

    scala> import spark.implicits._

    import spark.implicits._

    scala> val mysqlcon=new SQLContext(sc)
    warning: there was one deprecation warning; re-run with -deprecation for details
    mysqlcon: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@3ac76ad9

    scala> val mysqldf=mysqlcon.read.format("jdbc").options(Map("url"->"jdbc:mysql://localhost:3306/test","user"->"root","password"->"root","dbtable"->"Account_3004")).load()
    mysqldf: org.apache.spark.sql.DataFrame = [AccountName: string, Accid: bigint ... 30 more fields]

    scala> mysqldf.printSchema
    root
     |-- AccountName: string (nullable = false)
     |-- Accid: long (nullable = false)
     |-- platid: integer (nullable = false)
     |-- DateID: integer (nullable = false)
     |-- CreateTime: timestamp (nullable = false)
     |-- Retention1: integer (nullable = false)
     |-- Retention2: integer (nullable = true)
     |-- Retention3: integer (nullable = true)
     |-- Retention4: integer (nullable = true)
     |-- Retention5: integer (nullable = true)
     |-- Retention6: integer (nullable = true)
     |-- Retention7: integer (nullable = true)
     |-- Retention10: integer (nullable = true)
     |-- Retention14: integer (nullable = true)
     |-- Retention21: integer (nullable = true)
     |-- Retention30: integer (nullable = true)
     |-- GameID: integer (nullable = false)
     |-- id: long (nullable = false)
     |-- adcode: string (nullable = true)
     |-- AddRMB1: double (nullable = true)
     |-- AddRMB2: double (nullable = true)
     |-- AddRMB3: double (nullable = true)
     |-- AddRMB4: double (nullable = true)
     |-- AddRMB5: double (nullable = true)
     |-- AddRMB6: double (nullable = true)
     |-- AddRMB7: double (nullable = true)
     |-- AddRMB10: double (nullable = true)
     |-- AddRMB14: double (nullable = true)
     |-- AddRMB21: double (nullable = true)
     |-- AddRMB30: double (nullable = true)
     |-- LoginTimes: integer (nullable = true)
     |-- LoginMinutes: integer (nullable = true)


    scala> mysqldf.count()
    res2: Long = 76813

    scala> mysqldf.show(2)
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |AccountName|   Accid|platid|  DateID|         CreateTime|Retention1|Retention2|Retention3|Retention4|Retention5|Retention6|Retention7|Retention10|Retention14|Retention21|Retention30|GameID|   id|adcode|AddRMB1|AddRMB2|AddRMB3|AddRMB4|AddRMB5|AddRMB6|AddRMB7|AddRMB10|AddRMB14|AddRMB21|AddRMB30|LoginTimes|LoginMinutes|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |           | 1004210|     6|20180116|2018-01-16 10:39:50|         1|         0|         0|         0|         0|         0|         0|          0|          0|          0|          0|  3004|22438|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         1|           7|
    |           |20946754|     0|20170913|2017-09-13 10:02:37|         1|         0|         0|         1|         0|         0|         0|          0|          0|          0|          0|  3004|  167|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         3|         219|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    only showing top 2 rows

    scala> mysqldf.select("accid").show(2)
    +--------+
    |   accid|
    +--------+
    |20964769|
    |22235886|
    +--------+
    only showing top 2 rows


    scala> mysqldf.select("accid","platid").show(2)
    +--------+------+
    |   accid|platid|
    +--------+------+
    | 1004210|     6|
    |20946754|     0|
    +--------+------+
    only showing top 2 rows


    scala> mysqldf.filter($"dateid">20180510).count
    res9: Long = 5101

    scala> mysqldf.select($"accid",$"platid"+1000).show(2)
    +--------+---------------+
    |   accid|(platid + 1000)|
    +--------+---------------+
    | 1004210|           1006|
    |20946754|           1000|
    +--------+---------------+
    only showing top 2 rows

    scala> mysqldf.groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27| 7157|
    |    93|   44|
    |   291|   10|
    |     1| 8503|
    |    13|  290|
    |     6| 4765|
    |     3| 3281|
    |   295|    2|
    | 10000|    1|
    |   191|  758|
    | 24294|    9|
    |    19| 1549|
    |    15| 8838|
    |    17|    6|
    |     9|  365|
    |   286|    1|
    |    35| 4075|
    |     4|10395|
    |   247|    1|
    |   277|  453|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.filter($"dateid">20180520).groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27|  131|
    |    93|   14|
    |   291|    2|
    |     1|  333|
    |    13|   25|
    |     6|  116|
    |     3|   36|
    |   191|  136|
    | 24294|    2|
    |    19|   39|
    |    15|  978|
    |     9|    2|
    |    35|   72|
    |     4|  161|
    |   277|   11|
    |    50|    8|
    |    38|    4|
    |   289|   12|
    |    21|   24|
    |    60|   75|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.createOrReplaceTempView("account")


    scala> val sqldf=spark.sql("select platid,accid,dateid  from account where dateid>=20180601" )
    sqldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]


    scala> sqldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    ------------------------------------

    Spark SQL 中的临时视图是会话级别的,如果创建它的会话终止,视图就会消失。如果希望拥有一个在所有会话之间共享、并在 Spark 应用程序终止之前一直保持活动状态的临时视图,可以创建全局临时视图。全局临时视图与系统保留的数据库 global_temp 绑定,必须使用限定名称来引用它,例如 SELECT * FROM global_temp.view1。

    --------------------------------------

    scala> mysqldf.createOrReplaceGlobalTempView("tb_acc")

    scala> val globaldf=spark.sql("select platid,accid,dateid  from global_temp.tb_acc where dateid>=20180601" )
    globaldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]

    scala> globaldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    --------------------------

    Datasets are similar to RDDs, however, instead of using Java serialization or Kryo they use a specialized Encoder to serialize the objects for processing or transmitting over the network. While both encoders and standard serialization are responsible for turning an object into bytes, encoders are code generated dynamically and use a format that allows Spark to perform many operations like filtering, sorting and hashing without deserializing the bytes back into an object.

    数据集(Dataset)与 RDD 类似,但它不使用 Java 序列化或 Kryo,而是使用专门的编码器(Encoder)来序列化对象,以便进行处理或在网络上传输。虽然编码器和标准序列化都负责把对象转换成字节,但编码器是动态生成的代码,其使用的格式允许 Spark 在不将字节反序列化回对象的情况下执行许多操作,例如过滤、排序和散列。

    ----------------------------------------------------------------

    scala> val df1=spark.sql("select distinct platid,dateid  from account where dateid>=20180601" )
    df1: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]


    scala> val ds=df1.toDF
    ds: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]

    scala> mysqldf.where("dateid>20180601").count()

    res36: Long = 2249

    scala> mysqldf.filter("dateid>20180601").count()

    res37: Long = 2249

    scala> mysqldf.apply("accid")
    res38: org.apache.spark.sql.Column = accid

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid")).show 顺序

     scala> mysqldf.filter("dateid>20180601").orderBy(-mysqldf("dateid")).show  逆序

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid").desc).show 逆序

    scala> mysqldf.groupBy("platid").agg(max("dateid"),min("dateid")).show(2)
    +------+-----------+-----------+
    |platid|max(dateid)|min(dateid)|
    +------+-----------+-----------+
    |    27|   20180619|   20170906|
    |    93|   20180615|   20180314|
    +------+-----------+-----------+
    only showing top 2 rows

  • 相关阅读:
    noip2011 总结
    noip2010 总结
    noip2009 总结
    noip2008 总结
    noip2006总结
    noip2007 总结
    noip2006 总结
    【模板】线段树模板
    【数学】8.30题解-count数页码
    【数论】8.30题解-prime素数密度 洛谷p1835
  • 原文地址:https://www.cnblogs.com/playforever/p/9204045.html
Copyright © 2020-2023  润新知