一:Sparksql列操作
初始化SparkContext及数据:
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions}
import org.apache.spark.sql.functions.{col, desc, length, row_number, trim, when}
import org.apache.spark.sql.functions.{countDistinct,sum,count,avg,lit}
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
import java.util.ArrayList
object WordCount {
def initSparkAndData() : DataFrame = {
val sparkSession= SparkSession.builder().master("local").appName("AppName").getOrCreate()
val javasc = new JavaSparkContext(sparkSession.sparkContext)
val nameRDD = javasc.parallelize(Arrays.asList("{'name':'wangwu','age':'18','vip':'t'}",
"{'name':'sunliu','age':'19','vip':'t'}","{'name':'zhangsan','age':'20','vip':'f'}"));
val namedf = sparkSession.read.json(nameRDD)
namedf
}
}
增加列
val data = initSparkAndData()
//方法一:可以添加常量值
data.select(when(col("name").isNotNull,1).otherwise(0) as "usergroup").show(100)
//方法二: 只能已经存在的列操作
data.withColumn("time", concat(col("age"),col("name")) ).show(100)
data.withColumn("time", lit("aa")).show(100) //常量
删除列
val data = initSparkAndData()
data.drop("vip").show(100)
二:SparkSql 窗口函数
传统数据库中就有这个函数,就是partation by () order by ()。那下面让我们看看sparksql中怎么写:
val data = initSparkAndData()
data.withColumn("isVsip", row_number().over(Window.partitionBy(col("vip")).orderBy(desc("name")))).show(100)
上面的意思是按照VIP分组,后按照name排序,作为新的列isVsip。项目中用来作为提取最新记录的函数,举例如下:
统计出用户最近登录记录:
val sparkSession= SparkSession.builder().master("local").appName("AppName").getOrCreate()
val javasc = new JavaSparkContext(sparkSession.sparkContext)
val nameRDD1 = javasc.parallelize(Arrays.asList("{'name':'wangwu','time':'2019-08-12'}",
"{'name':'sunliu','time':'2019-08-13'}","{'name':'zhangsan','time':'2019-08-14'}"));
val namedf1 = sparkSession.read.json(nameRDD1)
val nameRDD2 = javasc.parallelize(Arrays.asList("{'name':'wangwu','time':'2019-09-12'}",
"{'name':'sunliu','time':'2019-08-13'}","{'name':'zhangsan','time':'2019-07-14'}"));
val namedf2 = sparkSession.read.json(nameRDD2)
//上面全是构造数据。
namedf1.union(namedf2).withColumn("max_time", row_number().over(Window.partitionBy(col("name")).orderBy(desc("time"))))
.filter(col("max_time") ===1).show(100)
三:Sparksql join操作
初始化SparkContext及数据:
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions}
import org.apache.spark.sql.functions.{col, desc, length, row_number, trim, when}
import org.apache.spark.sql.functions.{countDistinct,sum,count,avg}
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
import java.util.ArrayList
object WordCount {
def joinTestData() = {
val sparkSession= SparkSession.builder().master("local").appName("AppName").getOrCreate()
val javasc = new JavaSparkContext(sparkSession.sparkContext)
val nameRDD = javasc.parallelize(Arrays.asList("{'name':'zhangsan','age':'18','sex':'N'}", "{'name':'lisi','age':'19','sex':'F'}","{'':'','':'','':''}"));
val nameRDD1 = javasc.parallelize(Arrays.asList("{'name':'wangwu','age':'18','vip':'t'}", "{'name':'sunliu','age':'19','vip':'t'}","{'name':'zhangsan','age':'18','vip':'f'}"));
val data1 = sparkSession.read.json(nameRDD)
val data2 = sparkSession.read.json(nameRDD1)
(data1,data2)
}
}
left、leftouter、left_outer三者相同
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val left = data1.join(data2,data1("name") === data2("name") ,"left").show(100)
结果:
age | name | sex | age | name | vip |
null | null | null | null | null | null |
18 | zhangsan | N | 18 | zhangsan | f |
19 | lisi | f | null | null | null |
right、rightouter、right_outer三者相同
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val right = data1.join(data2,data1("name") === data2("name") ,"right").show(100)
结果:
age | name | sex | age | name | vip |
null | null | null | 18 | wangwu | t |
18 | zhangsan | N | 18 | zhangsan | f |
null | null | null | sunliu | t |
cross、inner两者相同
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val right = data1.join(data2,data1("name") === data2("name") ,"right").show(100)
结果:
age | name | sex | age | name | vip |
18 | zhangsan | N | 18 | zhangsan | f |
full、fullouter、full_outer、outer四者相同
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val full = data1.join(data2,data1("name") === data2("name") ,"full").show(100)
结果:
age | name | sex | age | name | vip |
null | null | null | 18 | wangwu | t |
null | null | null | null | null | null |
18 | zhangsan | N | 18 | zhangsan | f |
null | null | null | 19 | sunliu | t |
19 | lisi | F | null | null | null |
leftsemi(innerjoin之后只保留左边的)
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val leftsemi = data1.join(data2,data1("name") === data2("name") ,"leftsemi").show(100)
真正在项目中的使用:项目中有一张大表,主键是用户ID,里面有用户所有基本信息。项目使用过程中一般要求关联大表取得所有基本信息,leftsemi一般用于缩减大表。
结果:
age | name | sex |
18 | zhangsan | N |
leftanti(innerjoin之后去除能关联上之外的)
val dataTuple = joinTestData()
val data1 = dataTuple._1
val data2 = dataTuple._2
val leftouter = data1.join(data2,data1("name") === data2("name") ,"leftanti").show(100)
结果:
age | name | sex |
null | null | null |
19 | lisi | F |