一、程序
1 package sparklearning
2
3 import org.apache.log4j.Logger
4 import org.apache.spark.SparkConf
5 import org.apache.spark.SparkContext
6 import org.apache.spark.sql.SQLContext
7 import org.apache.spark.storage.StorageLevel
8 import org.apache.log4j.Level
9
/**
 * Spark SQL example: loads on-line trade data from local text files, exposes
 * it as temporary tables, and prints how many trade records are dated in 2016.
 */
object OnLineTradeStatistics {

  /**
   * One record of the user file (six space-separated fields per line).
   *
   * The field spelling `provice` is kept as-is because it is part of this
   * class's public interface.
   */
  case class User(
      userID: String,
      gender: String,
      age: Int,
      registerDate: String,
      provice: String,
      career: String)

  /** One record of the trade detail file (five space-separated fields per line). */
  case class TradeDetail(
      tradeID: String,
      tradeDate: String,
      productID: Int,
      amount: Int,
      userID: String)

  /**
   * Program entry point.
   *
   * Reads the two input files from fixed local paths, registers them as the
   * temporary tables `user` and `trade`, and prints the number of rows in
   * `trade` whose `tradeDate` starts with "2016".
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Suppress unnecessary log output.
    Logger.getLogger("org.apache.hadoop").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Configure the application.
    val conf = new SparkConf().setAppName("On Line Trade Data").setMaster("local")
    val ctx = new SparkContext(conf)

    // Always stop the context, even when loading or querying fails, so the
    // resources it holds are released.
    try {
      val sqlCtx = new SQLContext(ctx)
      import sqlCtx.implicits._

      // Read the files: RDD --> DataFrame.
      // A line with fewer fields than expected, or a non-numeric value in an
      // Int column, makes the corresponding map function throw.
      val userDF = ctx
        .textFile("/home/hadoop/data/on_line_trade_user.txt")
        .map(_.split(" "))
        .map(u => User(u(0), u(1), u(2).toInt, u(3), u(4), u(5)))
        .toDF()
      userDF.registerTempTable("user")
      userDF.persist(StorageLevel.MEMORY_ONLY_SER)
      // NOTE(review): no query in this block reads the "user" table.

      val tradeDF = ctx
        .textFile("/home/hadoop/data/on_line_trade_detail.txt")
        .map(_.split(" "))
        .map(u => TradeDetail(u(0), u(1), u(2).toInt, u(3).toInt, u(4)))
        .toDF()
      tradeDF.registerTempTable("trade") // create the temporary table
      tradeDF.persist(StorageLevel.MEMORY_ONLY_SER)

      val countOfTrade2016 =
        sqlCtx.sql("SELECT * FROM trade where tradeDate like '2016%'").count()
      // NOTE(review): the label says "total money" but the value printed is a
      // row count; confirm whether SUM(amount) was intended. The output text
      // is left unchanged here.
      println("2016 total money: " + countOfTrade2016)
    } finally {
      ctx.stop()
    }
  }
}
二、结果