package com.xh.movies
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
import org.apache.log4j.{Level,Logger}
* Created by ssss on 3/11/2017.
* TODO: understand the relationship between Dataset and RDD
* The occupations data set is small, so it should be broadcast.
* A production environment should use Parquet, but Parquet files are not easy for a user to read directly.
* Here we use 4 files below
* 1,"ratings.dat": UserID::MovieID::Rating::Timestamp ///For each threat, you should assign ratings of risk impacts for each asset
* 2,"users.dat": UserID::Gender::Age::OccupationID::Zip-code
* 3,"movies.dat": MovieID::Title::Genres
* 4,"occupations.dat":OccupationID::OccupationName
object MovieReviewsSystemUserBehaviorAnalysis {
def main(args: Array[String]): Unit = {
Logger.getLogger("org").setLevel(Level.ERROR) //only print error log
var masterUrl = "local[4]" //4 cores default the process run my laptop for study and testing
var dataPath = "data/medium/" //here is where the data saved
* In production the job is submitted as a jar,
* so the master URL and the data path can be overridden from the command-line arguments.
if (args.length > 1 ){ //here the spark submit need 2 param in prod env at least
masterUrl = args(0)
dataPath = args(1)
// A SparkContext is mandatory: build the configuration and create the context
val conf = new SparkConf().setMaster(masterUrl).setAppName("userAnalysis")
val sscc = new SparkContext(conf)
/** 0001
* Once the SparkContext exists, read the data from the local file system or from Hadoop.
* Here each file is read as an RDD of text lines.
val usersRDD: RDD[String] = sscc.textFile(dataPath + "users.dat")
val moviesRDD: RDD[String] = sscc.textFile(dataPath + "movies.dat")
val occupationsRDD: RDD[String] = sscc.textFile(dataPath + "occupations.dat")
val ratingsRDD: RDD[String] = sscc.textFile(dataPath + "ratings.dat")
* Get the user info for a given movie ID
val basicUserRDD ="::")).map{
user =>(user(3),(user(0),user(1),user(2)))
val occpuation ="::")).map(job => (job(0),job(1)))
val userInfo = basicUserRDD.join(occpuation)
//userInfo.collect().foreach(println) //result --> occuption id user info ,occupation name (4,((2175,M,25),college/grad student)) works
//ratings.dat": UserID::MovieID
val targetMovie ="::")).map(x => (x(0),x(1))).filter(_._2.equals("1139"))
val targetUser = =>((x._2._1._1),x._2))
val finalInfo = targetMovie.join(targetUser)
//finalInfo.collect().foreach(println) //result (3518,(1139,((3518,F,18),executive/managerial)))
/** 0002
* Get the top-rated movies (by average rating) from the ratings file, using reduceByKey on (key, value) pairs
* "ratings.dat": UserID::MovieID::Rating::Timestamp
val populateRDD ="::")).map(x => (x(1),x(0),x(2)))
val getTotalPoint = => (x._1,(x._3.toInt,1))) // get the (key ,value ) tuple
.reduceByKey((v1, v2) =>( v1._1+v2._1, v1._2+v2._2)) // operation to reduce // here is so complex and so clever get the total point and total people each movie
.map(x=>(x._2._1 / x._2._2,x._1)) //get the average per people per movie
.sortByKey(false).map(x =>(x._2,x._1)) // desc
.take(15) //get top 10
//getTotalPoint.collect().foreach(println) // result ---> (2329,(1798027,640))
//getTotalPoint.foreach(println ) //(3607,5)
* Get the most popular movies, i.e. the movies rated by the most people, and take the top 15
val mustPopulte ="::")).map(x => (x(1),1)).reduceByKey(_+_).sortBy(_._2,false).take(15)
mustPopulte.foreach(println) //(2858,3428) (260,2991)
* Calculate the top movies by gender. Gender is not in the ratings file, so we need to join with the users table
* and then aggregate.
* Shuffle is the killer in a distributed system, whereas a map-side join does not cause a shuffle.
* A map-side join is typically done by broadcasting the small table or file.
* 1,"ratings.dat": UserID::MovieID::Rating::Timestamp
* 2,"users.dat": UserID::Gender::Age::OccupationID::Zip-code
//first create a wide table from rating and users
val spiltUser"::")).map(x=>(x(0),x(1),x(2),x(3)))
val spiltRating ="::")).map(x=>(x(0),x(1),x(2),x(3)))
val wdie = x=> (x._1,x._2))
val genderWideInfo => (x._1,(x._1,x._2,x._3))).join(wdie)
//genderWideInfo.take(10).foreach(println) (2828,((2828,3948,5),M))
val maleWideInfo = genderWideInfo.filter(x => x._2._2.equals("M")).map(x => (x._2._1._2,(x._2._1._3.toDouble,1))) //moveid ,rating point people number
.reduceByKey((v1, v2) =>( v1._1+v2._1, v1._2+v2._2))
.map(x=>(x._2._1.toDouble / x._2._2,x._1))
.sortByKey(false).map(x =>(x._2,x._1)) // desc
.take(15) //get top 10
val fmaleWideInfo = genderWideInfo.filter(x => x._2._2.equals("F")).map(x => (x._2._1._2,(x._2._1._3.toDouble,1))) //moveid ,rating point people number
.reduceByKey((v1, v2) =>( v1._1+v2._1, v1._2+v2._2))
.map(x=>(x._2._1.toDouble / x._2._2,x._1))
.sortByKey(false).map(x =>(x._2,x._1)) // desc
.take(15) //get top 10
// Get the top-N movies per age group (e.g. 20-29, 30-39, and so on); this needs a filter (or group-by) on age
// Age has already been bucketed by ETL into the codes listed below
* - Age is chosen from the following ranges:
* 1: "Under 18"
* 18: "18-24"
* 25: "25-34"
* 35: "35-44"
* 45: "45-49"
* 50: "50-55"
* 56: "56+"
* */
//get userid and user age
val userAgeInterval ="::")).map(x=> (x(0),x(2))).filter(_._2.equals("25")) //"users.dat": UserID::Gender::Age
// Assume the age-filtered user table is small, so broadcast it: driver data --> executors (one or many tasks per executor)
// Since the data has already been filtered by age, only the user ID is needed
val finalUserIdSet = mutable.HashSet() ++ // here should be use 2 ++
val finalUserIdSetBroadCast = sscc.broadcast(finalUserIdSet) // here define how to broadcast data set
//"ratings.dat": UserID::MovieID::Rating::Timestamp
//"movies.dat": MovieID::Title
val movieID2Nmae ="::")).map(x=> (x(0),x(1))).collect().toMap
val finalUserInfos ="::")).map(x => (x(0),x(1))).filter(x=> finalUserIdSetBroadCast.value.contains(x._1))
.map( x => (x._2,1)) // movied and calculate
.reduceByKey(_+_).sortBy(_._2,false).take(19) //false mean desc (2858,1334)
.map(x => (movieID2Nmae.getOrElse(x._1,null),x._2)) //(Men in Black (1997),971)
println("top n by age: ")
finalUserInfos.foreach(println )
while (true){} //using to check status by web