• spark热门电影


    
    
    package movies

    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}

    /**
      * Spark batch job that prints "popular movie" reports computed from three
      * `::`-delimited text files: `users.dat`, `movies.dat` and `ratings.dat`.
      *
      * Reports:
      *  1. the 10 movies with the highest average rating, as `(movieId, average)`;
      *  2. the 10 movies rated most often by male users aged 18 to 24, as `(movieId, count)`;
      *  3. the 10 movies rated most often by female users, as `(title, count)`.
      *
      * Usage: `Movice [dataDir [reportId ...]]`.
      * With no arguments the job reads the built-in default directory and prints
      * report 3 only.
      */
    object Movice {

      /** Field separator used by all three input files. */
      private val Separator = "::"

      /** Number of rows printed per report. */
      private val TopN = 10

      /**
        * Default input directory. Forward slashes are used on purpose: inside a
        * regular string literal sequences such as `\d` or `\学` are invalid escapes
        * and `\u` starts a unicode escape, so the backslash form does not compile.
        */
      private val DefaultDataDir = "D:/学习笔记/资料汇总/day02/资料/热门电影的数据"

      /** Report ids understood by [[main]]. */
      private val KnownReports = Set("1", "2", "3")

      /** Reports printed when none are requested on the command line. */
      private val DefaultReports = Set("3")

      /**
        * Entry point.
        *
        * @param args optional: `args(0)` is the directory holding the `.dat` files,
        *             any further arguments are the ids ("1", "2", "3") of the
        *             reports to print
        */
      def main(args: Array[String]): Unit = {
        val dataDir = args.headOption.getOrElse(DefaultDataDir)
        val requested = args.drop(1).toSet
        val reports = if (requested.isEmpty) DefaultReports else requested

        val unknown = reports -- KnownReports
        if (unknown.nonEmpty) {
          Console.err.println(s"Ignoring unknown report id(s): ${unknown.mkString(", ")}")
        }

        val conf = new SparkConf()
          .setAppName(this.getClass.getSimpleName)
          .setMaster("local[1]")
        val sc = new SparkContext(conf)

        // Stop the context even when a job fails, so the local executor is released.
        try {
          val users = parseUsers(readLines(sc, s"$dataDir/users.dat"))
          val movies = parseMovies(readLines(sc, s"$dataDir/movies.dat"))
          val ratings = parseRatings(readLines(sc, s"$dataDir/ratings.dat"))

          if (reports("1")) {
            topRatedMovies(ratings).foreach(println)
          }
          if (reports("2")) {
            mostRatedBy(users, ratings)((gender, age) => gender == "M" && age >= 18 && age <= 24)
              .foreach(println)
          }
          if (reports("3")) {
            val topForWomen = mostRatedBy(users, ratings)((gender, _) => gender == "F")
            withTitles(movies, topForWomen).foreach(println)
          }
        } finally {
          sc.stop()
        }
      }

      /** Reads a text file and drops blank lines, which would otherwise break field parsing. */
      private def readLines(sc: SparkContext, path: String): RDD[String] =
        sc.textFile(path).filter(_.trim.nonEmpty)

      /** Parses rating lines into `(userId, movieId, rating)` from the first three fields. */
      private def parseRatings(lines: RDD[String]): RDD[(Int, Int, Int)] =
        lines.map { line =>
          val fields = line.split(Separator)
          (fields(0).toInt, fields(1).toInt, fields(2).toInt)
        }

      /** Parses user lines into `(userId, (gender, age))` from the first three fields. */
      private def parseUsers(lines: RDD[String]): RDD[(Int, (String, Int))] =
        lines.map { line =>
          val fields = line.split(Separator)
          (fields(0).toInt, (fields(1), fields(2).toInt))
        }

      /**
        * Parses movie lines into `(movieId, title)`.
        *
        * NOTE(review): assumes the first two fields of `movies.dat` are the movie id
        * and the title (the MovieLens layout) - confirm against the actual file.
        */
      private def parseMovies(lines: RDD[String]): RDD[(Int, String)] =
        lines.map { line =>
          val fields = line.split(Separator)
          (fields(0).toInt, fields(1))
        }

      /**
        * Report 1: the `TopN` movies with the highest average rating.
        *
        * Sums and counts are combined with `reduceByKey` instead of materialising
        * every rating per movie with `groupByKey`, and the average is computed as
        * a `Double` so it is not truncated by integer division.
        *
        * @return `(movieId, averageRating)` pairs, best first
        */
      private def topRatedMovies(ratings: RDD[(Int, Int, Int)]): Array[(Int, Double)] =
        ratings
          .map { case (_, movieId, rating) => (movieId, (rating.toLong, 1L)) }
          .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
          .mapValues { case (sum, count) => sum.toDouble / count }
          .sortBy(_._2, ascending = false)
          .take(TopN)

      /**
        * Counts ratings per movie for the users accepted by `keep` and returns the
        * `TopN` most frequently rated movies.
        *
        * Users are filtered before the join so that only the relevant audience is
        * shuffled; for an inner join this yields the same rows as filtering afterwards.
        *
        * @param keep predicate on `(gender, age)` selecting the audience
        * @return `(movieId, ratingCount)` pairs, most rated first
        */
      private def mostRatedBy(
          users: RDD[(Int, (String, Int))],
          ratings: RDD[(Int, Int, Int)]
      )(keep: (String, Int) => Boolean): Array[(Int, Int)] = {
        val audience = users.filter { case (_, (gender, age)) => keep(gender, age) }
        val ratedMovies = ratings.map { case (userId, movieId, _) => (userId, movieId) }

        audience
          .join(ratedMovies)
          .map { case (_, (_, movieId)) => (movieId, 1) }
          .reduceByKey(_ + _)
          .sortBy(_._2, ascending = false)
          .take(TopN)
      }

      /**
        * Replaces movie ids with titles, keeping the order of `counts`.
        * An id without a matching movie record is printed as the id itself.
        */
      private def withTitles(
          movies: RDD[(Int, String)],
          counts: Array[(Int, Int)]
      ): Array[(String, Int)] = {
        val wanted = counts.map(_._1).toSet
        // Only the handful of requested titles is collected to the driver.
        val titles = movies.filter { case (movieId, _) => wanted(movieId) }.collectAsMap()
        counts.map { case (movieId, count) => (titles.getOrElse(movieId, movieId.toString), count) }
      }
    }
     
  • 相关阅读:
    soapUI-DataSource
    Linux安装rpm包时报错Header V3 DSA/SHA1 Signature, key ID 1d1e034b: NOKEY解决办法
    Linux命令之rpm安装命令
    soapUi下载
    Red Hat Linux相关产品iso镜像下载
    RedHat Linux文本模式下乱码解决方法
    telnet到RedHat Linux失败--解决办法
    java list去重
    Java Mybatis 框架入门教程
    【阿里天池云-龙珠计划】薄书的机器学习笔记——K近邻(k-nearest neighbors)初探Task02
  • 原文地址:https://www.cnblogs.com/wangshuang123/p/11078358.html
Copyright © 2020-2023  润新知