• spark_Sorting, aggregation, partitioning, random sampling and hashing (partition changes)


    Sorting

        val conf = new SparkConf().setMaster("local").setAppName("sort")
        val sc = new SparkContext(conf)
        // Requirement: compute each site's PV and UV from the data and show only the top 3 // use three partitions
        // Data format: 199.111.148.214    重庆    2018-11-12    1542011088714    6755235587059844279    www.taobao.com    Comment
        val fileRDD = sc.textFile("data/bigdata-spark_data_pvuvdata.txt", 3)
        val mapRDD = fileRDD.map(line => (line.split("\t")(5), 1))
        //www.taobao.com 1
        val wordCount = mapRDD.reduceByKey(_ + _)
        //www.taobao.com n
        val fanzhuan = wordCount.map(x => {
          (x._2, x._1)
        })
        //n www.taobao.com
        val sortKey = fanzhuan.sortByKey(false)
        val wordByTop5 = sortKey.map(_.swap) 
        // _.swap swaps the tuple elements, equivalent to x => (x._2, x._1)
        val tuples = wordByTop5.take(3)
        tuples.foreach(println)
        /**
         * You can also use top() instead of sorting; see the sketch after this comment block.
         * (www.taobao.com,18771)
         * (www.mi.com,18728)
         * (www.baidu.com,18636)
         */
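
    A minimal sketch of that top() alternative (same wordCount RDD as above; the Ordering on the count is written out explicitly): top() pulls the largest elements to the driver without a full sortByKey, and sortBy can stand in for the manual swap + sortByKey + swap sequence.

        // take the 3 sites with the largest counts, ordering (site, count) pairs by count
        val top3 = wordCount.top(3)(Ordering.by[(String, Int), Int](_._2))
        top3.foreach(println)

        // the same top 3 via sortBy instead of the swap/sortByKey/swap chain
        wordCount.sortBy(_._2, ascending = false).take(3).foreach(println)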





    // Requirement: count UV per site, deduplicating by IP
    val mapList = fileRDD.map(line => {
      val split = line.split("\t")
      (split(5), split(0))
    })

    val disWords = mapList.distinct()
    // www.taobao.com 199.111.148.214
    val words = disWords.map(info => (info._1,1))
    //www.taobao.com 1
    val wordsCounts = words.reduceByKey(_+_)
    val tuples = wordsCounts.sortBy(_._2, false).take(5)

    /**
    * (www.taobao.com,15791)
    * (www.mi.com,15769)
    * (www.gome.com.cn,15740)
    * (www.dangdang.com,15690)
    * (www.baidu.com,15653)
    */
    tuples.foreach(println)
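
    A hedged sketch of a single-shuffle variant: distinct() followed by reduceByKey shuffles the data twice. Assuming the number of distinct IPs per site fits in executor memory, aggregateByKey can collect a Set of IPs for each site in one shuffle and then count it (mapList is the (site, ip) RDD from above).

        // one shuffle: build the set of distinct IPs per site, then take its size
        val uv = mapList
          .aggregateByKey(Set.empty[String])(
            (ips, ip) => ips + ip, // within a partition: add the IP to the set
            (a, b) => a ++ b       // across partitions: union the partial sets
          )
          .mapValues(_.size)
        uv.sortBy(_._2, false).take(5).foreach(println)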
     

    Aggregation

        val conf = new SparkConf().setAppName("aggregation").setMaster("local")
        val sc = new SparkContext(conf)
        sc.setLogLevel("ERROR")
    
    
        val data = sc.parallelize(List(
          ("zhangsan",111),
          ("zhangsan",222),
          ("zhangsan",333),
          ("lisi",444),
          ("lisi",555),
          ("lisi",666),
          ("wangwu",777)
        ))
    
        val list02 = data.groupByKey()
        list02.foreach(println)
        /**
         * combineByKeyWithClassTag
         * (zhangsan,CompactBuffer(111, 222, 333))
         * (wangwu,CompactBuffer(777))
         * (lisi,CompactBuffer(444, 555, 666))
         */
    
    
        // Row-to-column conversion: when one record expands into many, use flatMap
        val list3 = list02.flatMap(x=> x._2.map(e => (x._1, e)).iterator)
        list3.foreach(println)
        /**
         * (zhangsan,111)
         * (zhangsan,222)
         * (zhangsan,333)
         * (wangwu,777)
         * (lisi,444)
         * (lisi,555)
         * (lisi,666)
         */
        val list04 = list02.flatMapValues(e => e.iterator)
        list04.foreach(println)
        /**
         * 1. flatMapValues pairs the key back on automatically  2. the .iterator call can be omitted
         * (zhangsan,111)
         * (zhangsan,222)
         * (zhangsan,333)
         * (wangwu,777)
         * (lisi,444)
         * (lisi,555)
         * (lisi,666)
         */
        println("- - - - - - - -- - -")
        // take the first two values for each key
        list02.mapValues(e => e.toList.sorted.take(2)).foreach(println)
        /**
         * (zhangsan,List(111, 222))
         * (wangwu,List(777))
         * (lisi,List(444, 555))
         */
        list02.flatMapValues(e=> e.toList.sorted.take(2)).foreach(println)
        /**
         * (zhangsan,111)
         * (zhangsan,222)
         * (wangwu,777)
         * (lisi,444)
         * (lisi,555)
         */
    
        println("- - - - - - - -")
        val sum = data.reduceByKey(_+_)
        sum.foreach(println)
    
        /**
         * (zhangsan,666)
         * (wangwu,777)
         * (lisi,1665)
         */
        println("sum- - - - - - - -")
        val max = data.reduceByKey((up, down) => if(up > down) up else down)
        max.foreach(println)
    
        /**
         * (zhangsan,333)
         * (wangwu,777)
         * (lisi,666)
         */
        println("max- - - - - - - -")
        val min = data.reduceByKey((up, down) => if(up < down) up else down)
        min.foreach(println)
    
        /**
         * (zhangsan,111)
         * (wangwu,777)
         * (lisi,444)
         */
        println("min- - - - - - - -")
        val count = data.mapValues(e => 1).reduceByKey(_+_)
        count.foreach(println)
    
        /**
         * (zhangsan,3)
         * (wangwu,1)
         * (lisi,3)
         */
        println("count- - - - - - - -")
        val tmp = sum.join(count)
        tmp.foreach(println)
    
        /**
         * (zhangsan,(666,3))
         * (wangwu,(777,1))
         * (lisi,(1665,3))
         */
        println("tmp- - - - - - - -")
        val avg = tmp.mapValues(e => e._1/e._2) // the data is pulled (shuffled) twice and computed more than once
        avg.foreach(println)
    
        /**
         * (zhangsan,222)
         * (wangwu,777)
         * (lisi,555)
         */
    
        // Optimization -> pull the data once and compute it once
        val tmpx = data.combineByKey(
          /**
           * Signature:
           * createCombiner: V => C,
           * mergeValue: (C, V) => C,
           * mergeCombiners: (C, C) => C,
           */
          // how the first value seen for a key is put into the hash map: (sum, count)
          (value: Int) => (value, 1),
          // how the second and subsequent values for that key are folded into the hash map
          (oldValue: (Int, Int), newValue: Int) => (oldValue._1 + newValue, oldValue._2 + 1),
          // how spilled / per-partition combiners are merged (sums add to sums, counts to counts)
          (v1: (Int, Int), v2: (Int, Int)) => (v1._1 + v2._1, v1._2 + v2._2)
        )
    
        /**
         * (zhangsan,(666,3))
         * (wangwu,(777,1))
         * (lisi,(1665,3))
         */
        tmpx.foreach(println)
        println("- - - - - - -- - -  -- - -")
        tmpx.mapValues(e => e._1 / e._2).foreach(println)
    
        /**
         * (zhangsan,222)
         * (wangwu,777)
         * (lisi,555)
         */
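
    The same single-shuffle average can also be written with the operators already used above, by pairing each value with a count of 1 before reduceByKey; a minimal sketch assuming the same data RDD:

        // (v) -> (v, 1), then add up values and counts together in one reduceByKey
        val avgOnce = data
          .mapValues(v => (v, 1))
          .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
          .mapValues(e => e._1 / e._2)
        avgOnce.foreach(println) // (zhangsan,222) (wangwu,777) (lisi,555)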

    Partitioning

        val conf = new SparkConf().setMaster("local").setAppName("partitions")
        val sc = new SparkContext(conf)
        sc.setLogLevel("ERROR")
    
        val data = sc.parallelize(1 to 4, 2)
        val sqlInfo = data.map(value =>{
          println("------conn--mysql----")
          println(s"-----select $value-----")
          println("-----close--mysql------")
          value + "selected"
        })
        sqlInfo.foreach(println)
    
        /**
         * Problem: it connects to MySQL once for every record
         * ------conn--mysql----
         * -----select 1-----
         * -----close--mysql------
         * 1selected
         * ------conn--mysql----
         * -----select 2-----
         * -----close--mysql------
         * 2selected
         * ------conn--mysql----
         * -----select 3-----
         * -----close--mysql------
         * 3selected
         * ------conn--mysql----
         * -----select 4-----
         * -----close--mysql------
         * 4selected
         */
    
    
    
        import scala.collection.mutable.ListBuffer

        val sqlInfo = data.mapPartitionsWithIndex(
          (p_index, p_iter) => {
            val lb = new ListBuffer[String]
            println(s"--$p_index----conn--mysql----")
            while (p_iter.hasNext){
              val value = p_iter.next()
              println(s"-----select $value-----")
              lb += (value + "select")
            }
            println(s"--$p_index-----close--mysql------")
            lb.iterator
          }
        )
        sqlInfo.foreach(println)
    
        /**
         * Problem: although this cuts the number of MySQL connections, it brings a new one: val lb = new ListBuffer[String] buffers the whole partition and can blow the heap (OOM)
         * ------conn--mysql----
         * -----select 1-----
         * -----select 2-----
         * -----close--mysql------
         * 1select
         * 2select
         * ------conn--mysql----
         * -----select 3-----
         * -----select 4-----
         * -----close--mysql------
         * 3select
         * 4select
         */
    
    
    
        // Fixing the OOM risk of val lb = new ListBuffer[String]
        // 1. Write to a file: writing to a file is not how a senior architect would think; we want to avoid the extra I/O
        // 2. Use the iterator pattern: nest iterators so data is never buffered in the middle, preventing OOM
        val sqlInfo = data.mapPartitionsWithIndex(
          (p_index, p_iter) => {

            // Follow the idea behind map / flatMap: map is one-in-one-out, flatMap is one-in-many-out
            // (their parent type is itself an iterator, so mimic flatMap's source and return a nested Iterator)
            new Iterator[String] {
              println(s"--$p_index----conn--mysql----")
              override def hasNext = if(p_iter.hasNext == false){
                println(s"--$p_index-----close--mysql------")
                false
              }else true
    
              override def next() = {
                val value = p_iter.next()
                println(s"-----select $value-----")
                value + "selected"
              }
            }
          }
        )
        sqlInfo.foreach(println)
    
        /**
         * Pipeline mode: each record is processed as soon as it arrives, which is fast, takes no extra memory, and each partition connects to MySQL only once
         * --0----conn--mysql----
         * -----select 1-----
         * 1selected
         * -----select 2-----
         * 2selected
         * --0-----close--mysql------
         * --1----conn--mysql----
         * -----select 3-----
         * 3selected
         * -----select 4-----
         * 4selected
         * --1-----close--mysql------
         */

    Random sampling and hashing (partition changes)

        val conf = new SparkConf().setAppName("gaoji").setMaster("local")
        val sc = new SparkContext(conf)
        sc.setLogLevel("ERROR")
    
        val data = sc.parallelize(1 to 100)
        // sample(withReplacement, fraction, seed): randomly samples elements. withReplacement: whether an element can be drawn more than once; fraction: e.g. 0.1 = roughly 10%; seed: the same seed produces the same sample
        data.sample(false, 0.1).foreach(println)
        println("- - -- - - - -")
    
        data.sample(true, 0.1, 222).foreach(println)
        println("- - -- - - - -")
    
    
        val data1 = sc.parallelize(1 to 100, 5)
        println(s"data:${data1.getNumPartitions}") // 5
        val partitionData = data1.repartition(4) // go from 5 partitions to 4
        println(s"data:${partitionData.getNumPartitions}") // 4
        partitionData.foreach(println)
    
    
    
    
        val data2 = sc.parallelize(1 to 10, 5)
        val info2 = data2.mapPartitionsWithIndex(
          (p_index, p_iter)=>{
            p_iter.map(e => (p_index,e))
          }
        )
        info2.foreach(println)
    
        /**
         * The data has been distributed across the five partitions
         * (0,1)
         * (0,2)
         * (1,3)
         * (1,4)
         * (2,5)
         * (2,6)
         * (3,7)
         * (3,8)
         * (4,9)
         * (4,10)
         */
    
    
    
    
    
        val date3 = info2.repartition(3)
        // repartition triggers a shuffle whether the partition count goes up or down
        // It is implemented as coalesce(numPartitions, shuffle = true). When increasing partitions a shuffle is required
        // (without it the added partitions would stay empty, since no data is recomputed or redistributed); when decreasing
        // partitions the shuffle can be skipped (the removed partitions are merged into the remaining ones) - see the coalesce sketch after this block
        val info4 = date3.mapPartitionsWithIndex(
          (p_index, p_iter)=>{
            p_iter.map(e => (p_index,e))
          }
        )
        info4.foreach(println)
    
        /**
         * You can see the data has been hashed (redistributed) once more
         * (0,(1,4))
         * (0,(3,8))
         * (0,(4,10))
         * (1,(0,1))
         * (1,(2,5))
         * (2,(0,2))
         * (2,(1,3))
         * (2,(2,6))
         * (2,(3,7))
         * (2,(4,9))
         */
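
    A minimal sketch of the coalesce behaviour described above (same sc): shrinking without a shuffle just merges existing partitions, while asking coalesce for more partitions without a shuffle leaves the count unchanged.

        val d = sc.parallelize(1 to 10, 5)
        // 5 -> 2 without a shuffle: partitions are simply merged
        println(d.coalesce(2, shuffle = false).getNumPartitions)  // 2
        // 5 -> 10 without a shuffle: stays at 5, no data gets redistributed
        println(d.coalesce(10, shuffle = false).getNumPartitions) // 5
        // growing really does require a shuffle, which is what repartition(10) does
        println(d.coalesce(10, shuffle = true).getNumPartitions)  // 10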
  • Original article: https://www.cnblogs.com/bigdata-familyMeals/p/14383262.html