Sorting
val conf = new SparkConf().setMaster("local").setAppName("sort") val sc = new SparkContext(conf) //需求: 根据数据计算个网站的PV、UV、同时、只显示top3 // 设置三个分区 //数据格式:199.111.148.214 重庆 2018-11-12 1542011088714 6755235587059844279 www.taobao.com Comment val fileRDD = sc.textFile("data/bigdata-spark_data_pvuvdata.txt", 3) val mapRDD = fileRDD.map(line => (line.split(" ")(5), 1)) //www.taobao.com 1 val wordCount = mapRDD.reduceByKey(_ + _) //www.taobao.com n val fanzhuan = wordCount.map(x => { (x._2, x._1) }) //n www.taobao.com val sortKey = fanzhuan.sortByKey(false) val wordByTop5 = sortKey.map(_.swap) // _.swap 互换、相当于(x=> {(x._2, x._1) val tuples = wordByTop5.take(3) tuples.foreach(println) /** * 也可以用top,不用排序 * (www.taobao.com,18771) * (www.mi.com,18728) * (www.baidu.com,18636) */
// Requirement: count unique visitors (UV) per site by de-duplicating IPs
val mapList = fileRDD.map(line => {
  val split = line.split(" ")
  (split(5), split(0))
})
val disWords = mapList.distinct()
// www.taobao.com 199.111.148.214
val words = disWords.map(info => (info._1,1))
//www.taobao.com 1
val wordsCounts = words.reduceByKey(_+_)
val top5Tuples = wordsCounts.sortBy(_._2, false).take(5)
/**
* (www.taobao.com,15791)
* (www.mi.com,15769)
* (www.gome.com.cn,15740)
* (www.dangdang.com,15690)
* (www.baidu.com,15653)
*/
top5Tuples.foreach(println)
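The UV count above de-duplicates first and then counts. As an alternative sketch (not from the original notes), aggregateByKey can collect the distinct IPs per site in a single pass; the per-key Set can grow large when a site has many distinct IPs, so this is a trade-off rather than a drop-in replacement:

// Accumulate a Set of distinct IPs per site, then take its size as the UV.
val uvPerSite = mapList.aggregateByKey(Set.empty[String])(
  (ips, ip) => ips + ip, // add one IP to the partition-local set
  (a, b) => a ++ b       // merge the sets produced by different partitions
).mapValues(_.size)
uvPerSite.sortBy(_._2, false).take(5).foreach(println)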
Aggregation
val conf = new SparkConf().setAppName("aggregation").setMaster("local") val sc = new SparkContext(conf) sc.setLogLevel("ERROR") val data = sc.parallelize(List( ("zhangsan",111), ("zhangsan",222), ("zhangsan",333), ("lisi",444), ("lisi",555), ("lisi",666), ("wangwu",777) )) val list02 = data.groupByKey() list02.foreach(println) /** * combineByKeyWithClassTag * (zhangsan,CompactBuffer(111, 222, 333)) * (wangwu,CompactBuffer(777)) * (lisi,CompactBuffer(444, 555, 666)) */ // 行转列 一变多使用flatMap val list3 = list02.flatMap(x=> x._2.map(e => (x._1, e)).iterator) list3.foreach(println) /** * (zhangsan,111) * (zhangsan,222) * (zhangsan,333) * (wangwu,777) * (lisi,444) * (lisi,555) * (lisi,666) */ val list04 = list02.flatMapValues(e => e.iterator) list04.foreach(println) /** * 1.自动帮拼接key 2.可以不加iterator * (zhangsan,111) * (zhangsan,222) * (zhangsan,333) * (wangwu,777) * (lisi,444) * (lisi,555) * (lisi,666) */ println("- - - - - - - -- - -") // 取值每个key的前两个 list02.mapValues(e => e.toList.sorted.take(2)).foreach(println) /** * (zhangsan,List(111, 222)) * (wangwu,List(777)) * (lisi,List(444, 555)) */ list02.flatMapValues(e=> e.toList.sorted.take(2)).foreach(println) /** * (zhangsan,111) * (zhangsan,222) * (wangwu,777) * (lisi,444) * (lisi,555) */ println("- - - - - - - -") val sum = data.reduceByKey(_+_) sum.foreach(println) /** * (zhangsan,666) * (wangwu,777) * (lisi,1665) */ println("sum- - - - - - - -") val max = data.reduceByKey((up, down) => if(up > down) up else down) max.foreach(println) /** * (zhangsan,333) * (wangwu,777) * (lisi,666) */ println("max- - - - - - - -") val min = data.reduceByKey((up, down) => if(up < down) up else down) min.foreach(println) /** * (zhangsan,111) * (wangwu,777) * (lisi,444) */ println("min- - - - - - - -") val count = data.mapValues(e => 1).reduceByKey(_+_) count.foreach(println) /** * (zhangsan,3) * (wangwu,1) * (lisi,3) */ println("count- - - - - - - -") val tmp = sum.join(count) tmp.foreach(println) /** * (zhangsan,(666,3)) * (wangwu,(777,1)) * (lisi,(1665,3)) */ println("tmp- - - - - - - -") val avg = tmp.mapValues(e => e._1/e._2) //拉取两次 计算多次 avg.foreach(println) /** * (zhangsan,222) * (wangwu,777) * (lisi,555) */ // 优化 -> 拉取一次 计算一次 val tmpx = data.combineByKey( /** * 源码: * createCombiner: V => C, * mergeValue: (C, V) => C, * mergeCombiners: (C, C) => C, */ // 第一条记录的value,怎么放入hashmap (value:Int) => (value, 1), // 如果有第二条,第二条及以后的value放入到hashMap里 (oldValue:(Int, Int), newValue:Int) => (oldValue._1 + newValue, oldValue._2 + 1), // 合并溢血结果的函数 (v1:(Int, Int), v2:(Int,Int)) => (v1._1 + v2._2, v1._2 + v2._2) ) /** * (zhangsan,(666,3)) * (wangwu,(777,1)) * (lisi,(1665,3)) */ tmpx.foreach(println) println("- - - - - - -- - - -- - -") tmpx.mapValues(e => e._1 / e._2).foreach(println) /** * (zhangsan,222) * (wangwu,777) * (lisi,555) */
Partitioning
val conf = new SparkConf().setMaster("local").setAppName("partitions") val sc = new SparkContext(conf) sc.setLogLevel("ERROR") val data = sc.parallelize(1 to 4, 2) val sqlInfo = data.map(value =>{ println("------conn--mysql----") println(s"-----select $value-----") println("-----close--mysql------") value + "selected" }) sqlInfo.foreach(println) /** * 问题: 连接多次 * ------conn--mysql---- * -----select 1----- * -----close--mysql------ * 1selected * ------conn--mysql---- * -----select 2----- * -----close--mysql------ * 2selected * ------conn--mysql---- * -----select 3----- * -----close--mysql------ * 3selected * ------conn--mysql---- * -----select 4----- * -----close--mysql------ * 4selected */ val sqlInfo = data.mapPartitionsWithIndex( (p_index, p_iter)=> { val lb = new ListBuffer[String] println(s"--$p_index----conn--mysql----") while (p_iter.hasNext){ val value = p_iter.next() println(s"-----select $value-----") lb += (value + "select") } println(s"--$p_index-----close--mysql------") lb.iterator } ) sqlInfo.foreach(println) /** * 问题:虽然优化了mysql连接次数,但是带来新问题,val lb = new ListBuffer[String] 可能内存撑爆OOM * ------conn--mysql---- * -----select 1----- * -----select 2----- * -----close--mysql------ * 1select * 2select * ------conn--mysql---- * -----select 3----- * -----select 4----- * -----close--mysql------ * 3select * 4select */ // 解决val lb = new ListBuffer[String] 内存OOM撑爆问题 // 1.写入到文件. 写文件就不是一个高级架构师的思路,我们要规避io // 2.使用迭代器模式. 所以需要迭代器嵌套,不能让数据在中间缓存,防止OOM val sqlInfo = data.mapPartitionsWithIndex( (p_index, p_iter) => { // 模仿map或flatMap的思路,map一进一出, flatMap一进多出(父类也是个迭代器就用flatMap,模仿它的源码) new Iterator[String] { println(s"--$p_index----conn--mysql----") override def hasNext = if(p_iter.hasNext == false){ println(s"--$p_index-----close--mysql------") false }else true override def next() = { val value = p_iter.next() println(s"-----select $value-----") value + "selected" } } } ) sqlInfo.foreach(println) /** * pipeline模式,来一个数据处理一个,处理特别快,不占用内存,而且一个分区只连接一次mysql * --0----conn--mysql---- * -----select 1----- * 1selected * -----select 2----- * 2selected * --0-----close--mysql------ * --1----conn--mysql---- * -----select 3----- * 3selected * -----select 4----- * 4selected * --1-----close--mysql------ */
Random sampling and repartitioning
val conf = new SparkConf().setAppName("gaoji").setMaster("local") val sc = new SparkContext(conf) sc.setLogLevel("ERROR") val data = sc.parallelize(1 to 100) // sample(withReplacement,fraction,seed) 随机抽取几个数,withReplacement:是否重复抽取 fraction:10% seed: seed一样,抽取的值一样 data.sample(false, 0.1).foreach(println) println("- - -- - - - -") data.sample(true, 0.1, 222).foreach(println) println("- - -- - - - -") val data1 = sc.parallelize(1 to 100, 5) println(s"data:${data1.getNumPartitions}") // 5 val partitionData = data1.repartition(4) //5个分区切换为4个 println(s"data:${partitionData.getNumPartitions}") // 4 partitionData.foreach(println) val data2 = sc.parallelize(1 to 10, 5) val info2 = data2.mapPartitionsWithIndex( (p_index, p_iter)=>{ p_iter.map(e => (p_index,e)) } ) info2.foreach(println) /** * 数据分配倒了五个分区 * (0,1) * (0,2) * (1,3) * (1,4) * (2,5) * (2,6) * (3,7) * (3,8) * (4,9) * (4,10) */ val data2 = sc.parallelize(1 to 10, 5) val date3 = info2.repartition(3) // repartition 无论调大调小都会触发shuffle // repartition底层使用coalesce(numPartitions, shuffle = true) 当分区变多时候,必须有shuffle(如果没有shuffle,增加的分区是没有数据的,因为没有进行重新计算,数据重新分配操作),当分区表少时候,可以不需要shuffle(把删除的分区挪移到最后一个分区上) val info4 = date3.mapPartitionsWithIndex( (p_index, p_iter)=>{ p_iter.map(e => (p_index,e)) } ) info4.foreach(println) /** * 可以看到数据又进行了一次散列 * (0,(1,4)) * (0,(3,8)) * (0,(4,10)) * (1,(0,1)) * (1,(2,5)) * (2,(0,2)) * (2,(1,3)) * (2,(2,6)) * (2,(3,7)) * (2,(4,9)) */