• Spark: solving hash collisions when partitioning


    Problem background: PairRDD

    When you call Spark's file-saving API, the result is one output file per partition, and by default each record in a PairRDD is assigned to a partition by the hash of its key. Different keys can therefore hash into the same partition, so a single output file may mix records with several keys.

    The fix ensures that all records with the same key end up in the same partition, one partition per key.
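
    For reference, here is a minimal sketch of the default assignment (it mirrors the logic of Spark's built-in HashPartitioner; the standalone helper below is illustrative, not Spark's actual API):

    // Default routing: partition = key.hashCode mod numPartitions,
    // adjusted so the result is never negative. Nothing stops two
    // distinct hosts from landing on the same partition number.
    def defaultPartition(key: Any, numPartitions: Int): Int = {
      val raw = key.hashCode % numPartitions
      if (raw < 0) raw + numPartitions else raw
    }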

    Solution:

    1. Define your own Partitioner
    
    
    // Keeping a host -> partition map inside the partitioner is usually the cleanest approach
    class HostPartitioner(hosts: Array[String]) extends Partitioner {
      // Derive each host's partition number from its position in the array
      val hostMap = new mutable.HashMap[String, Int]()
      var count = 0
      for (host <- hosts) {
        hostMap += (host -> count)
        count += 1
      }

      override def numPartitions: Int = hosts.length

      // key is the host produced upstream (three of them in this data set);
      // Spark passes each record's key in automatically to decide its partition
      override def getPartition(key: Any): Int = {
        hostMap.getOrElse(key.toString, 0)
      }
    }
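
    A quick sanity check of the mapping (the host names here are made up for illustration):

    // Hypothetical hosts, just to show how keys map to partitions
    val p = new HostPartitioner(Array("a.itcast.cn", "b.itcast.cn", "c.itcast.cn"))
    p.getPartition("b.itcast.cn")  // 1 -- each known host owns one partition
    p.getPartition("unknown.com")  // 0 -- unknown keys fall back to partition 0

    Note the getOrElse fallback: any key missing from the host array goes to partition 0. That is safe here because the array is built from the RDD's own distinct keys.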
     

    The complete code is as follows:

    package flowanalysis

    import java.net.URL

    import org.apache.spark.{Partitioner, SparkConf, SparkContext}

    import scala.collection.mutable

    /**
     * Created by willian on 2017/3/18.
     * Fixes the hash collisions that occur when saving into Hadoop, which
     * left the file contents not partitioned by key.
     */
    object FlowAnalysisPartitioner {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("flow_analysis").setMaster("local")
        val sc = new SparkContext(conf)
        // Log lines are space-separated; field 1 is the URL
        val rdd = sc.textFile("/Users/willian/Desktop/project/spark/wordcount/src/main/resources/itcast.log").map(line => {
          val f = line.split(" ")
          (f(1), 1)
        })
        // Count hits per URL
        val rdd1 = rdd.reduceByKey(_ + _)
        // Re-key each record by host: (host, (url, count))
        val rdd3 = rdd1.map(tuple => {
          val url = tuple._1
          val host = new URL(url).getHost
          (host, (url, tuple._2))
        })
        // Collect the distinct hosts; they define the partitions
        val hostrdd = rdd3.map(_._1).distinct().collect()
        // rdd3.repartition(3).saveAsTextFile("/Users/willian/Desktop/project/spark/wordcount/src/main/output")
        // println(hostrdd.toBuffer)
        val partitioner = new HostPartitioner(hostrdd)

        // Route every record to its host's partition, then keep the
        // top-3 URLs by hit count within each partition
        rdd3.partitionBy(partitioner).mapPartitions(it => {
          it.toList.sortBy(_._2._2).reverse.take(3).iterator
        }).saveAsTextFile("/Users/willian/Desktop/project/spark/wordcount/src/main/output")
      }
    }

    class HostPartitioner(hosts: Array[String]) extends Partitioner {
      // Derive each host's partition number from its position in the array
      val hostMap = new mutable.HashMap[String, Int]()
      var count = 0
      for (host <- hosts) {
        hostMap += (host -> count)
        count += 1
      }

      override def numPartitions: Int = hosts.length

      // key is the host produced upstream; Spark passes each record's key
      // in automatically to decide its partition
      override def getPartition(key: Any): Int = {
        hostMap.getOrElse(key.toString, 0)
      }
    }
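
    With this partitioner the output directory holds one part file per distinct host, each containing only that host's top-3 URLs. To confirm which records landed in which partition, a hypothetical debugging snippet (not part of the original program) could be run before the save:

    // Print each partition's index together with its records
    rdd3.partitionBy(partitioner)
      .mapPartitionsWithIndex((idx, it) => it.map(rec => (idx, rec)))
      .collect()
      .foreach(println)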
  • Original post: https://www.cnblogs.com/zhangweilun/p/6576693.html