• Flink 用布隆过滤器来实现UV统计


    需求

      查询一个小时之内的用户访问量(一个用户算一个)

    难点:如果用户量很多,要想用Set等数据结构在内存中实现去重不太现实,随时都可能OOM;这时可以利用布隆过滤器,先判断user是否可能已存在,不存在则计数+1并写入过滤器,存在则不做计算,这样能节省大量的内存空间;需要注意布隆过滤器存在一定的误判率(可能把新用户误判为已存在),因此得到的UV是略微偏小的近似值

    利用Flink自带(shaded)的Guava布隆过滤器来实现

    package project
    
    import java.lang
    import java.sql.Timestamp
    
    import org.apache.flink.api.common.functions.AggregateFunction
    import org.apache.flink.shaded.guava18.com.google.common.hash.{BloomFilter, Funnels}
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    import org.apache.flink.util.Collector
    
    
    // uv: unique visitor
    // 有多少用户访问过网站;pv按照userid去重
    // 滚动窗口:窗口长度1小时(代码中使用 timeWindow(Time.hours(1))),假设每小时用户数量1亿
    // 大数据去重的常用解决方案之一:布隆过滤器(结果为近似值)
    // 布隆过滤器的组成:bit数组,哈希函数
    object UvByBloomFilterWithoutRedis {
    
      /**
       * One parsed row of the user-behaviour CSV read by `main`.
       *
       * @param userId     id of the acting user (first CSV column)
       * @param itemId     second CSV column
       * @param categoryId third CSV column
       * @param behavior   fourth CSV column; `main` keeps only rows equal to "pv"
       * @param timestamp  fifth CSV column multiplied by 1000 in `main`
       *                   (presumably epoch seconds converted to milliseconds)
       */
      case class UserBehavior(
        userId: Long,
        itemId: Long,
        categoryId: Long,
        behavior: String,
        timestamp: Long
      )
    
      /**
       * Builds and runs the job: reads the behaviour CSV, keeps "pv" rows, and
       * prints one UV line per one-hour event-time window.
       *
       * @param args optional; `args(0)` overrides the default input CSV path
       */
      def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Forward slashes on purpose: inside a normal string literal the original
        // back-slashed path contained invalid escapes (\F, \s, \m, \U) and the
        // control-character escapes \f and \r, so it could not denote the file.
        val inputPath = args.headOption
          .getOrElse("D:/flink-tutorial/FlinkSZ1128/src/main/resources/UserBehavior.csv")

        val stream = env
          .readTextFile(inputPath)
          .map(line => {
            val arr = line.split(",")
            // The last column is multiplied by 1000 before being used as the event timestamp.
            UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
          })
          .filter(_.behavior == "pv")
          .assignAscendingTimestamps(_.timestamp) // timestamps are assumed to be ascending
          // Every record gets the same constant key, so one keyed window sees all users.
          .map(r => ("key", r.userId))
          .keyBy(_._1)
          .timeWindow(Time.hours(1))
          .aggregate(new UvAggFunc, new UvProcessFunc)

        stream.print()
        env.execute()
      }
    
      /**
       * Incremental UV aggregator whose accumulator is `(count, bloomFilter)`.
       *
       * Input elements are `(key, userId)` pairs; the result is the number of
       * user ids the Bloom filter had not seen before. Because a Bloom filter
       * can report false positives, the count is an approximation that may be
       * slightly lower than the true number of distinct users.
       */
      class UvAggFunc extends AggregateFunction[(String, Long), (Long, BloomFilter[lang.Long]), Long] {

        /** Starts at zero with a filter sized for 100,000,000 insertions at a 0.01 false-positive rate. */
        override def createAccumulator(): (Long, BloomFilter[lang.Long]) =
          (0L, BloomFilter.create(Funnels.longFunnel(), 100000000, 0.01))

        /**
         * Records `value._2` (the user id) and bumps the count when it is new.
         *
         * `put` returns true only when it changed at least one bit, i.e. exactly
         * when `mightContain` would have returned false beforehand, so a single
         * call replaces the separate check-then-insert.
         */
        override def add(value: (String, Long), accumulator: (Long, BloomFilter[lang.Long])): (Long, BloomFilter[lang.Long]) = {
          val (uvCount, bloom) = accumulator
          if (bloom.put(value._2)) (uvCount + 1, bloom) else accumulator
        }

        /** Returns the count component of the accumulator. */
        override def getResult(accumulator: (Long, BloomFilter[lang.Long])): Long = accumulator._1

        /**
         * Not supported: the two filters' bits could be combined, but the two
         * counts cannot, because users present in both accumulators would be
         * counted twice. Fail with an explicit, descriptive exception instead of
         * the fatal `NotImplementedError` raised by `???`. The fixed one-hour
         * windows used by `main` are not expected to call this method.
         */
        override def merge(a: (Long, BloomFilter[lang.Long]), b: (Long, BloomFilter[lang.Long])): (Long, BloomFilter[lang.Long]) =
          throw new UnsupportedOperationException(
            "UvAggFunc cannot merge accumulators: distinct counts backed by Bloom filters are not additive; " +
              "use it only with non-merging windows")
      }
      /**
       * Turns the single pre-aggregated UV count of a window into a printable
       * line containing the window's start and end timestamps.
       */
      class UvProcessFunc extends ProcessWindowFunction[Long, String, String, TimeWindow] {

        /**
         * @param key      the window key (unused)
         * @param context  gives access to the window bounds
         * @param elements the aggregated results; only the first one is reported
         * @param out      receives the formatted report line
         */
        override def process(key: String, context: Context, elements: Iterable[Long], out: Collector[String]): Unit = {
          val window = context.window
          val start = new Timestamp(window.getStart)
          val end = new Timestamp(window.getEnd)
          val report = s"窗口开始时间为$start 到 $end 的uv 为 ${elements.head}"
          out.collect(report)
        }
      }
    
    }

    利用redis的bitmap自己手动实现一个简单的布隆过滤器

    import java.sql.Timestamp
    
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    import org.apache.flink.util.Collector
    import redis.clients.jedis.Jedis
    
    // uv: unique visitor
    // 有多少用户访问过网站;pv按照userid去重
    // 滚动窗口:窗口长度1小时(代码中使用 timeWindow(Time.hours(1))),假设每小时用户数量1亿
    // 大数据去重的常用解决方案之一:布隆过滤器(结果为近似值)
    // 布隆过滤器的组成:bit数组,哈希函数
    object UvByBloomFilter {
    
      /**
       * One parsed row of the user-behaviour CSV read by `main`.
       *
       * @param userId     id of the acting user (first CSV column)
       * @param itemId     second CSV column
       * @param categoryId third CSV column
       * @param behavior   fourth CSV column; `main` keeps only rows equal to "pv"
       * @param timestamp  fifth CSV column multiplied by 1000 in `main`
       *                   (presumably epoch seconds converted to milliseconds)
       */
      case class UserBehavior(
        userId: Long,
        itemId: Long,
        categoryId: Long,
        behavior: String,
        timestamp: Long
      )
    
      /**
       * Builds and runs the Redis-backed UV job: reads the behaviour CSV, keeps
       * "pv" rows, and hands every record of a one-hour event-time window to
       * `UvProcessFunc` as soon as it arrives (see `UvTrigger`).
       *
       * @param args optional; `args(0)` overrides the default input CSV path,
       *             which otherwise points at the original author's machine
       */
      def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        val inputPath = args.headOption
          .getOrElse("/Users/yuanzuo/Desktop/flink-tutorial/FlinkSZ1128/src/main/resources/UserBehavior.csv")

        val stream = env
          .readTextFile(inputPath)
          .map(line => {
            val arr = line.split(",")
            // The last column is multiplied by 1000 before being used as the event timestamp.
            UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
          })
          .filter(_.behavior == "pv")
          .assignAscendingTimestamps(_.timestamp) // timestamps are assumed to be ascending
          // Every record gets the same constant key, so one keyed window sees all users.
          .map(r => ("key", r.userId))
          .keyBy(_._1)
          .timeWindow(Time.hours(1))
          .trigger(new UvTrigger)
          .process(new UvProcessFunc)

        stream.print()
        env.execute()
      }
    
      /**
       * Trigger that evaluates and empties the window for every incoming element,
       * so window state never holds more than one record, and that prints the UV
       * total stored in Redis whenever an event-time timer fires for the window.
       */
      class UvTrigger extends Trigger[(String, Long), TimeWindow] {

        /** Called once per element: fire the window function, then purge the window contents. */
        override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
          TriggerResult.FIRE_AND_PURGE

        /** Processing-time timers are ignored. */
        override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
          TriggerResult.CONTINUE

        /**
         * Reads the window's UV count from the "UvCount" Redis hash and prints it
         * as a `(windowEndTimestamp, count)` tuple. According to the original
         * author's note, this callback runs when the window closes.
         *
         * A short-lived connection is opened per call and always closed again;
         * previously each invocation leaked one open Redis connection.
         */
        override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
          val windowEnd = window.getEnd
          val jedis = new Jedis("localhost", 6379)
          try {
            // Explicit tuple: same output as the former two-argument println,
            // without relying on compiler argument adaptation.
            println((new Timestamp(windowEnd), jedis.hget("UvCount", windowEnd.toString)))
          } finally {
            jedis.close()
          }

          TriggerResult.CONTINUE
        }

        /** Nothing to clean up: this trigger keeps no state and registers no timers itself. */
        override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
      }
    
      /**
       * Maintains, per window, a one-hash-function Bloom filter and a UV counter in Redis.
       *
       * Redis layout (both keyed by the window end timestamp as a string):
       *  - hash "UvCount": field = window end, value = UV count
       *  - bitmap stored under the window end key: bit `hash(userId)` is 1 once that slot was seen
       */
      class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {
        // Created lazily on first use, so the connection object is built where the
        // function runs rather than where the job graph is assembled.
        @transient lazy val jedis = new Jedis("localhost", 6379)

        /**
         * Marks each user id of the firing in the window's bitmap and increments
         * the window's UV count for every bit that was previously 0. Emits nothing
         * to `out`; results live in Redis.
         *
         * With `UvTrigger` each firing carries one element, but all elements are
         * processed so the function stays correct under other triggers too.
         */
        override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[String]): Unit = {
          val windowEnd = context.window.getEnd.toString

          elements.foreach { case (_, userId) =>
            // Bit-array index of this user in a 2^20-bit bitmap.
            val idx = hash(userId.toString, 1 << 20)

            // SETBIT returns the previous bit value, so a single round trip both
            // tests and sets the bit; a previous value of 0 means the slot is new.
            if (!jedis.setbit(windowEnd, idx, true)) {
              // Atomic server-side increment replaces the former HGET + HSET pair.
              jedis.hincrBy("UvCount", windowEnd, 1L)
            }
          }
        }

        /** Releases the Redis connection when the function is shut down. */
        override def close(): Unit = {
          jedis.close()
          super.close()
        }
      }
    
      /**
       * The single hash function of the simplified Bloom filter: a polynomial
       * string hash (base 61, chosen as a prime to reduce collisions) reduced to
       * a bit-array index by masking with `size - 1`.
       *
       * The mask yields an index in `[0, size)` only when `size` is a power of
       * two; the visible caller passes `1 << 20`.
       *
       * @param value string to hash
       * @param size  length of the bit array
       * @return index into the bit array
       */
      def hash(value: String, size: Long): Long = {
        val seed = 61
        val mask = size - 1
        val polynomial = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
        mask & polynomial
      }
    }
  • 相关阅读:
    九度oj 题目1208:10进制 VS 2进制
    九度oj 题目1209:最小邮票数
    九度oj 题目1207:质因数的个数
    九度oj 题目1030:毕业bg
    九度oj 题目1014:排名
    九度oj 题目1048:判断三角形类型
    九度oj 题目1335:闯迷宫
    [Luogu] Tree
    点分治 算法学习 && [Poj] 1741
    [Luogu] 排序机械臂
  • 原文地址:https://www.cnblogs.com/yangxusun9/p/13170509.html
Copyright © 2020-2023  润新知