需求
统计每个一小时窗口内的独立访客数(UV):同一个用户在窗口内无论访问多少次都只计一次
难点:如果用户量很大,用Set等数据结构去重需要把所有userId都保存在内存中,很容易OOM。这时可以利用布隆过滤器:先判断user是否可能已经存在,不存在则计数+1并把它加入过滤器,存在则不做计算,这样能节省大量的内存空间。注意:布隆过滤器存在误判(可能把没出现过的用户判为已存在),所以得到的UV是略微偏小的近似值
利用Flink自带(shaded)的Guava布隆过滤器(BloomFilter)来实现
package project

import java.lang
import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.shaded.guava18.com.google.common.hash.{BloomFilter, Funnels}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**
 * UV (unique visitor) count: the number of distinct users that produced a "pv"
 * event, i.e. page views de-duplicated by userId.
 *
 * Scenario from the original notes: a one-hour window sliding every 5 seconds
 * with about 100 million users per hour. Note that the job below actually uses
 * `timeWindow(Time.hours(1))` with a single argument, i.e. no slide interval.
 *
 * De-duplication uses a Bloom filter (a bit array plus hash functions) instead
 * of a Set, so memory stays bounded. A Bloom filter can report a user as
 * "already seen" when it was not, so the count is an approximation that may be
 * slightly too low.
 */
object UvByBloomFilterWithoutRedis {

  /** One parsed line of the input CSV; `timestamp` is stored in milliseconds. */
  case class UserBehavior(userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)

  /** Builds and runs the job: read CSV, keep "pv" events, count unique users per one-hour window. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val stream = env
      // Backslashes must be escaped: sequences such as "\F" or "\s" are invalid
      // escapes, and "\f" / "\r" would silently turn into control characters.
      .readTextFile("D:\\flink-tutorial\\FlinkSZ1128\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        // The fifth column is multiplied by 1000 to obtain a millisecond timestamp.
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      .assignAscendingTimestamps(_.timestamp) // assign ascending event-time timestamps
      // A single constant key routes every event to the same window instance.
      .map(r => ("key", r.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .aggregate(new UvAggFunc, new UvProcessFunc)

    stream.print()
    env.execute()
  }

  /**
   * Incremental aggregation whose accumulator is `(uvCount, bloomFilter)`.
   *
   * Input is `(key, userId)`; the result is the unique-visitor count.
   */
  class UvAggFunc extends AggregateFunction[(String, Long), (Long, BloomFilter[lang.Long]), Long] {

    /** Starts at zero with a filter created for 100,000,000 expected insertions and 0.01 false-positive probability. */
    override def createAccumulator(): (Long, BloomFilter[lang.Long]) =
      (0L, BloomFilter.create(Funnels.longFunnel(), 100000000, 0.01))

    /** Counts the user only when the filter has definitely not seen the userId before. */
    override def add(value: (String, Long), accumulator: (Long, BloomFilter[lang.Long])): (Long, BloomFilter[lang.Long]) = {
      val (uvCount, bloom) = accumulator
      val userId: lang.Long = value._2
      if (bloom.mightContain(userId)) {
        // Possibly seen already (or a false positive): leave the count untouched.
        accumulator
      } else {
        bloom.put(userId)
        (uvCount + 1, bloom)
      }
    }

    /** Returns the unique-visitor count. */
    override def getResult(accumulator: (Long, BloomFilter[lang.Long])): Long = accumulator._1

    /**
     * Merging is not supported: the number of users present in both filters
     * cannot be recovered from the two partial counts, so no exact merged
     * count exists.
     *
     * @throws UnsupportedOperationException always
     */
    override def merge(a: (Long, BloomFilter[lang.Long]), b: (Long, BloomFilter[lang.Long])): (Long, BloomFilter[lang.Long]) =
      throw new UnsupportedOperationException(
        "UvAggFunc.merge is not supported: unique counts backed by Bloom filters cannot be merged exactly")
  }

  /** Formats the pre-aggregated count together with the window bounds. */
  class UvProcessFunc extends ProcessWindowFunction[Long, String, String, TimeWindow] {

    /**
     * Emits one line for the window.
     *
     * @param elements the value(s) produced by [[UvAggFunc]]; only the first one is read
     */
    override def process(key: String, context: Context, elements: Iterable[Long], out: Collector[String]): Unit = {
      val start = new Timestamp(context.window.getStart)
      val end = new Timestamp(context.window.getEnd)
      out.collect(s"窗口开始时间为$start 到 $end 的uv 为 ${elements.head}")
    }
  }
}
利用Redis的bitmap手动实现一个简单的布隆过滤器(为了便于理解,只使用一个哈希函数)
import java.sql.Timestamp

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

/**
 * UV (unique visitor) count: page views de-duplicated by userId, with the
 * de-duplication state kept in Redis instead of Flink window state.
 *
 * Scenario from the original notes: a one-hour window sliding every 5 seconds
 * with about 100 million users per hour. Note that the job below actually uses
 * `timeWindow(Time.hours(1))` with a single argument, i.e. no slide interval.
 *
 * Redis layout used by this job:
 *  - hash "UvCount": field = window end (millis, as string) -> UV count
 *  - bitmap keyed by the window end string: the Bloom filter bit array
 */
object UvByBloomFilter {

  /** One parsed line of the input CSV; `timestamp` is stored in milliseconds. */
  case class UserBehavior(userId: Long, itemId: Long, categoryId: Long, behavior: String, timestamp: Long)

  /** Builds and runs the job: read CSV, keep "pv" events, update the Redis-backed UV count per event. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val stream = env
      .readTextFile("/Users/yuanzuo/Desktop/flink-tutorial/FlinkSZ1128/src/main/resources/UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        // The fifth column is multiplied by 1000 to obtain a millisecond timestamp.
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      .assignAscendingTimestamps(_.timestamp) // assign ascending event-time timestamps
      // A single constant key routes every event to the same window instance.
      .map(r => ("key", r.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new UvTrigger)
      .process(new UvProcessFunc)

    stream.print()
    env.execute()
  }

  /**
   * Fires and purges the window for every element so that no element is
   * buffered in window state, and prints the stored count on the event-time
   * callback.
   */
  class UvTrigger extends Trigger[(String, Long), TimeWindow] {

    /** Called once per element: evaluate the window immediately, then clear it. */
    override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
      TriggerResult.FIRE_AND_PURGE

    override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
      TriggerResult.CONTINUE

    /**
     * Reads the UV count stored for this window from Redis and prints it.
     *
     * NOTE(review): this trigger never registers a timer itself, so this
     * callback presumably runs from the timer Flink's window operator
     * registers for the end of the window — confirm for the Flink version in use.
     */
    override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
      val windowEnd = window.getEnd.toString
      val jedis = new Jedis("localhost", 6379)
      try {
        // Printed as a tuple: (window end time, stored count or null).
        println((new Timestamp(windowEnd.toLong), jedis.hget("UvCount", windowEnd)))
      } finally {
        // A connection is opened per call, so it must be released per call.
        jedis.close()
      }
      TriggerResult.CONTINUE
    }

    override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
  }

  /** Updates the Redis bitmap and UV count for the single element handed over by [[UvTrigger]]. */
  class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {

    // Created lazily on first use; transient so the connection is never part of
    // the serialized function.
    @transient lazy val jedis = new Jedis("localhost", 6379)

    /**
     * Marks the user's bit for this window and increments the stored count
     * when the bit was not set before. Nothing is emitted through `out`.
     *
     * @param elements expected to hold exactly one element, because the
     *                 trigger fires and purges on every element
     */
    override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[String]): Unit = {
      val windowEnd = context.window.getEnd.toString
      val userId = elements.head._2.toString

      // Index of this user's bit inside a bit array of 2^20 bits.
      val idx = hash(userId, 1 << 20)

      // A bit of 0 means the user has definitely not been counted for this window.
      if (!jedis.getbit(windowEnd, idx)) {
        jedis.setbit(windowEnd, idx, true)
        // A missing field means no user has been counted yet for this window.
        val count = Option(jedis.hget("UvCount", windowEnd)).map(_.toLong).getOrElse(0L)
        jedis.hset("UvCount", windowEnd, (count + 1).toString)
      }
    }
  }

  /**
   * Single polynomial hash used as the (only) Bloom filter hash function.
   *
   * @param value string to hash
   * @param size  length of the bit array; the result is masked with
   *              `size - 1`, so `size` must be a power of two for the index
   *              to cover the whole array
   * @return index into the bit array, in `[0, size)` when `size` is a power of two
   */
  def hash(value: String, size: Long): Long = {
    val seed = 61 // prime multiplier, chosen to reduce collisions
    // Long arithmetic wraps on overflow; the mask below keeps the index non-negative.
    val result = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
    (size - 1) & result
  }
}