简易版HyperLogLog
实现逻辑:
采用简单调和平均数来计算:先将各有效桶记录的最大尾部连续零位数排序,当有效桶数超过100时去除首尾各10%的数据,再对剩余数据取调和平均数进行估算。
Python代码:
import math
import random
import uuid
import hashlib
from bisect import bisect_right
# Numerically equal to 1 / (2 * ln 2).
# NOTE(review): presumably the asymptotic HyperLogLog bias-correction
# constant; it is not referenced by any code visible in this file.
HLL_ALPHA_INF = 0.721347520444481703680
class BitKeeper(object):
    """Track the longest run of trailing zero bits among the values added.

    Attributes are plain instance fields:
      base_bits -- bounds the largest run length that can be reported
                   (results are capped at ``2 ** base_bits - 1``).
      max_bits  -- largest run length seen so far; 0 until a value with at
                   least one trailing zero bit has been added.
    """

    def __init__(self, base_bits=64):
        self.base_bits = base_bits
        self.max_bits = 0

    def get_value_bits(self, keeper_value: int) -> int:
        """Return the number of trailing zero bits in ``keeper_value``.

        Zero has no set bit, so there is no run to measure; 0 is returned
        for it. For every other value the count is capped at
        ``2 ** base_bits - 1``.
        """
        if keeper_value == 0:
            # A linear scan would need 2 ** base_bits steps to discover
            # this (practically endless for base_bits=64), so answer directly.
            return 0
        # ``v & -v`` isolates the lowest set bit; its bit_length minus one is
        # the position of that bit, i.e. the count of zero bits below it.
        trailing_zeros = (keeper_value & -keeper_value).bit_length() - 1
        # Saturate instead of collapsing an over-long run to 0, which would
        # throw away the strongest observation.
        return min(trailing_zeros, 2 ** self.base_bits - 1)

    def add_item(self, keeper_value):
        """Fold ``keeper_value`` into ``max_bits``, keeping the larger run."""
        self.max_bits = max(self.max_bits, self.get_value_bits(keeper_value=keeper_value))
class MyHLL(object):
    """Simplified HyperLogLog-style estimator of the number of distinct items.

    Each added item is hashed with SHA-1. The hash modulo the number of
    keepers selects a ``BitKeeper``; the hash shifted right by
    ``keeper_bits`` is handed to that keeper, which remembers the largest
    value it derives from what it has been given. The estimate combines the
    harmonic mean of those per-keeper values with the number of keepers that
    hold a positive value.

    Args:
        keeper_bits: number of low hash bits discarded before the remainder
            is passed to a keeper.
        keeper_count: number of keepers (buckets) to allocate.
        base_bits: forwarded to every ``BitKeeper`` that is created.
    """

    def __init__(self, keeper_bits=14, keeper_count=2 ** 14, base_bits=6):
        self.base_bits = base_bits
        self.keeper_bits = keeper_bits
        self.keeper_count = keeper_count
        self.keepers = [BitKeeper(self.base_bits) for _ in range(keeper_count)]

    def hll_add(self, item_value):
        """Record one item.

        ``item_value`` must be a ``str``; it is UTF-8 encoded before hashing.
        """
        digest = hashlib.sha1(item_value.encode("utf-8")).hexdigest()
        hash_value = int(digest, 16)
        keeper = self.keepers[hash_value % len(self.keepers)]
        keeper.add_item(keeper_value=hash_value >> self.keeper_bits)

    def get_mid_bits(self):
        """Return the sorted ``max_bits`` of the valid keepers.

        When more than 100 values are available, the lowest 10% and the
        highest 10% are dropped to reduce the influence of outliers.
        """
        mid_bits = sorted(keeper.max_bits for keeper in self.get_valid_keepers())
        if len(mid_bits) > 100:
            start_index = int(len(mid_bits) * 0.10)
            stop_index = int(len(mid_bits) * 0.90)
            return mid_bits[start_index:stop_index]
        return mid_bits

    def get_valid_keepers(self):
        """Return the keepers whose ``max_bits`` is greater than zero."""
        return [keeper for keeper in self.keepers if keeper.max_bits > 0]

    def hll_count(self):
        """Return the estimated number of distinct items as an ``int``.

        The estimate is ``2 ** H * V`` where ``H`` is the harmonic mean of
        the values from ``get_mid_bits()`` and ``V`` is the number of valid
        keepers. Returns 0 when no keeper holds a positive value, instead of
        dividing by zero.
        """
        valid_keeper_count = len(self.get_valid_keepers())
        mid_bits = self.get_mid_bits()
        if not mid_bits:
            # Nothing usable has been recorded yet; the harmonic mean below
            # would otherwise divide by a zero sum.
            return 0
        sum_inverse_bits = 0
        for mid_bit in mid_bits:
            sum_inverse_bits += 1.0 / float(mid_bit)
        harmonic_mean = float(len(mid_bits)) / sum_inverse_bits
        return int(2 ** harmonic_mean * valid_keeper_count)
def demo(item_count: int):
    """Feed ``item_count`` random UUID strings into a MyHLL and print the result.

    The printed line shows the real count, the estimate, and the estimate as
    an integer percentage of the real count.
    """
    counter = MyHLL(keeper_bits=14, keeper_count=2 ** 14, base_bits=6)
    for _ in range(item_count):
        counter.hll_add(str(uuid.uuid4()))

    estimated = counter.hll_count()
    percent_of_actual = int(estimated * 100 / item_count)
    report = "item_count:{item_count}, estimate_count:{estimate_count}, estimate_rate:{estimate_rate}%".format(
        item_count=item_count,
        estimate_count=estimated,
        estimate_rate=percent_of_actual,
    )
    print(report)
if __name__ == "__main__":
    # Run the demo at increasing sizes to show how the estimate tracks the
    # true count.
    for sample_size in (
        10000, 20000, 30000,
        100000, 200000, 300000,
        1000000, 2000000, 3000000,
    ):
        demo(sample_size)
输出结果:
item_count:10000, estimate_count:11797, estimate_rate:117%
item_count:20000, estimate_count:21680, estimate_rate:108%
item_count:30000, estimate_count:30001, estimate_rate:100%
item_count:100000, estimate_count:80274, estimate_rate:80%
item_count:200000, estimate_count:172496, estimate_rate:86%
item_count:300000, estimate_count:277605, estimate_rate:92%
item_count:1000000, estimate_count:952530, estimate_rate:95%
item_count:2000000, estimate_count:1968881, estimate_rate:98%
item_count:3000000, estimate_count:2947463, estimate_rate:98%