• Redis随记--HyperLogLog的代码学习01


    基础结构和常量

    /* Header placed at the start of every HyperLogLog value. The fixed fields
     * occupy 16 bytes (4 + 1 + 3 + 8, all byte-sized members) and are followed
     * directly by the register data, whose layout depends on 'encoding'. */
    struct hllhdr {
        char magic[4];      /* Magic bytes "HYLL" identifying the value. */
        uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE (HLL_RAW is an
                               internal-only value, never exposed). */
        uint8_t notused[3]; /* Reserved for future use, must be zero. */
        uint8_t card[8];    /* Cached cardinality, little endian. The most
                               significant bit of card[7] is the flag tested by
                               HLL_VALID_CACHE / set by HLL_INVALIDATE_CACHE. */
        uint8_t registers[]; /* Data bytes (flexible array member). */
    };
    
    /* The cached cardinality MSB (bit 7 of card[7], the last byte of the
     * little endian value) is used to signal validity of the cached value:
     * when the bit is set the cache is stale, when it is clear the cache is
     * valid.
     *
     * Both macros expand to a single fully parenthesized expression so they
     * can be used safely inside larger expressions (conditions, ternaries,
     * comma expressions) without operator precedence surprises. */
    #define HLL_INVALIDATE_CACHE(hdr) ((hdr)->card[7] |= (1<<7))
    #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
    
    #define HLL_P 14 /* The greater is P, the smaller the error. */
    #define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for
                                determining the number of leading zeros. */
    #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
    #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
    #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
    #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1) /* Largest value a register can hold (63). */
    #define HLL_HDR_SIZE sizeof(struct hllhdr) /* Bytes before the register data. */
    /* Header plus HLL_REGISTERS registers of HLL_BITS bits each, rounded up to
     * a whole number of bytes. */
    #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8))
    #define HLL_DENSE 0 /* Dense encoding. */
    #define HLL_SPARSE 1 /* Sparse encoding. */
    #define HLL_RAW 255 /* Only used internally, never exposed. */
    #define HLL_MAX_ENCODING 1 /* Highest encoding value listed above other than HLL_RAW. */
    #define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */
    

    相关函数

    hllCount 函数

    /* Estimate the cardinality of the set from the register histogram of the
     * HyperLogLog whose header is 'hdr'. 'hdr' points to the start of the SDS
     * string holding the HLL representation (its total length is read with
     * sdslen() when the encoding is sparse).
     *
     * 'invalid' is forwarded to the sparse histogram routine only: per the
     * original contract it is set to non-zero when the sparse representation
     * is not valid and is left untouched otherwise.
     *
     * Besides HLL_DENSE and HLL_SPARSE, the internal-only HLL_RAW encoding is
     * accepted: in that case hdr->registers is an array of HLL_REGISTERS
     * uint8_t values, one per register, which speeds up PFCOUNT against
     * multiple keys because no 6-bit unpacking is needed.
     *
     * Any other encoding value triggers serverPanic(). */
    uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
        /* HLL_Q+2 slots would be enough, because HLL_Q+1 is the largest
         * register value the hash function can produce. Checking the input
         * for sanity is slow, so the histogram is simply given a safe size:
         * unexpected values end up in wrong, but correctly allocated,
         * slots. */
        int reghisto[64] = {0};
        double registers_total = HLL_REGISTERS;
        double zsum;
        double estimate;
        int k;
    
        /* Fill the histogram using the routine that matches the encoding. */
        if (hdr->encoding == HLL_DENSE) {
            hllDenseRegHisto(hdr->registers,reghisto);
        } else if (hdr->encoding == HLL_SPARSE) {
            hllSparseRegHisto(hdr->registers,
                             sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,reghisto);
        } else if (hdr->encoding == HLL_RAW) {
            hllRawRegHisto(hdr->registers,reghisto);
        } else {
            serverPanic("Unknown HyperLogLog encoding in hllCount()");
        }
    
        /* Estimate the cardinality from the register histogram. See:
         * "New cardinality estimation algorithms for HyperLogLog sketches"
         * Otmar Ertl, arXiv:1702.01284 */
        zsum = registers_total *
               hllTau((registers_total-reghisto[HLL_Q+1])/registers_total);
        k = HLL_Q;
        while (k >= 1) {
            /* Fold the buckets from the highest value down, halving each time. */
            zsum += reghisto[k];
            zsum *= 0.5;
            k--;
        }
        zsum += registers_total * hllSigma(reghisto[0]/registers_total);
        estimate = llroundl(HLL_ALPHA_INF*registers_total*registers_total/zsum);
    
        return (uint64_t) estimate;
    }
    

    hllRawRegHisto函数

    /* Implements the register histogram calculation for the uint8_t (one byte
     * per register) representation, which is only used internally as a speedup
     * for PFCOUNT with multiple keys.
     *
     * 'registers' must point to HLL_REGISTERS bytes. For every register value
     * v, reghisto[v] is incremented; the histogram is NOT cleared first, so the
     * caller must pass it already initialized.
     *
     * The registers are scanned in groups of 8 so that a group made only of
     * zeroes is accounted for with a single addition. The group is tested byte
     * by byte instead of through a uint64_t pointer cast: reading a uint8_t
     * buffer through a uint64_t lvalue violates the C effective type
     * (strict aliasing) rules and may be a misaligned access.
     *
     * NOTE(review): register values are used as indexes without any range
     * check. The caller visible in this file passes a 64-entry histogram, so
     * values are presumably never greater than 63 -- confirm against the code
     * that builds the raw registers. */
    void hllRawRegHisto(uint8_t *registers, int* reghisto) {
        int j;
    
        for (j = 0; j < HLL_REGISTERS/8; j++) {
            const uint8_t *bytes = registers + j*8;
    
            if ((bytes[0] | bytes[1] | bytes[2] | bytes[3] |
                 bytes[4] | bytes[5] | bytes[6] | bytes[7]) == 0) {
                /* Fast path: eight empty registers at once. */
                reghisto[0] += 8;
            } else {
                int k;
    
                for (k = 0; k < 8; k++) reghisto[bytes[k]]++;
            }
        }
    }
    

    hllTau函数

    /* Helper function tau as defined in
     * "New cardinality estimation algorithms for HyperLogLog sketches"
     * Otmar Ertl, arXiv:1702.01284 */
    double hllTau(double x) {
        if (x == 0. || x == 1.) return 0.;
        double zPrime;
        double y = 1.0;
        double z = 1 - x;
        do {
            x = sqrt(x);
            zPrime = z;
            y *= 0.5;
            z -= pow(1 - x, 2)*y;
        } while(zPrime != z);
        return z / 3;
    }
    

    hllSigma函数

    /* Helper function sigma as defined in
     * "New cardinality estimation algorithms for HyperLogLog sketches"
     * Otmar Ertl, arXiv:1702.01284
     *
     * Evaluates
     *
     *   sigma(x) = x + SUM_{k>=1} 2^(k-1) * x^(2^k)
     *
     * by adding terms until the partial sum stops changing in double
     * precision. Returns INFINITY for the special input x == 1. */
    double hllSigma(double x) {
        double power = x;     /* x^(2^k), updated by repeated squaring. */
        double weight = 1;    /* 2^(k-1) */
        double total = x;     /* Running value of the series. */
        double before;        /* Series value before the current term. */
    
        if (x == 1.) return INFINITY;
    
        for (;;) {
            before = total;
            power *= power;
            total += power * weight;
            weight += weight;
            /* Converged: the last term no longer changes the sum. */
            if (total == before) break;
        }
        return total;
    }
    
  • 相关阅读:
    Linux下Apache服务器并发优化
    centos 7 mount win共享文件夹 开机自动挂载
    自学 phpredis 的心路历程
    VM虚拟机下centos7 无法上网的问题解决办法
    php headers_sent 函数的作用
    is_file 与 file_exists 的区别
    php 面向对象 中的self
    php 去除所有空格 包括中文空格圆角空格
    滑动窗口滚动条触发事件
    PHP中file_exists与is_file、is_dir的区别,以及执行效率的比较 转自#冰雪傲骨#
  • 原文地址:https://www.cnblogs.com/gaogao67/p/14444377.html
Copyright © 2020-2023  润新知