• LD-sketch源码阅读


    util.h

    IP转换函数,将二进制的IP地址转化为点分十进制的形式。
    
    /*
     * Convert IP (in network order) to a dotted-decimal string.
     *
     * The least-significant byte of `ip` is printed first, followed by the
     * next three bytes in increasing significance.
     *
     * @param ip   32-bit address value
     * @param addr caller-provided output buffer; must hold at least 16 bytes
     *             ("255.255.255.255" plus the terminating NUL)
     * @return addr, so the call can be used directly as a printf argument
     */
    inline char* ip2a(uint32_t ip, char* addr) {
        // %u with explicit unsigned casts: the arguments are unsigned, and
        // uint32_t is not guaranteed to be the same type as unsigned int.
        sprintf(addr, "%u.%u.%u.%u",
                (unsigned)(ip & 0xff),
                (unsigned)((ip >> 8) & 0xff),
                (unsigned)((ip >> 16) & 0xff),
                (unsigned)((ip >> 24) & 0xff));
        return addr;
    }
    

    hash.hpp/cpp

    mangle函数

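    注意:这里的 mangle 函数与 C++ 编译器的"名称修饰"(name mangling,即不同编译器或不同版本编译后符号名不同的问题)无关。它的作用是打散 key:把 key 的 nbytes 个字节按大端序拼成一个整数,乘以常数 2083697005 后取低 32 位,再按小端序逐字节写入 ret_key。
    (这个函数可能沿用自 seq-hash 的实现。seq-hash 用于测量 heavy hitter 和 heavy changer,提升了内存使用率,减少了计算开销,最关键的是可以恢复流的信息:
    它把一个 key 值按照位分为几个段,分别 hash 进一个二维表里面,最后再由这些二维表的值合并恢复出这个 key。)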

    /*
     * Scramble a key: read `key` as a big-endian integer, multiply it by
     * 2083697005, keep the low 32 bits of the product and write the result to
     * `ret_key` in little-endian byte order.
     *
     * Because only the low 32 bits are kept, ret_key[4..nbytes-1] are always 0
     * and only the last four bytes of `key` influence the result.
     *
     * @param key     input key, nbytes long
     * @param ret_key output buffer, nbytes long
     * @param nbytes  key length in bytes
     */
    void mangle(const unsigned char* key, unsigned char* ret_key,
            int nbytes) {
        unsigned long long new_key = 0;
        int i;
        /* Widen each byte BEFORE shifting: shifting the promoted int by 32 or
         * more bits is undefined, and a byte >= 0x80 shifted by 24 overflows
         * int and sign-extends into the upper half of new_key. Bytes beyond
         * the eighth would be shifted out entirely, so they are skipped. */
        for (i = 0; i < nbytes && i < 8; ++i) {
            new_key |= (unsigned long long)key[nbytes - i - 1] << (i * 8);
        }
        new_key = (new_key * 2083697005ULL) & 0xffffffffULL;
        for (i = 0; i < nbytes; ++i) {
            /* positions past the 64-bit accumulator carry no bits */
            ret_key[i] = (i < 8)
                ? (unsigned char)((new_key >> (i * 8)) & 0xff)
                : (unsigned char)0;
        }
    }
    

    GenHashSeed函数

    根据 index 生成并返回一个哈希种子:首次调用时用 rand() 取得全局的基础种子 seed,之后对 seed + index 做哈希,哈希值即为返回值。

    uint64_t seed = 0;
    uint64_t GenHashSeed(int index) {
        /*
        if (index == 0) {
            srand(0);
        }
        */
        if (seed == 0) {
            seed = rand();
        }
        uint64_t x, y = seed + index;
        mangle((const unsigned char*)&y, (unsigned char*)&x, 8);
        return AwareHash((uint8_t*)&y, 8, 388650253, 388650319, 1176845762);
    }
    

    AwareHash模块

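    接受参数:原数据、原数据长度。哈希值的初始值为388650253;对数据中的每一个字节,先把哈希值乘以规模因子388650319,再加上当前字节,然后指针向后移动一个字节,数据有多少字节就循环多少次。数据中心的流量非常巨大,要为如此巨大的流量产生分布均匀的哈希值,所用的常数也就取得很大;哈希值的类型是 unsigned int,乘法结果必然超出32位,这也是在自己的虚拟机上把数值调小了几倍仍然"溢出"的原因。不过无符号整数溢出会按 2^32 取模回绕,这是哈希函数有意利用的行为,并不是错误。最后将结果与1176845762做异或后返回。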

    /**
     * Multiplicative byte-wise hash.
     *
     * Starts from a fixed initial value, and for every input byte multiplies
     * the accumulator by a fixed factor and adds the byte. Arithmetic is
     * unsigned, so it wraps around on overflow. The accumulator is finally
     * XOR-ed with a fixed constant.
     *
     * @param data the bytes to be hashed
     * @param n the number of bytes to hash
     * @return the hash value
     */
    static unsigned int AwareHash(const unsigned char* data, unsigned int n) {
        const unsigned int multiplier = 388650319;
        const unsigned int final_xor = 1176845762;
        unsigned int acc = 388650253;
        unsigned int idx;

        for (idx = 0; idx < n; ++idx) {
            acc = acc * multiplier + data[idx];
        }
        return acc ^ final_xor;
    }
    

    LDSketch.hpp/cpp

    LDSketch更新函数,对一个sketch插入键值对。

    可在前部分使用mangle函数对key值进行编码(源码中这一调用已被注释掉)。对每一行 j,利用 LDSketch_find 函数找到 key 在该行对应的桶位置 k,再调用 dyn_tbl_update 把键值对更新到桶 tbl[j*w+k] 内。

    /*
     * Update step: add (key, val) to the sketch.
     *
     * For each of the sk->h rows, LDSketch_find selects one bucket in that
     * row and the key/value pair is passed to dyn_tbl_update for that bucket.
     * The key is used as given; no mangling is applied here.
     */
    void LDSketch_update(LDSketch_t* sk, unsigned char* key, long long val) {
        int row = 0;
        int bucket;

        while (row < sk->h) {
            // bucket index within this row, computed over the whole key
            bucket = LDSketch_find(sk, key, 0, sk->lgn - 1, row);
            // buckets are stored row-major: row * width + column
            dyn_tbl_update(sk->tbl[row*sk->w+bucket], key, val);
            ++row;
        }
    }
    

    LD_Sketch的find函数

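    计算一个key在sketch某一行中对应的桶编号。先把key的副本key_str全部清零,再把key的第start_bit位到第end_bit位复制到副本中:剩余位数不少于8时按整字节复制,不足8位时逐位复制。operator(代码中的oper)等于行数h乘以tbl_id(应为该sketch的编号)再加上行号row_no,并被复制到副本中偏移 lgn/8 字节处。最后把这个副本和长度(lgn/8 + sizeof(unsigned int) 字节)交给AwareHash,所得哈希值对每行的桶数w取模,即为返回的桶编号。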

    /*
     * Compute the bucket index of a key within one row of the sketch.
     *
     * Bits start_bit..end_bit of `key` are copied into a zeroed scratch
     * buffer, a per-row operator value is stored at byte offset lgn/8, and
     * the buffer is hashed with AwareHash modulo the row width tbl->w.
     * (A commented-out MD5-based variant of the hashing step was dropped from
     * this listing.)
     *
     * @param tbl       sketch whose h, tbl_id, lgn and w fields are read
     * @param key       key bytes
     * @param start_bit first bit to copy
     * @param end_bit   last bit to copy (inclusive)
     * @param row_no    row for which the bucket is computed
     * @return bucket index in [0, tbl->w)
     */
    unsigned int LDSketch_find(LDSketch_t* tbl, const unsigned char* key, int start_bit,
            int end_bit, int row_no) {
        unsigned char scratch[50];      // assume lgn/8 + 4 <= 50
        unsigned int oper;
        int pos;

        memset(scratch, 0, sizeof(scratch));

        // copy the selected bits: whole bytes while at least 8 bits remain,
        // then one bit at a time (most-significant bit of a byte first)
        for (pos = start_bit; pos <= end_bit; ) {
            const int byte_idx = pos / 8;
            if (end_bit - pos + 1 >= 8) {
                scratch[byte_idx] = key[byte_idx];
                pos += 8;
            } else {
                const int mask = 1 << (8 - ((pos % 8) + 1));
                scratch[byte_idx] |= (unsigned char)(key[byte_idx] & mask);
                ++pos;
            }
        }

        // the operator distinguishes rows (and sketches, via tbl_id); it is
        // placed right after the lgn/8 key bytes
        oper = tbl->h * tbl->tbl_id + row_no;
        memcpy(scratch + tbl->lgn/8, &oper, sizeof(unsigned int));

        return AwareHash(scratch,
                (unsigned int)(tbl->lgn/8 + sizeof(unsigned int))) % (tbl->w);
    }
    

    dyn_tbl.hpp/cpp

    关键词

    关键字的定义,最大长度为13字节。

    // Size in bytes of every key stored in a bucket.
    #define MAX_KEYLEN 13
    
    /**
     * Fixed-size key used in the per-bucket associative array.
     * Hashing and equality (dyn_tbl_key_hash / dyn_tbl_key_eq) always operate
     * on all MAX_KEYLEN bytes.
     */
    typedef struct dyn_tbl_key_s {
        /// raw key bytes (MAX_KEYLEN = 13 bytes)
        unsigned char key[MAX_KEYLEN];
    } dyn_tbl_key_t;
    
    

    哈希键值

    定义一个哈希函数的对象。返回该键值的哈希值

    /**
     * Hash functor for dyn_tbl_key_t, used as the hasher of the bucket's
     * unordered_map.
     */
    typedef struct {
        /// Hash all MAX_KEYLEN bytes of the key with AwareHash.
        long operator() (const dyn_tbl_key_t &k) const {
            const unsigned char* bytes = k.key;
            return AwareHash(bytes, MAX_KEYLEN);
        }
    } dyn_tbl_key_hash;
    
    

    比较函数

    定义一个比较函数,判断两个字串是否相等

    /**
     * Equality functor for dyn_tbl_key_t, used as the key-equality predicate
     * of the bucket's unordered_map.
     */
    typedef struct {
        /// Two keys are equal when all MAX_KEYLEN bytes match.
        bool operator() (const dyn_tbl_key_t &x, const dyn_tbl_key_t &y) const {
            const int diff = memcmp(x.key, y.key, MAX_KEYLEN);
            return diff == 0;
        }
    } dyn_tbl_key_eq;
    
    

    桶的数据结构

    包括A(i,j),V(i,j),L(i,j),e(i,j),T,maximum.

    /**
     * Bucket structure: one cell of the sketch.
     *
     * NOTE(review): the std::unordered_map member has a non-trivial
     * constructor and destructor, so an instance must be constructed and
     * destroyed as a C++ object; zero-filled raw memory is not a valid map.
     */
    typedef struct dyn_tbl_s {
    
        /// associative array A(i,j): maps a key to its long long counter
        std::unordered_map<dyn_tbl_key_t, long long, dyn_tbl_key_hash, dyn_tbl_key_eq> array;
    
        /// total sum: V(i,j)
        long long total;
    
        /// maximum length of counters allowed, exceeding this value would trigger expansion: l(i, j)
        unsigned int max_len;
    
        /// total number of decrement: e(i, j)
        unsigned int decrement;
    
        /// expansion parameter: T
        long long T;
    
        /// maximum sum among keys, to speed up detection
        long long max_value;
    
        /***********************
         * read only members
         ***********************/
        /// length of keys (compared against 32 when printing, so presumably in bits)
        unsigned int lgn;
    } dyn_tbl_t;
    
    

    初始化

    有参传参,无参置零

    /**
     * Allocate and initialise one bucket.
     *
     * The storage comes from calloc so that it can still be released with
     * free(), but the object is constructed in place: dyn_tbl_t contains a
     * std::unordered_map, and zero-filled memory is not a constructed map.
     * Placement new is declared in <new>.
     *
     * @param length initial max_len (number of counters allowed)
     * @param lgn    key length, stored in the read-only lgn member
     * @param T      expansion parameter
     * @return the new bucket, or NULL if the allocation fails
     */
    dyn_tbl_t* dyn_tbl_init(unsigned int length, int lgn, long long T) {
        void* mem = calloc(1, sizeof(dyn_tbl_t));
        if (mem == NULL) {
            return NULL;
        }
        // value-initialise in place: runs the unordered_map constructor and
        // zeroes the scalar members
        dyn_tbl_t* ret = new (mem) dyn_tbl_t();
        ret->lgn = lgn;
        ret->max_len = length;
        ret->decrement = 0;
        ret->total = 0;
        ret->T = T;
        ret->max_value = 0;
        return ret;
    }
    

    destroy

    free掉这个桶的内存空间

    /**
     * Release a bucket obtained from dyn_tbl_init.
     *
     * The destructor is run explicitly before free(): free() alone would
     * skip the std::unordered_map destructor and leak every node the map
     * still holds.
     *
     * @param dyn_tbl bucket to release; NULL is accepted and ignored
     */
    void dyn_tbl_destroy(dyn_tbl_t* dyn_tbl) {
        if (dyn_tbl == NULL) {
            return;     // same no-op behaviour free(NULL) had
        }
        dyn_tbl->~dyn_tbl_t();
        free(dyn_tbl);
    }
    
    

    重置函数

    将桶的参数置零

    /**
     * Empty a bucket for reuse: zero its total, decrement and max_value and
     * remove every entry from the associative array. max_len, T and lgn are
     * left untouched.
     */
    void dyn_tbl_reset(dyn_tbl_t* dyn_tbl) {
        dyn_tbl_t& bucket = *dyn_tbl;

        bucket.total = 0;
        bucket.decrement = 0;
        bucket.max_value = 0;
        bucket.array.clear();
    }
    

    复制函数

    将桶的参数复制

    /**
     * Copy the contents of one bucket into another.
     *
     * Copies the associative array, total, decrement, max_len and max_value.
     * T and lgn of the destination are not modified.
     *
     * @param dyn_tbl_from source bucket
     * @param dyn_tbl_to   destination bucket
     */
    void dyn_tbl_copy(dyn_tbl_t* dyn_tbl_from, dyn_tbl_t* dyn_tbl_to) {
        const dyn_tbl_t& src = *dyn_tbl_from;
        dyn_tbl_t& dst = *dyn_tbl_to;

        dst.array = src.array;
        dst.total = src.total;
        dst.decrement = src.decrement;
        dst.max_len = src.max_len;
        dst.max_value = src.max_value;
    }
    

    输出函数

    输出桶内的值,将值输出到文件中。主要输出的是桶内的IP地址的值。

    /**
     * Dump a bucket to a text file.
     *
     * The first line is "length: <number of entries>". Each following line
     * holds one entry: when lgn == 32 the first 4 key bytes are printed as
     * one dotted-decimal address followed by the counter; otherwise the first
     * 8 key bytes are printed as two addresses followed by the counter.
     *
     * Terminates the process with exit(-1) if the file cannot be opened.
     *
     * @param dyn_tbl bucket to print
     * @param output  path of the file to (over)write
     */
    void dyn_tbl_print(dyn_tbl_t* dyn_tbl, const char* output) {
        // open a file
        FILE* fp = fopen(output, "w");
        if (fp == NULL) {
            fprintf(stderr, "ERR: cannot open %s\n", output);
            exit(-1);
        }

        unsigned int len = dyn_tbl->array.size();
        fprintf(fp, "length: %u\n", len);
        for (auto it = dyn_tbl->array.begin(); it != dyn_tbl->array.end(); ++it) {
            const dyn_tbl_key_t& key = it->first;
            // Copy the address words out of the byte array with memcpy rather
            // than casting key.key to unsigned int*: the cast breaks the
            // aliasing rules and assumes the byte array is suitably aligned.
            unsigned int words[2] = {0, 0};
            char addr1[30];
            char addr2[30];
            if (dyn_tbl->lgn == 32) {
                memcpy(&words[0], key.key, sizeof(unsigned int));
                fprintf(fp, "%s %lld\n", ip2a(words[0], addr1), it->second);
            }
            else {
                memcpy(words, key.key, 2 * sizeof(unsigned int));
                fprintf(fp, "%s %s %lld\n", ip2a(words[0], addr1), ip2a(words[1], addr2), it->second);
            }
        }

        // close the file
        fclose(fp);
    }
    

    返回这个桶内的长度

    /**
     * Number of entries currently stored in the bucket's associative array.
     */
    int dyn_tbl_length(dyn_tbl_t* dyn_tbl) {
        const auto entries = dyn_tbl->array.size();
        return (int)entries;
    }
    
  • 相关阅读:
    【转贴】Cookie + Session + OAuth + SSO
    zz淘宝商品库MySQL优化实践
    HIVE 数据倾斜调优总结zz
    数据挖掘笔记(一)
    hive函数参考手册
    hive QL(HQL)简明指南zz
    数据挖掘笔记(二)
    python format string (转)
    hive 中转义符使用问题
    关于文档管理
  • 原文地址:https://www.cnblogs.com/vancasola/p/9985053.html
Copyright © 2020-2023  润新知