intset 实现了一个数字元素的集合。
使用数组和元素的有序存放实现存取,查找过程使用二分查找法,所有插入删除的的效率为O(log2N)。 与其他数据结构类似,作者使用变编码方式实现对内存的高效利用。 初始化的intset中的数字定义为int16_t,即每个元素占用2个字节,而随着数据的插入,逐渐调整编码方式到int32_t或int64_t
上代码
intset.h
1 #ifndef __INTSET_H 2 #define __INTSET_H 3 #include <stdint.h> 4 5 typedef struct intset { 6 uint32_t encoding; 7 uint32_t length; 8 int8_t contents[]; 9 } intset; 10 11 intset *intsetNew(void); 12 intset *intsetAdd(intset *is, int64_t value, uint8_t *success); 13 intset *intsetRemove(intset *is, int64_t value, int *success); 14 uint8_t intsetFind(intset *is, int64_t value); 15 int64_t intsetRandom(intset *is); 16 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value); 17 uint32_t intsetLen(intset *is); 18 19 #endif // __INTSET_H
intset.c
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #include "intset.h" 5 #include "zmalloc.h" 6 7 /* Note that these encodings are ordered, so: 8 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */ 9 #define INTSET_ENC_INT16 (sizeof(int16_t)) 10 #define INTSET_ENC_INT32 (sizeof(int32_t)) 11 #define INTSET_ENC_INT64 (sizeof(int64_t)) 12 13 /* Return the required encoding for the provided value. */ 14 //根据v的值判断使用哪儿种int以节省内存 15 static uint8_t _intsetValueEncoding(int64_t v) { 16 if (v < INT32_MIN || v > INT32_MAX) 17 return INTSET_ENC_INT64; 18 else if (v < INT16_MIN || v > INT16_MAX) 19 return INTSET_ENC_INT32; 20 return INTSET_ENC_INT16; 21 } 22 23 //得到enc编码方式下的第pos个位置的值 24 /* Return the value at pos, given an encoding. */ 25 static int64_t _intsetGetEncoded(intset *is, int pos, uint8_t enc) { 26 if (enc == INTSET_ENC_INT64) 27 return ((int64_t*)is->contents)[pos]; 28 else if (enc == INTSET_ENC_INT32) 29 return ((int32_t*)is->contents)[pos]; 30 return ((int16_t*)is->contents)[pos]; 31 } 32 33 //得到is中第pos个位置的value 34 /* Return the value at pos, using the configured encoding. */ 35 static int64_t _intsetGet(intset *is, int pos) { 36 return _intsetGetEncoded(is,pos,is->encoding); 37 } 38 39 //在第n个pos个位置存放value 40 /* Set the value at pos, using the configured encoding. */ 41 static void _intsetSet(intset *is, int pos, int64_t value) { 42 if (is->encoding == INTSET_ENC_INT64) 43 ((int64_t*)is->contents)[pos] = value; 44 else if (is->encoding == INTSET_ENC_INT32) 45 ((int32_t*)is->contents)[pos] = value; 46 else 47 ((int16_t*)is->contents)[pos] = value; 48 } 49 50 //得到一个新的intset,编码采用INTSET_ENC_INT16 51 /* Create an empty intset. */ 52 intset *intsetNew(void) { 53 intset *is = zmalloc(sizeof(intset)); 54 is->encoding = INTSET_ENC_INT16; 55 is->length = 0; 56 return is; 57 } 58 59 /* Resize the intset */ 60 //重新为intset分配内存 61 static intset *intsetResize(intset *is, uint32_t len) { 62 uint32_t size = len*is->encoding; 63 is = zrealloc(is,sizeof(intset)+size); 64 return is; 65 } 66 67 68 //在排好序的iniset中查找value 69 //如果找到,返回1,*pos为其所在位置 70 //如果找不到,返回0,*pos为其能插入的位置 71 //二分查找法 72 /* Search for the position of "value". Return 1 when the value was found and 73 * sets "pos" to the position of the value within the intset. Return 0 when 74 * the value is not present in the intset and sets "pos" to the position 75 * where "value" can be inserted. */ 76 static uint8_t intsetSearch(intset *is, int64_t value, uint32_t *pos) { 77 int min = 0, max = is->length-1, mid = -1; 78 int64_t cur = -1; 79 80 /* The value can never be found when the set is empty */ 81 if (is->length == 0) { 82 if (pos) *pos = 0; 83 return 0; 84 } else { 85 /* Check for the case where we know we cannot find the value, 86 * but do know the insert position. */ 87 //如果插入值大于最后一个元素,或者小于第一个元素,则可以认定无法找到该元素 88 if (value > _intsetGet(is,is->length-1)) { 89 if (pos) *pos = is->length; 90 return 0; 91 } else if (value < _intsetGet(is,0)) { 92 if (pos) *pos = 0; 93 return 0; 94 } 95 } 96 97 while(max >= min) { 98 mid = (min+max)/2; 99 cur = _intsetGet(is,mid); 100 if (value > cur) { 101 min = mid+1; 102 } else if (value < cur) { 103 max = mid-1; 104 } else { 105 break; 106 } 107 } 108 109 if (value == cur) { 110 if (pos) *pos = mid; 111 return 1; 112 } else { 113 if (pos) *pos = min; 114 return 0; 115 } 116 } 117 118 //升级intset的编码,并且出入一个新数字,因为新数字的绝对值一定大于 119 //当前intset中所有的数字的绝对值,如果是负数,则最小,放在最前边,否则放在最后边 120 /* Upgrades the intset to a larger encoding and inserts the given integer. */ 121 static intset *intsetUpgradeAndAdd(intset *is, int64_t value) { 122 uint8_t curenc = is->encoding; 123 uint8_t newenc = _intsetValueEncoding(value); 124 int length = is->length; 125 int prepend = value < 0 ? 1 : 0; 126 127 /* First set new encoding and resize */ 128 is->encoding = newenc; 129 is = intsetResize(is,is->length+1); 130 131 /* Upgrade back-to-front so we don't overwrite values. 132 * Note that the "prepend" variable is used to make sure we have an empty 133 * space at either the beginning or the end of the intset. */ 134 while(length--) 135 _intsetSet(is,length+prepend,_intsetGetEncoded(is,length,curenc)); 136 137 /* Set the value at the beginning or the end. */ 138 if (prepend) 139 _intsetSet(is,0,value); 140 else 141 _intsetSet(is,is->length,value); 142 is->length++; 143 return is; 144 } 145 146 //把from开始直到最后intset最后的内容move到to开始的地方,在这之前,应该要resize一下 147 static void intsetMoveTail(intset *is, uint32_t from, uint32_t to) { 148 void *src, *dst; 149 uint32_t bytes = is->length-from; 150 if (is->encoding == INTSET_ENC_INT64) { 151 src = (int64_t*)is->contents+from; 152 dst = (int64_t*)is->contents+to; 153 bytes *= sizeof(int64_t); 154 } else if (is->encoding == INTSET_ENC_INT32) { 155 src = (int32_t*)is->contents+from; 156 dst = (int32_t*)is->contents+to; 157 bytes *= sizeof(int32_t); 158 } else { 159 src = (int16_t*)is->contents+from; 160 dst = (int16_t*)is->contents+to; 161 bytes *= sizeof(int16_t); 162 } 163 memmove(dst,src,bytes); 164 } 165 166 //在intset中插入一个元素,如果返回0,表示已经存在,否则返回1 167 /* Insert an integer in the intset */ 168 intset *intsetAdd(intset *is, int64_t value, uint8_t *success) { 169 uint8_t valenc = _intsetValueEncoding(value); 170 uint32_t pos; 171 if (success) *success = 1; 172 173 /* Upgrade encoding if necessary. If we need to upgrade, we know that 174 * this value should be either appended (if > 0) or prepended (if < 0), 175 * because it lies outside the range of existing values. */ 176 if (valenc > is->encoding) { 177 /* This always succeeds, so we don't need to curry *success. */ 178 return intsetUpgradeAndAdd(is,value); 179 } else { 180 /* Abort if the value is already present in the set. 181 * This call will populate "pos" with the right position to insert 182 * the value when it cannot be found. */ 183 if (intsetSearch(is,value,&pos)) { 184 if (success) *success = 0; 185 return is; 186 } 187 188 is = intsetResize(is,is->length+1); 189 if (pos < is->length) intsetMoveTail(is,pos,pos+1); 190 } 191 192 _intsetSet(is,pos,value); 193 is->length++; 194 return is; 195 } 196 197 //在intset中删除一个元素value 198 /* Delete integer from intset */ 199 intset *intsetRemove(intset *is, int64_t value, int *success) { 200 uint8_t valenc = _intsetValueEncoding(value); 201 uint32_t pos; 202 if (success) *success = 0; 203 204 if (valenc <= is->encoding && intsetSearch(is,value,&pos)) { 205 /* We know we can delete */ 206 if (success) *success = 1; 207 208 /* Overwrite value with tail and update length */ 209 if (pos < (is->length-1)) intsetMoveTail(is,pos+1,pos); 210 is = intsetResize(is,is->length-1); 211 is->length--; 212 } 213 return is; 214 } 215 216 //判断value是否在is中 217 /* Determine whether a value belongs to this set */ 218 uint8_t intsetFind(intset *is, int64_t value) { 219 uint8_t valenc = _intsetValueEncoding(value); 220 return valenc <= is->encoding && intsetSearch(is,value,NULL); 221 } 222 223 //随机取一个intset中的元素 224 /* Return random member */ 225 int64_t intsetRandom(intset *is) { 226 return _intsetGet(is,rand()%is->length); 227 } 228 229 //把pos位置上的元素取出,如果超出位置,返回0,找到了则返回1,*value为其值 230 /* Sets the value to the value at the given position. When this position is 231 * out of range the function returns 0, when in range it returns 1. */ 232 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) { 233 if (pos < is->length) { 234 *value = _intsetGet(is,pos); 235 return 1; 236 } 237 return 0; 238 } 239 240 //得到intset的元素数量 241 /* Return intset length */ 242 uint32_t intsetLen(intset *is) { 243 return is->length; 244 } 245 246 #ifdef INTSET_TEST_MAIN 247 #include <sys/time.h> 248 249 void intsetRepr(intset *is) { 250 int i; 251 for (i = 0; i < is->length; i++) { 252 printf("%lld\n", (uint64_t)_intsetGet(is,i)); 253 } 254 printf("\n"); 255 } 256 257 void error(char *err) { 258 printf("%s\n", err); 259 exit(1); 260 } 261 262 void ok(void) { 263 printf("OK\n"); 264 } 265 266 long long usec(void) { 267 struct timeval tv; 268 gettimeofday(&tv,NULL); 269 return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; 270 } 271 272 #define assert(_e) ((_e)?(void)0:(_assert(#_e,__FILE__,__LINE__),exit(1))) 273 void _assert(char *estr, char *file, int line) { 274 printf("\n\n=== ASSERTION FAILED ===\n"); 275 printf("==> %s:%d '%s' is not true\n",file,line,estr); 276 } 277 278 intset *createSet(int bits, int size) { 279 uint64_t mask = (1<<bits)-1; 280 uint64_t i, value; 281 intset *is = intsetNew(); 282 283 for (i = 0; i < size; i++) { 284 if (bits > 32) { 285 value = (rand()*rand()) & mask; 286 } else { 287 value = rand() & mask; 288 } 289 is = intsetAdd(is,value,NULL); 290 } 291 return is; 292 } 293 294 void checkConsistency(intset *is) { 295 int i; 296 297 for (i = 0; i < (is->length-1); i++) { 298 if (is->encoding == INTSET_ENC_INT16) { 299 int16_t *i16 = (int16_t*)is->contents; 300 assert(i16[i] < i16[i+1]); 301 } else if (is->encoding == INTSET_ENC_INT32) { 302 int32_t *i32 = (int32_t*)is->contents; 303 assert(i32[i] < i32[i+1]); 304 } else { 305 int64_t *i64 = (int64_t*)is->contents; 306 assert(i64[i] < i64[i+1]); 307 } 308 } 309 } 310 311 int main(int argc, char **argv) { 312 uint8_t success; 313 int i; 314 intset *is; 315 sranddev(); 316 317 printf("Value encodings: "); { 318 assert(_intsetValueEncoding(-32768) == INTSET_ENC_INT16); 319 assert(_intsetValueEncoding(+32767) == INTSET_ENC_INT16); 320 assert(_intsetValueEncoding(-32769) == INTSET_ENC_INT32); 321 assert(_intsetValueEncoding(+32768) == INTSET_ENC_INT32); 322 assert(_intsetValueEncoding(-2147483648) == INTSET_ENC_INT32); 323 assert(_intsetValueEncoding(+2147483647) == INTSET_ENC_INT32); 324 assert(_intsetValueEncoding(-2147483649) == INTSET_ENC_INT64); 325 assert(_intsetValueEncoding(+2147483648) == INTSET_ENC_INT64); 326 assert(_intsetValueEncoding(-9223372036854775808ull) == INTSET_ENC_INT64); 327 assert(_intsetValueEncoding(+9223372036854775807ull) == INTSET_ENC_INT64); 328 ok(); 329 } 330 331 printf("Basic adding: "); { 332 is = intsetNew(); 333 is = intsetAdd(is,5,&success); assert(success); 334 is = intsetAdd(is,6,&success); assert(success); 335 is = intsetAdd(is,4,&success); assert(success); 336 is = intsetAdd(is,4,&success); assert(!success); 337 ok(); 338 } 339 340 printf("Large number of random adds: "); { 341 int inserts = 0; 342 is = intsetNew(); 343 for (i = 0; i < 1024; i++) { 344 is = intsetAdd(is,rand()%0x800,&success); 345 if (success) inserts++; 346 } 347 assert(is->length == inserts); 348 checkConsistency(is); 349 ok(); 350 } 351 352 printf("Upgrade from int16 to int32: "); { 353 is = intsetNew(); 354 is = intsetAdd(is,32,NULL); 355 assert(is->encoding == INTSET_ENC_INT16); 356 is = intsetAdd(is,65535,NULL); 357 assert(is->encoding == INTSET_ENC_INT32); 358 assert(intsetFind(is,32)); 359 assert(intsetFind(is,65535)); 360 checkConsistency(is); 361 362 is = intsetNew(); 363 is = intsetAdd(is,32,NULL); 364 assert(is->encoding == INTSET_ENC_INT16); 365 is = intsetAdd(is,-65535,NULL); 366 assert(is->encoding == INTSET_ENC_INT32); 367 assert(intsetFind(is,32)); 368 assert(intsetFind(is,-65535)); 369 checkConsistency(is); 370 ok(); 371 } 372 373 printf("Upgrade from int16 to int64: "); { 374 is = intsetNew(); 375 is = intsetAdd(is,32,NULL); 376 assert(is->encoding == INTSET_ENC_INT16); 377 is = intsetAdd(is,4294967295,NULL); 378 assert(is->encoding == INTSET_ENC_INT64); 379 assert(intsetFind(is,32)); 380 assert(intsetFind(is,4294967295)); 381 checkConsistency(is); 382 383 is = intsetNew(); 384 is = intsetAdd(is,32,NULL); 385 assert(is->encoding == INTSET_ENC_INT16); 386 is = intsetAdd(is,-4294967295,NULL); 387 assert(is->encoding == INTSET_ENC_INT64); 388 assert(intsetFind(is,32)); 389 assert(intsetFind(is,-4294967295)); 390 checkConsistency(is); 391 ok(); 392 } 393 394 printf("Upgrade from int32 to int64: "); { 395 is = intsetNew(); 396 is = intsetAdd(is,65535,NULL); 397 assert(is->encoding == INTSET_ENC_INT32); 398 is = intsetAdd(is,4294967295,NULL); 399 assert(is->encoding == INTSET_ENC_INT64); 400 assert(intsetFind(is,65535)); 401 assert(intsetFind(is,4294967295)); 402 checkConsistency(is); 403 404 is = intsetNew(); 405 is = intsetAdd(is,65535,NULL); 406 assert(is->encoding == INTSET_ENC_INT32); 407 is = intsetAdd(is,-4294967295,NULL); 408 assert(is->encoding == INTSET_ENC_INT64); 409 assert(intsetFind(is,65535)); 410 assert(intsetFind(is,-4294967295)); 411 checkConsistency(is); 412 ok(); 413 } 414 415 printf("Stress lookups: "); { 416 long num = 100000, size = 10000; 417 int i, bits = 20; 418 long long start; 419 is = createSet(bits,size); 420 checkConsistency(is); 421 422 start = usec(); 423 for (i = 0; i < num; i++) intsetSearch(is,rand() % ((1<<bits)-1),NULL); 424 printf("%ld lookups, %ld element set, %lldusec\n",num,size,usec()-start); 425 } 426 427 printf("Stress add+delete: "); { 428 int i, v1, v2; 429 is = intsetNew(); 430 for (i = 0; i < 0xffff; i++) { 431 v1 = rand() % 0xfff; 432 is = intsetAdd(is,v1,NULL); 433 assert(intsetFind(is,v1)); 434 435 v2 = rand() % 0xfff; 436 is = intsetRemove(is,v2,NULL); 437 assert(!intsetFind(is,v2)); 438 } 439 checkConsistency(is); 440 ok(); 441 } 442 } 443 #endif