redis原始碼閱讀-資料結構篇-hyperloglog
阿新 • • 發佈:2021-06-10
@
目錄5. HyperLogLog 實現 hyperloglog.c
資料結構定義
-
hllhdr
struct hllhdr { char magic[4]; /* "HYLL" */ uint8_t encoding; /* HLL_DENSE or HLL_SPARSE. */ uint8_t notused[3]; /* Reserved for future use, must be zero. */ uint8_t card[8]; /* Cached cardinality, little endian. */ uint8_t registers[]; /* Data bytes. */ };
-
一些巨集定義
/* The cached cardinality MSB is used to signal validity of the cached value. */ #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[0] |= (1<<7) #define HLL_VALID_CACHE(hdr) (((hdr)->card[0] & (1<<7)) == 0) #define HLL_P 14 /* The greater is P, the smaller the error. */ #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */ #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */ #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */ #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1) #define HLL_HDR_SIZE sizeof(struct hllhdr) #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8)) #define HLL_DENSE 0 /* Dense encoding. */ #define HLL_SPARSE 1 /* Sparse encoding. */ #define HLL_RAW 255 /* Only used internally, never exposed. */ #define HLL_MAX_ENCODING 1 static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n"; ...
Helper函式(可跳過,需要時閱讀)
-
雜湊函式 O(1)
uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) { const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; uint64_t h = seed ^ (len * m); const uint8_t *data = (const uint8_t *)key; const uint8_t *end = data + (len-(len&7)); while(data != end) { uint64_t k; #if (BYTE_ORDER == LITTLE_ENDIAN) k = *((uint64_t*)data); #else k = (uint64_t) data[0]; k |= (uint64_t) data[1] << 8; k |= (uint64_t) data[2] << 16; k |= (uint64_t) data[3] << 24; k |= (uint64_t) data[4] << 32; k |= (uint64_t) data[5] << 40; k |= (uint64_t) data[6] << 48; k |= (uint64_t) data[7] << 56; #endif k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; data += 8; } switch(len & 7) { case 7: h ^= (uint64_t)data[6] << 48; case 6: h ^= (uint64_t)data[5] << 40; case 5: h ^= (uint64_t)data[4] << 32; case 4: h ^= (uint64_t)data[3] << 24; case 3: h ^= (uint64_t)data[2] << 16; case 2: h ^= (uint64_t)data[1] << 8; case 1: h ^= (uint64_t)data[0]; h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; }
元素雜湊處理 O(1)
// 返回000...1的長度
// regp 儲存雜湊桶index
int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
uint64_t hash, bit, index;
int count;
// 獲取雜湊值
hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
index = hash & HLL_P_MASK; // (1 << 14)個桶的index
hash |= ((uint64_t)1<<63);// 確保至少一個1
bit = HLL_REGISTERS; // 第15位開始算起
count = 1; // 第一個出現1的rank
while((hash & bit) == 0) {// 直到找到1
count++;
bit <<= 1;
}
*regp = (int) index;// (1 << 14)個桶的index
return count;// 返回000...1的長度
}
新增基數 O(1)
int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
uint8_t oldcount, count;
long index;
// 元素雜湊處理
count = hllPatLen(ele,elesize,&index);
// 000...1的長度更長則新增
HLL_DENSE_GET_REGISTER(oldcount,registers,index);
if (count > oldcount) {
HLL_DENSE_SET_REGISTER(registers,index,count);
return 1;
} else {
return 0;
}
}
...
關於HyperLogLog可以詳見本人之前的部落格解析