1. 程式人生 > 其它 >redis原始碼閱讀-資料結構篇-hyperloglog

redis原始碼閱讀-資料結構篇-hyperloglog

@

目錄

5. HyperLogLog 實現 hyperloglog.c

資料結構定義

  • hllhdr

    struct hllhdr {
        char magic[4];      /* "HYLL" */
        uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE. */
        uint8_t notused[3]; /* Reserved for future use, must be zero. */
        uint8_t card[8];    /* Cached cardinality, little endian. */
        uint8_t registers[]; /* Data bytes. */
    };
    
  • 一些巨集定義

    /* The cached cardinality MSB is used to signal validity of the cached value. */
    #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[0] |= (1<<7)
    #define HLL_VALID_CACHE(hdr) (((hdr)->card[0] & (1<<7)) == 0)
    
    #define HLL_P 14 /* The greater is P, the smaller the error. */
    #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
    #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
    #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
    #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1)
    #define HLL_HDR_SIZE sizeof(struct hllhdr)
    #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8))
    #define HLL_DENSE 0 /* Dense encoding. */
    #define HLL_SPARSE 1 /* Sparse encoding. */
    #define HLL_RAW 255 /* Only used internally, never exposed. */
    #define HLL_MAX_ENCODING 1
    
    static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n";
    ...
    

Helper函式(可跳過,需要時閱讀)

  • 雜湊函式 O(1)

    uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
        const uint64_t m = 0xc6a4a7935bd1e995;
        const int r = 47;
        uint64_t h = seed ^ (len * m);
        const uint8_t *data = (const uint8_t *)key;
        const uint8_t *end = data + (len-(len&7));
    
        while(data != end) {
            uint64_t k;
    #if (BYTE_ORDER == LITTLE_ENDIAN)
            k = *((uint64_t*)data);
    #else
            k = (uint64_t) data[0];
            k |= (uint64_t) data[1] << 8;
            k |= (uint64_t) data[2] << 16;
            k |= (uint64_t) data[3] << 24;
            k |= (uint64_t) data[4] << 32;
            k |= (uint64_t) data[5] << 40;
            k |= (uint64_t) data[6] << 48;
            k |= (uint64_t) data[7] << 56;
    #endif
            k *= m;
            k ^= k >> r;
            k *= m;
            h ^= k;
            h *= m;
            data += 8;
        }
        switch(len & 7) {
        case 7: h ^= (uint64_t)data[6] << 48;
        case 6: h ^= (uint64_t)data[5] << 40;
        case 5: h ^= (uint64_t)data[4] << 32;
        case 4: h ^= (uint64_t)data[3] << 24;
        case 3: h ^= (uint64_t)data[2] << 16;
        case 2: h ^= (uint64_t)data[1] << 8;
        case 1: h ^= (uint64_t)data[0];
                h *= m;
        };
        h ^= h >> r;
        h *= m;
        h ^= h >> r;
        return h;
    }
    

元素雜湊處理 O(1)

// 返回000...1的長度
// regp 儲存雜湊桶index
int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
    uint64_t hash, bit, index;
    int count;
	// 獲取雜湊值
    hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
    index = hash & HLL_P_MASK; // (1 << 14)個桶的index
    hash |= ((uint64_t)1<<63);// 確保至少一個1
    bit = HLL_REGISTERS; // 第15位開始算起
    count = 1; // 第一個出現1的rank
    while((hash & bit) == 0) {// 直到找到1
        count++;
        bit <<= 1;
    }
    *regp = (int) index;// (1 << 14)個桶的index
    return count;// 返回000...1的長度
}

新增基數 O(1)

int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
    uint8_t oldcount, count;
    long index;
    // 元素雜湊處理
    count = hllPatLen(ele,elesize,&index);
    // 000...1的長度更長則新增
    HLL_DENSE_GET_REGISTER(oldcount,registers,index);
    if (count > oldcount) {
        HLL_DENSE_SET_REGISTER(registers,index,count);
        return 1;
    } else {
        return 0;
    }
}

...

關於HyperLogLog可以詳見本人之前的部落格解析