day25之布隆過濾器的實現和優缺點以及擴充套件

阿新 • • 發佈：2019-02-01

布隆過濾器（Bloom Filter）是1970年由布隆提出的。它實際上是一個很長的二進位制向量和一系列隨機對映函式。布隆過濾器可以用於檢索一個元素是否在一個集合中。它的優點是空間效率和查詢時間都遠遠超過一般的演算法，缺點是有一定的誤識別率和刪除困難。

程式碼實現：

//bitMap.h
// Growable bitmap stored in 32-bit words: bit i lives in word (i >> 5),
// at position (i & 31) inside that word.
// NOTE(review): the >>5 / &31 arithmetic assumes unsigned int has at least
// 32 bits, as the original code did.
class BitMap
{
public:
    // Creates an empty bitmap. Set() grows the storage on demand.
    BitMap()
    {}

    // Creates a bitmap with (size >> 5) + 1 zeroed words, so every index in
    // [0, size] is addressable without growing.
    BitMap(size_t size)
    {
        _table.resize((size >> 5) + 1);
    }

    // Sets the bit at index `data` to 1.
    // If the index lies beyond the current storage, the table is enlarged so
    // the write stays in bounds; an absurdly large index makes
    // std::vector::resize throw instead of corrupting memory.
    void Set(size_t data)
    {
        const size_t wordNo = data >> 5;
        const size_t bitNo = data & 31;

        if (wordNo >= _table.size())
            _table.resize(wordNo + 1);

        // Unsigned literal: shifting into bit 31 must not touch a sign bit.
        _table[wordNo] |= (1u << bitNo);
    }

    // Clears the bit at index `data` to 0.
    // An index beyond the current storage is already "0", so it is a no-op.
    void ReSet(size_t data)
    {
        const size_t wordNo = data >> 5;
        const size_t bitNo = data & 31;

        if (wordNo >= _table.size())
            return;

        _table[wordNo] &= ~(1u << bitNo);
    }

    // Returns true when the bit at index `data` is 1.
    // An index beyond the current storage reports false.
    bool Test(size_t data) const
    {
        const size_t wordNo = data >> 5;
        const size_t bitNo = data & 31;

        if (wordNo >= _table.size())
            return false;

        return (_table[wordNo] & (1u << bitNo)) != 0;
    }

private:
    std::vector<unsigned int> _table;  // packed bits, 32 per element
};

//common.h 布隆過濾器用到的雜湊函式
#pragma  once
#include<string>
#include<iostream>
using namespace std;

// Returns the smallest prime in the built-in table that is strictly greater
// than `prev` (e.g. prev = 10 -> 53). The table roughly doubles at each step.
// When `prev` is at or beyond the last entry, the largest table prime is
// returned instead of a sentinel, so the result is always a usable size.
// Declared inline because the definition lives in a header.
inline size_t GetNextPrim(size_t prev)
{
    static const unsigned long kPrimeList[] =
    {
        53ul,         97ul,         193ul,       389ul,       769ul,
        1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
        49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
        1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
        50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
        1610612741ul, 3221225473ul, 4294967291ul
    };
    const size_t kPrimeCount = sizeof(kPrimeList) / sizeof(kPrimeList[0]);

    for (size_t idx = 0; idx < kPrimeCount; ++idx)
    {
        if (prev < kPrimeList[idx])
            return kPrimeList[idx];
    }

    // prev exceeds every entry: clamp to the largest prime we know.
    return kPrimeList[kPrimeCount - 1];
}



// BKDR string hash: folds each character of the NUL-terminated string into a
// 32-bit accumulator (acc = acc * 131 + c) and returns the low 31 bits.
static size_t BKDRHash(const char * str)
{
    const unsigned int multiplier = 131; // other usual choices: 31 131 1313 13131 131313
    unsigned int acc = 0;
    for (const char* cursor = str; *cursor != '\0'; ++cursor)
    {
        acc = acc * multiplier + *cursor;
    }
    // Mask off the top bit so the result always fits in 31 bits.
    return acc & 0x7FFFFFFF;
}


// SDBM-style string hash over a NUL-terminated string.
// The `register` specifier was dropped (it is not valid C++17) and the
// function is inline because it is defined in a header. The produced values
// are unchanged.
inline size_t SDBMHash(const char* str)
{
    size_t hash = 0;
    for (; *str != '\0'; ++str)
    {
        const size_t ch = static_cast<size_t>(*str);
        // Both lines are the SDBM step (65599*h == (h<<6) + (h<<16) - h), so
        // the character is mixed in twice per iteration. Kept as-is so the
        // hash values stay the same.
        hash = 65599 * hash + ch;
        hash = ch + (hash << 6) + (hash << 16) - hash;
    }

    return hash;
}

// RS string hash over a NUL-terminated string: hash = hash * magic + c, where
// the multiplier starts at 63689 and is itself multiplied by 378551 after
// every character.
// The `register` specifier was dropped (it is not valid C++17) and the
// function is inline because it is defined in a header. The produced values
// are unchanged.
inline size_t RSHash(const char *str)
{
    size_t hash = 0;
    size_t magic = 63689;
    for (; *str != '\0'; ++str)
    {
        const size_t ch = static_cast<size_t>(*str);
        hash = hash * magic + ch;
        magic *= 378551;
    }

    return hash;
}

// AP string hash over a NUL-terminated string. Characters at even and odd
// positions are mixed with two different shift/xor formulas.
// Fix: the even-position formula previously left the character out, so every
// even-indexed character (including the first) had no effect on the result.
// The `register` specifier was dropped (it is not valid C++17) and the
// function is inline because it is defined in a header.
inline size_t APHash(const char* str)
{
    size_t hash = 0;
    for (long i = 0; *str != '\0'; ++str, ++i)
    {
        const size_t ch = static_cast<size_t>(*str);
        if (0 == (i & 1))
        {
            hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
        }
        else
        {
            hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
        }
    }

    return hash;
}

// JS string hash over a NUL-terminated string, seeded with 1315423911.
// An empty string hashes to 0 rather than to the seed.
// The `register` specifier was dropped (it is not valid C++17) and the
// function is inline because it is defined in a header. The produced values
// are unchanged.
inline size_t JSHash(const char* str)
{
    if (*str == '\0')
        return 0;

    size_t hash = 1315423911;
    for (; *str != '\0'; ++str)
    {
        const size_t ch = static_cast<size_t>(*str);
        hash ^= ((hash << 5) + ch + (hash >> 2));
    }

    return hash;
}



// Hash functor for string keys backed by BKDRHash.
// The key is passed as a C string, so hashing stops at the first embedded NUL.
// operator() is const so the functor can be invoked through a const object.
struct _HashFunc1
{
    size_t operator()(const std::string& key) const
    {
        return BKDRHash(key.c_str());
    }
};


// Hash functor for string keys backed by SDBMHash.
// The key is passed as a C string, so hashing stops at the first embedded NUL.
// operator() is const so the functor can be invoked through a const object.
struct _HashFunc2
{
    size_t operator()(const std::string& key) const
    {
        return SDBMHash(key.c_str());
    }
};


// Hash functor for string keys backed by RSHash.
// The key is passed as a C string, so hashing stops at the first embedded NUL.
// operator() is const so the functor can be invoked through a const object.
struct _HashFunc3
{
    size_t operator()(const std::string& key) const
    {
        return RSHash(key.c_str());
    }
};


// Hash functor for string keys backed by APHash.
// The key is passed as a C string, so hashing stops at the first embedded NUL.
// operator() is const so the functor can be invoked through a const object.
struct _HashFunc4
{
    size_t operator()(const std::string& key) const
    {
        return APHash(key.c_str());
    }
};


// Hash functor for string keys backed by JSHash.
// The key is passed as a C string, so hashing stops at the first embedded NUL.
// operator() is const so the functor can be invoked through a const object.
struct _HashFunc5
{
    size_t operator()(const std::string& key) const
    {
        return JSHash(key.c_str());
    }
};

#include "bitMap.h"
#include "common.h"

//雜湊函式的用途就是將其他型別的元素轉換為整型。
template<class K=string, class Hash1 = _HashFunc1, class Hash2 = _HashFunc1
,class Hash3 = _HashFunc3, class Hash4 = _HashFunc4, class Hash5 = _HashFunc5 >

class BloomFilter
{
public:

    BloomFilter(int size = 10)
    {
        _capacity = GetNextPrim(size);
        _bitmap = new BitMap(_capacity);  //設定點陣圖大小。
    }

    void Set(const K & key)
    {
        _bitmap->Set( Hash1()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        _bitmap->Set( Hash2()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        _bitmap->Set( Hash3()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        _bitmap->Set( Hash4()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        _bitmap->Set( Hash5()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
    }

    //布隆過濾器只要有一個雜湊函式計算到不在點陣圖中，這個元素就不在，相反如果計算後都在也有可能不在
    bool Find(const K & key)
    {
        if ( !_bitmap->Test(Hash1()(key) % _capacity))
        {
            return false;
        }
        if ( !_bitmap->Test(Hash2()(key) % _capacity))
        {
            return false;
        }
        if ( !_bitmap->Test(Hash3()(key) % _capacity))
        {
            return false;
        }
        if ( !_bitmap->Test(Hash4()(key) % _capacity))
        {
            return false;
        }
        if ( !_bitmap->Test(Hash5()(key) % _capacity))
        {
            return false;
        }
        return true;
    }


private:
    BitMap *_bitmap;
    int _capacity;       
};

布隆過濾器的優點：
布隆過濾器的儲存空間以及插入、查詢時間都是常數（時間為 O(k)，k 為雜湊函式的個數）。另外，各個 Hash 函式相互之間沒有關係，方便由硬體並行實現。布隆過濾器不需要儲存元素本身，在某些對保密要求非常嚴格的場合有優勢。
缺點：
誤算率是其中之一。隨著存入的元素數量增加，誤算率隨之增加
另外，一般情況下不能從布隆過濾器中刪除元素

如何擴充套件BloomFilter使得它支援刪除元素的操作？
因為布隆過濾器中的每個 key 對應多個位，而不同的 key 可能共用同一個位（衝突的概率比較大），直接把某個位清零有可能影響到其他元素，所以基本的布隆過濾器不支援刪除。如果要對其元素進行刪除，就不得不對每一個位進行引用計數：插入時把對應的計數加一，刪除時減一，計數為零才表示該位未被任何元素使用。

#include "common.h"

//雜湊函式的用途就是將其他型別的元素轉換為整型。
template<class K=string, class Hash1 = _HashFunc1, class Hash2 = _HashFunc1
    ,class Hash3 = _HashFunc3, class Hash4 = _HashFunc4, class Hash5 = _HashFunc5 >

class BloomFilterCount
{
public:

    BloomFilterCount(int size = 10)
    {
        _capacity = GetNextPrim(size);
        _refbm.resize(_capacity);
    }

    void Set(const K & key)
    {
        size_t hash1=( Hash1()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        size_t hash2=( Hash2()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        size_t hash3=( Hash3()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        size_t hash4=( Hash4()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。
        size_t hash5=( Hash5()(key) % _capacity ); //可能通過雜湊函式計算的整數超過了點陣圖的位數，所以對求模。

        _refbm[hash1]++;  //對這個位引用計數加加。
        _refbm[hash2]++;
        _refbm[hash3]++;
        _refbm[hash4]++;
        _refbm[hash5]++;
    }

    void ReSit(const K &key)
    {
        size_t hash1=( Hash1()(key) % _capacity );
        size_t hash2=( Hash2()(key) % _capacity );
        size_t hash3=( Hash3()(key) % _capacity );
        size_t hash4=( Hash4()(key) % _capacity );
        size_t hash5=( Hash5()(key) % _capacity );

        _refbm[hash1]--;  //對這個位引用計數減減。
        _refbm[hash2]--;
        _refbm[hash3]--;
        _refbm[hash4]--;
        _refbm[hash5]--;

    }


    //布隆過濾器只要有一個雜湊函式計算到不在點陣圖中，這個元素就不在
    bool Find(const K & key)
    {
        size_t hash1=( Hash1()(key) % _capacity );
        size_t hash2=( Hash2()(key) % _capacity );
        size_t hash3=( Hash3()(key) % _capacity );
        size_t hash4=( Hash4()(key) % _capacity );
        size_t hash5=( Hash5()(key) % _capacity );
        if (_refbm[hash1] <= 0)
            return false;
        if (_refbm[hash2] <= 0)
            return false;
        if (_refbm[hash3] <= 0)
            return false;
        if (_refbm[hash4] <= 0)
            return false;
        if (_refbm[hash5] <= 0)
            return false;
        return true;
    }


//我們都知道，點陣圖非常的節省空間，但由於每一位都要引入一個int,所以空間浪費還是比較嚴重的，
    //因此不得不放棄點陣圖了
private:
       vector<size_t> _refbm;
       size_t _capacity;
};

day25之布隆過濾器的實現和優缺點以及擴充套件

day25之布隆過濾器的實現和優缺點以及擴充套件

網路爬蟲：URL去重策略之布隆過濾器(BloomFilter)的使用

BloomFilter(布隆過濾器)原理和python支援庫

redis 5 HyperLogLog 布隆過濾器 GeoHash 和 scan

【實戰問題】-- 快取穿透之布隆過濾器（1）

使用MR編程hbase和hbase調優-布隆過濾器

布隆過濾器之Python+Redis

大量資料去重：Bitmap點陣圖演算法和布隆過濾器(Bloom Filter)

布隆過濾器go實現

基於redis 實現布隆過濾器

[原創]大資料:布隆過濾器C#版簡單實現。

布隆過濾器一致雜湊雜湊函式和雜湊表

布隆過濾器，原理+案例+程式碼實現

布隆過濾器（Bloom Filter）（給兩個檔案，分別有100億個字串，我們只要1g的記憶體，如何找到兩個檔案的交集？分別給出精確演算法和近似演算法？）

布隆過濾器和海量資料面試題

url去重 --布隆過濾器 bloom filter原理及python實現

以太坊：事件、日誌和布隆過濾器

布隆過濾器的原理、使用場景和注意事項

布隆過濾器和Hyperloglog基數統計的介紹

布隆過濾器（Bloom Filter）的簡單實現

day25之布隆過濾器的實現和優缺點以及擴充套件

相關推薦