BloomFilter布隆過濾器

阿新 • • 發佈：2019-01-20

1、簡介：
BloomFilter是一個很長的二進位制向量和一系列隨機對映函式。布隆過濾器可以用於檢索一個元素是否在一個集合中。它的優點是空間效率和查詢時間都遠遠超過一般的演算法，缺點是有一定的誤識別率和刪除困難。
2、應用：
要判斷一個元素是否在一個集合中出現，一般情況下就是將這個集合的元素儲存下來，然後再到這個集合中一一比較即可，但是如果這個集合中的元素很多的話，不僅需要的記憶體很大，而且查詢起來也比較慢。
為了提高效率我們可以採用hash表，並且將集合中的元素都對映到bitmap中的一個位上，這樣的話就會節省空間和查詢的時間。但是由於雜湊衝突的原因，我們有可能會產生誤判，即不同的元素經過雜湊函式之後可能產生同一個地址。
為了降低誤判率，我們可以將一個元素經過多個雜湊函式，對映到多個位上：如果這幾個位都為1，我們就認為這個元素可能存在；但是只要有一個位為0，那麼這個元素就一定不存在。所以對於布隆過濾器來說，判定「存在」可能是誤判，而判定「不存在」則一定是準確的。
3、布隆過濾器的刪除
布隆過濾器是用幾個位來表示一個資料的，如果要刪除該資料，就必須將該資料對應的每一位置0；但這些位可能同時被別的資料共用，直接置0會使那些資料被誤判為不存在。為此，我們可以用引用計數的方式來記錄每個位置被標記的次數：插入時計數加1，刪除時計數減1，只有計數減到0時才把該位置視為未標記。

#pragma once
#include <iostream>
#include <string>
using namespace std;
#include "Bitmap.h"

/// BKDR string hash functor (multiplier 131).
struct __HashFunc1
{
    /// Hashes the NUL-terminated byte string `str`.
    /// @param str  pointer to a NUL-terminated string; must not be null.
    /// @return     the accumulated hash; 0 for the empty string.
    size_t BKDRHash(const char *str) const
    {
        size_t hash = 0;
        for (; *str != '\0'; ++str)
        {
            // Read the byte as unsigned char: a plain (possibly signed) char
            // would sign-extend bytes >= 0x80 and make the hash depend on the
            // platform's char signedness.
            hash = hash * 131 + static_cast<unsigned char>(*str);
        }
        return hash;
    }

    /// Hashes `s` up to its first NUL byte (the string is read via c_str()).
    size_t operator()(const std::string& s) const
    {
        return BKDRHash(s.c_str());
    }
};

/// SDBM string hash functor (multiplier 65599).
struct __HashFunc2
{
    /// Hashes the NUL-terminated byte string `str`.
    /// @param str  pointer to a NUL-terminated string; must not be null.
    /// @return     the accumulated hash; 0 for the empty string.
    size_t SDBMHash(const char *str) const
    {
        size_t hash = 0;
        for (; *str != '\0'; ++str)
        {
            // Bytes are read as unsigned char so values >= 0x80 are not
            // sign-extended on platforms where char is signed.
            // Equivalent shift form: ch + (hash << 6) + (hash << 16) - hash.
            hash = 65599 * hash + static_cast<unsigned char>(*str);
        }
        return hash;
    }

    /// Hashes `s` up to its first NUL byte (the string is read via c_str()).
    size_t operator()(const std::string& s) const
    {
        return SDBMHash(s.c_str());
    }
};

/// RS string hash functor: the multiplier itself changes on every byte.
struct __HashFunc3
{
    /// Hashes the NUL-terminated byte string `str`.
    /// @param str  pointer to a NUL-terminated string; must not be null.
    /// @return     the accumulated hash; 0 for the empty string.
    size_t RSHash(const char *str) const
    {
        size_t hash = 0;
        size_t magic = 63689;
        for (; *str != '\0'; ++str)
        {
            // Bytes are read as unsigned char so values >= 0x80 are not
            // sign-extended on platforms where char is signed.
            hash = hash * magic + static_cast<unsigned char>(*str);
            // Unsigned arithmetic wraps modulo 2^N, so overflow here is well defined.
            magic *= 378551;
        }
        return hash;
    }

    /// Hashes `s` up to its first NUL byte (the string is read via c_str()).
    size_t operator()(const std::string& s) const
    {
        return RSHash(s.c_str());
    }
};

/// AP string hash functor: alternates two mixing steps on even/odd byte positions.
struct __HashFunc4
{
    /// Hashes the NUL-terminated byte string `str`.
    /// @param str  pointer to a NUL-terminated string; must not be null.
    /// @return     the accumulated hash; 0 for the empty string.
    size_t APHash(const char *str) const
    {
        size_t hash = 0;
        bool evenPosition = true;  // position 0 is even
        for (; *str != '\0'; ++str, evenPosition = !evenPosition)
        {
            // Bytes are read as unsigned char so values >= 0x80 are not
            // sign-extended on platforms where char is signed.
            const size_t ch = static_cast<unsigned char>(*str);
            if (evenPosition)
            {
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
            }
            else
            {
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
            }
        }
        return hash;
    }

    /// Hashes `s` up to its first NUL byte (the string is read via c_str()).
    size_t operator()(const std::string& s) const
    {
        return APHash(s.c_str());
    }
};

/// JS string hash functor (seed 1315423911).
struct __HashFunc5
{
    /// Hashes the NUL-terminated byte string `str`.
    /// @param str  pointer to a NUL-terminated string; must not be null.
    /// @return     the accumulated hash; 0 for the empty string.
    size_t JSHash(const char *str) const
    {
        // Added by the original author so that the empty string hashes to 0
        // instead of the seed value.
        if (*str == '\0')
            return 0;

        size_t hash = 1315423911;
        for (; *str != '\0'; ++str)
        {
            // Bytes are read as unsigned char so values >= 0x80 are not
            // sign-extended on platforms where char is signed.
            hash ^= ((hash << 5) + static_cast<unsigned char>(*str) + (hash >> 2));
        }
        return hash;
    }

    /// Hashes `s` up to its first NUL byte (the string is read via c_str()).
    size_t operator()(const std::string& s) const
    {
        return JSHash(s.c_str());
    }
};




/// Bloom filter over keys of type K.
///
/// Each key is run through five hash functors; every hash value, reduced
/// modulo the filter's bit count, selects one position in the underlying
/// BitSet. Set() marks all five positions, Test() reports whether all five
/// are marked.
///
/// NOTE(review): BitSet comes from "Bitmap.h", which is not visible here; it
/// is assumed that BitSet(size) provides at least `size` addressable bits and
/// that Set(i)/Test(i) mark/query bit i — confirm against that header.
template<class K = std::string,
class HashFunc1 = __HashFunc1,
class HashFunc2 = __HashFunc2,
class HashFunc3 = __HashFunc3,
class HashFunc4 = __HashFunc4,
class HashFunc5 = __HashFunc5>
class BloomFilter
{
public:
    /// @param n  number of keys the filter is sized for; 10 bit positions are
    ///           reserved per key. n == 0 yields a one-position filter rather
    ///           than a zero range.
    BloomFilter(size_t n)
        :_bs(BitCount(n))
        , _range(BitCount(n))
    {}

    /// Records `key` by marking the position selected by each hash functor.
    void Set(const K& key)
    {
        _bs.Set(HashFunc1()(key) % _range);
        _bs.Set(HashFunc2()(key) % _range);
        _bs.Set(HashFunc3()(key) % _range);
        _bs.Set(HashFunc4()(key) % _range);
        _bs.Set(HashFunc5()(key) % _range);
    }

    /// Queries `key`.
    /// @return false if any of the five positions is unmarked (the key was
    ///         never Set); true if all are marked, which may be a false
    ///         positive caused by other keys sharing those positions.
    bool Test(const K& key)
    {
        // Hashes are evaluated lazily: the first unmarked position ends the check.
        if (!_bs.Test(HashFunc1()(key) % _range))
            return false;
        if (!_bs.Test(HashFunc2()(key) % _range))
            return false;
        if (!_bs.Test(HashFunc3()(key) % _range))
            return false;
        if (!_bs.Test(HashFunc4()(key) % _range))
            return false;
        if (!_bs.Test(HashFunc5()(key) % _range))
            return false;

        return true; // may be a false positive
    }

protected:
    /// Number of bit positions used for `n` keys. Never returns 0, because the
    /// result is used as a modulus in Set() and Test().
    static size_t BitCount(size_t n)
    {
        return n == 0 ? 1 : 10 * n;
    }

    BitSet _bs;      // bit storage indexed by hash % _range
    size_t _range;   // number of bit positions; always >= 1
};


/// Demo driver: inserts a handful of strings into a BloomFilter sized for 100
/// keys and prints the Test() result (as 0/1) for inserted and non-inserted keys.
///
/// Declared `inline` because this function is defined in a header; without it,
/// including the header from more than one translation unit produces
/// duplicate-definition link errors.
inline void TestBloomFilter()
{
    BloomFilter<> bf(100);
    const std::string url1 = "http://www.cplusplus.com/1";
    const std::string url2 = "http://www.cplusplus.com/2";
    const std::string url3 = "http://www.cplusplus.com/3";
    const std::string url4 = "http://www.cplusplus.com/4";
    const std::string url5 = "http://www.cplusplus.com/5";
    const std::string url6 = "http://www.cplusplus.com/6";  // never inserted

    bf.Set(url1);
    bf.Set(url2);
    bf.Set(url3);
    bf.Set(url4);
    bf.Set(url5);
    bf.Set("bitset");
    bf.Set("peter");
    bf.Set("pxxxxxxxxxx");

    // Inserted keys must report 1; url6 and "peter son" were not inserted, so
    // they report 0 unless they happen to be false positives.
    std::cout << "url1?" << bf.Test(url1) << std::endl;
    std::cout << "url2?" << bf.Test(url2) << std::endl;
    std::cout << "url3?" << bf.Test(url3) << std::endl;
    std::cout << "url4?" << bf.Test(url4) << std::endl;
    std::cout << "url5?" << bf.Test(url5) << std::endl;
    std::cout << "url6?" << bf.Test(url6) << std::endl;
    std::cout << "peter?" << bf.Test("peter") << std::endl;
    std::cout << "peter son?" << bf.Test("peter son") << std::endl;
}

BloomFilter布隆過濾器

第三百五十八節，Python分布式爬蟲打造搜索引擎Scrapy精講—將bloomfilter(布隆過濾器)集成到scrapy-redis中

BloomFilter(布隆過濾器)

BloomFilter布隆過濾器的使用

BloomFilter(布隆過濾器)原理和python支援庫

Java基礎知識總結--BloomFilter(布隆過濾器)

BloomFilter布隆過濾器

BloomFilter布隆過濾器的java實現

BloomFilter(布隆過濾器)的C#實現

BloomFilter布隆過濾器使用

BloomFilter 布隆過濾器

【資料結構】點陣圖BitMap與布隆過濾器BloomFilter

java實現去重布隆過濾器(BloomFilter)

網路爬蟲：URL去重策略之布隆過濾器(BloomFilter)的使用

布隆過濾器（BloomFilter）

【淺析】|白話布隆過濾器BloomFilter

Bloom Filter布隆過濾器

Bloom filter(布隆過濾器)概念與原理

使用MR編程hbase和hbase調優-布隆過濾器

布隆過濾器的方式解決緩存穿透問題

布隆過濾器之Python+Redis

BloomFilter布隆過濾器

相關推薦