1. 程式人生 > 其它 >c++實現布隆過濾器

c++實現布隆過濾器

技術標籤:redis學習系列、c++、hash

hashfunction.h(常用字串Hash函式)

// Common string hash functions.
// Each takes a NUL-terminated C string and returns a hash value whose top bit
// is cleared (every implementation masks its result with 0x7FFFFFFF).
unsigned int SDBMHash(const char *str);
unsigned int RSHash(const char *str);
unsigned int JSHash(const char *str);
unsigned int PJWHash(const char *str);
unsigned int APHash(const char *str);
unsigned int DJBHash(const char *str);
unsigned int ELFHash(const char *str);
unsigned int BKDRHash(const char *str);

hashfunction.cpp

#include "hashfunction.h"

// SDBM hash of a NUL-terminated string.
// Returns the accumulated value with the top bit cleared.
unsigned int SDBMHash(const char *str)
{
    unsigned int acc = 0;
    for (const char *p = str; *p != '\0'; ++p)
    {
        // Shift-and-subtract form of: acc = 65599 * acc + c
        acc = *p + (acc << 6) + (acc << 16) - acc;
    }
    return acc & 0x7FFFFFFF;
}

// RS hash of a NUL-terminated string.
// The multiplier itself is re-multiplied by a fixed factor after every
// character. Returns the accumulated value with the top bit cleared.
unsigned int RSHash(const char *str)
{
    const unsigned int factor = 378551;
    unsigned int multiplier = 63689;
    unsigned int acc = 0;
    for (; *str != '\0'; ++str)
    {
        acc = acc * multiplier + *str;
        multiplier *= factor;
    }
    return acc & 0x7FFFFFFF;
}

// JS hash of a NUL-terminated string.
// Starts from a fixed seed and XORs in a shift/add mix for every character.
// Returns the accumulated value with the top bit cleared.
unsigned int JSHash(const char *str)
{
    unsigned int acc = 1315423911;
    for (const char *p = str; *p != '\0'; ++p)
    {
        const unsigned int mix = (acc << 5) + *p + (acc >> 2);
        acc ^= mix;
    }
    return acc & 0x7FFFFFFF;
}

// P. J. Weinberger hash of a NUL-terminated string.
// Shifts the accumulator left by one eighth of the word width per character;
// whenever bits reach the top eighth of the word they are folded back into
// the lower part and cleared. Returns the value with the top bit cleared.
unsigned int PJWHash(const char *str)
{
    const unsigned int wordBits  = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    const unsigned int foldShift = (wordBits * 3) / 4;
    const unsigned int stepShift = wordBits / 8;
    const unsigned int topMask   = 0xFFFFFFFFu << (wordBits - stepShift);

    unsigned int acc = 0;
    for (; *str != '\0'; ++str)
    {
        acc = (acc << stepShift) + *str;
        const unsigned int overflow = acc & topMask;
        if (overflow != 0)
        {
            acc = (acc ^ (overflow >> foldShift)) & ~topMask;
        }
    }
    return acc & 0x7FFFFFFF;
}

// AP hash of a NUL-terminated string.
// Alternates between two mixing steps: characters at even positions use the
// plain mix, characters at odd positions use the bit-inverted mix.
// Returns the accumulated value with the top bit cleared.
unsigned int APHash(const char *str)
{
    unsigned int acc = 0;
    bool evenPosition = true;
    for (; *str != '\0'; ++str)
    {
        if (evenPosition)
        {
            acc ^= ((acc << 7) ^ *str ^ (acc >> 3));
        }
        else
        {
            acc ^= ~((acc << 11) ^ *str ^ (acc >> 5));
        }
        evenPosition = !evenPosition;
    }
    return acc & 0x7FFFFFFF;
}

// DJB hash of a NUL-terminated string.
// Starts at 5381 and multiplies by 33 before adding each character.
// Returns the accumulated value with the top bit cleared.
unsigned int DJBHash(const char *str)
{
    unsigned int acc = 5381;
    for (const char *p = str; *p != '\0'; ++p)
    {
        // acc + (acc << 5) is acc * 33 in wrapping unsigned arithmetic.
        acc = acc * 33 + *p;
    }
    return acc & 0x7FFFFFFF;
}

// ELF hash of a NUL-terminated string.
// Shifts the accumulator left by four bits per character; when the top
// nibble becomes non-zero it is XORed back in (shifted down by 24) and then
// cleared. Returns the accumulated value with the top bit cleared.
unsigned int ELFHash(const char *str)
{
    unsigned int acc = 0;
    for (; *str != '\0'; ++str)
    {
        acc = (acc << 4) + *str;
        const unsigned int topNibble = acc & 0xF0000000u;
        if (topNibble != 0)
        {
            acc ^= (topNibble >> 24);
            acc &= ~topNibble;
        }
    }
    return acc & 0x7FFFFFFF;
}

// BKDR hash of a NUL-terminated string.
// Polynomial accumulation with multiplier 131 (other customary choices:
// 31, 1313, 13131, 131313, ...).
// Returns the accumulated value with the top bit cleared.
unsigned int BKDRHash(const char *str)
{
    const unsigned int multiplier = 131;
    unsigned int acc = 0;
    for (const char *p = str; *p != '\0'; ++p)
    {
        acc = acc * multiplier + *p;
    }
    return acc & 0x7FFFFFFF;
}

bloomfilter.h

#ifndef BLOOMFILTER_H
#define BLOOMFILTER_H

#include <stdio.h>
#include <iostream>
#include <string>
#include <math.h>
#include <vector>
#include <fstream>
#include "hashfunction.h"

#define INT_SIZE 32
#define BUFFER_SIZE 1024

// Bloom filter over C strings, backed by a heap-allocated array of ints that
// is used as a bit set. The filter is sized from an expected false-positive
// rate and an expected number of samples.
//
// The object owns the raw `array` buffer and frees it in the destructor, so
// copying is disabled (a copy would free the same buffer twice).
class BloomFilter{
public:
    // errRate: expected false-positive rate p; sampleNum: number of samples n.
    BloomFilter(double errRate,int sampleNum);
    ~BloomFilter();
    void filterInit();                      // initialise the filter (registers the hash functions)
    void arrayGenerate(std::string path);   // open the file at path and record every line in array
    bool isContain(const char* str);        // test whether str may be one of the recorded samples
    void storeArray(std::string path);      // write array to the file at path
    void restoreArray(std::string path);    // reload array from the file at path
    int getHashFunNum();                    // number of hash functions in use, k
    int getIntNum();                        // number of ints allocated for the bit array
private:
    // Declared but intentionally not defined: the class is non-copyable.
    BloomFilter(const BloomFilter&);
    BloomFilter& operator=(const BloomFilter&);

    int hashtableInit();    // append the available hash functions to hashtable
    std::string path;       // path of the sample file (not used by the member functions shown here)
    double errRate;         // expected false-positive rate p
    int sampleNum;          // number of samples n
    int bitNum;             // number of bits required, m
    int intNum;             // number of ints allocated to hold the bits
    int hashFunNum;         // number of hash functions used, k; must be <= hashtable.size()
    int *array;             // bit storage, owned by this object
    std::vector<unsigned int (*)(const char*)> hashtable;    // hash functions applied to each string
};

#endif // BLOOMFILTER_H

bloomfilter.cpp

#include "bloomfilter.h"

// Sizes the filter from the expected false-positive rate and sample count.
//   bitNum     = -(n * ln p) / (ln 2)^2, truncated to int
//   hashFunNum = 0.7 * (bitNum / sampleNum), using integer division, truncated
//   intNum     = bitNum / INT_SIZE + 1
// The bit array is allocated here and starts with every bit cleared.
BloomFilter::BloomFilter(double errRate,int sampleNum)
{
    this->errRate=errRate;
    this->sampleNum=sampleNum;
    this->bitNum=-((this->sampleNum*log(this->errRate))/(log(2)*log(2)));
    this->hashFunNum=0.7*(this->bitNum/this->sampleNum);
    this->intNum=this->bitNum/INT_SIZE+1;
    // The trailing () value-initialises the array to zero. Without it the
    // bits are indeterminate and isContain() would report arbitrary results.
    array=new int[this->intNum]();
}

// Frees the bit array allocated by the constructor.
BloomFilter::~BloomFilter()
{
    delete[] this->array;
}

// Registers the hash functions and checks that enough of them are available
// for the computed hashFunNum. If there are too few, a message is printed and
// hashFunNum is reduced to the number available, so that later loops over
// hashtable[0 .. hashFunNum) never index past the end of the table.
void BloomFilter::filterInit()
{
    hashtableInit();
    const int available=static_cast<int>(hashtable.size());
    if(this->hashFunNum>available)
    {
        std::cout<<"雜湊函式不足,請新增"<<std::endl;
        this->hashFunNum=available;
    }
}

// Appends the eight string hash functions to hashtable, in a fixed order,
// and returns the resulting table size.
int BloomFilter::hashtableInit()
{
    typedef unsigned int (*HashFn)(const char*);
    static const HashFn builtin[]={
        PJWHash, JSHash, RSHash, SDBMHash,
        APHash, DJBHash, BKDRHash, ELFHash
    };
    const size_t count=sizeof(builtin)/sizeof(builtin[0]);
    for(size_t idx=0;idx<count;++idx)
    {
        hashtable.push_back(builtin[idx]);
    }
    return hashtable.size();
}

// Reads the file at `path` line by line and, for every line, sets the bit
// selected by each of the first hashFunNum hash functions.
// Prints an error via perror and returns without changes if the file cannot
// be opened.
void BloomFilter::arrayGenerate(std::string path)
{
    // Read-only stream: the sample file is never written, so it does not
    // need to be writable.
    std::ifstream fs(path.c_str());
    if(!fs)
    {
        perror("open file error");
        return;
    }
    const int totalBits=this->intNum*INT_SIZE;
    std::string line;
    // Looping on the result of getline (rather than on eof()) also processes
    // a final line that lacks a trailing newline, and std::string removes the
    // fixed line-length limit that made the stream fail without reaching EOF.
    while(std::getline(fs,line))
    {
        for(int i=0;i!=this->hashFunNum;i++)
        {
            int hashval=hashtable[i](line.c_str());
            hashval=hashval%totalBits;
            this->array[hashval/INT_SIZE]|=(0x1<<(hashval%INT_SIZE));
        }
    }
}

bool BloomFilter::isContain(const char* str)
{
    int hashval;
    for(int i=0;i!=this->hashFunNum;i++)
    {
        hashval=hashtable[i](str);
        hashval=hashval%(this->intNum*INT_SIZE);
        if(array[hashval/INT_SIZE]&(0x1<<(hashval%INT_SIZE)))
            continue;
        else
            return false;
    }
    return true;
}

// Writes the bit array to the file at `path`, one decimal int per line.
// The file is created if missing and truncated if it already exists, so no
// bytes from an earlier, longer dump can survive after the new content.
// Prints an error via perror and returns if the file cannot be opened.
void BloomFilter::storeArray(std::string path)
{
    std::ofstream fs(path.c_str(),std::ios::out|std::ios::trunc);
    if(!fs)
    {
        perror("open file error");
        return;
    }
    for(int k=0;k<this->intNum;k++)
        fs<<array[k]<<'\n';
    // The stream is flushed and closed by its destructor.
}

// Reloads the bit array from the file at `path`, which is expected to hold
// whitespace-separated decimal ints (the format written by storeArray).
// At most intNum values are read, so a file with more entries than the
// array has slots cannot write past the end of the buffer. If the file has
// fewer entries, the remaining slots keep their current contents.
// Prints an error via perror and returns if the file cannot be opened.
void BloomFilter::restoreArray(std::string path)
{
    std::ifstream fs(path.c_str());
    if(!fs)
    {
        perror("open file error");
        return;
    }
    int value=0;
    int i=0;
    while(i<this->intNum && (fs>>value))
    {
        this->array[i++]=value;
    }
}

int BloomFilter::getHashFunNum()
{
    return this->hashFunNum;
}

int BloomFilter::getIntNum()
{
    return this->intNum;
}

測試:

main.cpp

載入檔案生成array並儲存:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.arrayGenerate("../tool/redlist.txt");
    cout<<"intNum:"<<mybloom.getIntNum()<<",hashFunNum:"<<mybloom.getHashFunNum()<<endl;
    cout<<"www.dubai.com在我的集合中嗎:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中嗎:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    mybloom.storeArray("../tool/bloom_array.txt");
    return 0;
}

從檔案中讀取恢復array:

#include "tool/bloomfilter.h"
using namespace std;

int main(int argc, char *argv[])
{
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.restoreArray("../tool/bloom_array.txt");
    cout<<"www.dubai.com在我的集合中嗎:"<<(mybloom.isContain("www.dubai.com")?"在":"不在")<<endl;
    cout<<"www.qq.com在我的集合中嗎:"<<(mybloom.isContain("www.qq.com")?"在":"不在")<<endl;
    return 0;
}