c++實現布隆過濾器
阿新 • • 發佈:2021-01-13
hashfunction.h(常用字串Hash函式)
// Common string hash functions.
// Each one takes a NUL-terminated C string and returns an unsigned hash value
// whose top bit has been masked off (result & 0x7FFFFFFF).

// SDBM hash.
unsigned int SDBMHash(const char *str);

// RS hash.
unsigned int RSHash(const char *str);

// JS hash.
unsigned int JSHash(const char *str);

// P. J. Weinberger hash.
unsigned int PJWHash(const char *str);

// AP hash.
unsigned int APHash(const char *str);

// DJB hash.
unsigned int DJBHash(const char *str);

// ELF hash.
unsigned int ELFHash(const char *str);

// BKDR hash.
unsigned int BKDRHash(const char *str);
hashfunction.cpp
#include "hashfunction.h"

// Every function in this file walks a NUL-terminated string one char at a
// time and returns the accumulated hash with the top bit cleared
// (hash & 0x7FFFFFFF).

// SDBM Hash Function
unsigned int SDBMHash(const char *str)
{
    unsigned int hash = 0;

    while (*str)
    {
        // equivalent to: hash = 65599*hash + (*str++);
        hash = (*str++) + (hash << 6) + (hash << 16) - hash;
    }

    return (hash & 0x7FFFFFFF);
}

// RS Hash Function
unsigned int RSHash(const char *str)
{
    unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;

    while (*str)
    {
        hash = hash * a + (*str++);
        a *= b;  // the multiplier changes for every character
    }

    return (hash & 0x7FFFFFFF);
}

// JS Hash Function
unsigned int JSHash(const char *str)
{
    unsigned int hash = 1315423911;

    while (*str)
    {
        hash ^= ((hash << 5) + (*str++) + (hash >> 2));
    }

    return (hash & 0x7FFFFFFF);
}

// P. J. Weinberger Hash Function
unsigned int PJWHash(const char *str)
{
    unsigned int BitsInUnsignedInt = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    unsigned int ThreeQuarters = static_cast<unsigned int>((BitsInUnsignedInt * 3) / 4);
    unsigned int OneEighth = static_cast<unsigned int>(BitsInUnsignedInt / 8);
    // Mask selecting the top OneEighth bits of an unsigned int.
    unsigned int HighBits = static_cast<unsigned int>(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
    unsigned int hash = 0;
    unsigned int test = 0;

    while (*str)
    {
        hash = (hash << OneEighth) + (*str++);
        if ((test = hash & HighBits) != 0)
        {
            // Fold the high bits back into the low part, then clear them.
            hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
        }
    }

    return (hash & 0x7FFFFFFF);
}

// AP Hash Function
unsigned int APHash(const char *str)
{
    unsigned int hash = 0;
    int i;

    // Characters at even and odd positions are mixed with different formulas.
    for (i = 0; *str; i++)
    {
        if ((i & 1) == 0)
        {
            hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
        }
        else
        {
            hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
        }
    }

    return (hash & 0x7FFFFFFF);
}

// DJB Hash Function
unsigned int DJBHash(const char *str)
{
    unsigned int hash = 5381;

    while (*str)
    {
        hash += (hash << 5) + (*str++);
    }

    return (hash & 0x7FFFFFFF);
}

// ELF Hash Function
unsigned int ELFHash(const char *str)
{
    unsigned int hash = 0;
    unsigned int x = 0;

    while (*str)
    {
        hash = (hash << 4) + (*str++);
        if ((x = hash & 0xF0000000U) != 0)
        {
            hash ^= (x >> 24);
            hash &= ~x;
        }
    }

    return (hash & 0x7FFFFFFF);
}

// BKDR Hash Function
unsigned int BKDRHash(const char *str)
{
    unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
    unsigned int hash = 0;

    while (*str)
    {
        hash = hash * seed + (*str++);
    }

    return (hash & 0x7FFFFFFF);
}
bloomfilter.h
#ifndef BLOOMFILTER_H #define BLOOMFILTER_H #include <stdio.h> #include <iostream> #include <string> #include <math.h> #include <vector> #include <fstream> #include "hashfunction.h" #define INT_SIZE 32 #define BUFFER_SIZE 1024 class BloomFilter{ public: //樣本個數n,期望的失誤率p,傳入樣本的文件路徑 BloomFilter(double errRate,int sampleNum); ~BloomFilter(); void filterInit(); //初始化布隆過濾器 void arrayGenerate(std::string path); //開啟path路徑的文件,計算每一行樣本到array中 bool isContain(const char* str); //檢視字串是否在樣本中存在 void storeArray(std::string path); //把array儲存在指定路徑檔案path裡 void restoreArray(std::string path); //把array從指定路徑檔案path裡恢復載入 int getHashFunNum(); //返回需要的雜湊函式的個數k int getIntNum(); //返回需要的記憶體的int數 private: int hashtableInit(); //把幾個雜湊函式加入到hastable中 std::string path; //傳入樣本的文件路徑 double errRate; //樣本失誤率p int sampleNum; //樣本個數n int bitNum; //需要的二進位制位數m int intNum; //需要申請記憶體的int數 int hashFunNum; //需要的雜湊函式的個數k,注意計算得到的雜湊函式個數k應該<=hashtable.size(); int *array; //記憶體 std::vector<unsigned int (*)(const char*)> hashtable; //存放計算字串雜湊值的雜湊函式 }; #endif // BLOOMFILTER_H
bloomfilter.cpp
#include "bloomfilter.h"
// Sizes the filter for `sampleNum` expected samples (n) at false-positive
// rate `errRate` (p):
//   bits   m = -n * ln(p) / (ln 2)^2
//   hashes k = 0.7 * (m / n)        (integer division, truncated)
// and allocates a zero-filled int array large enough to hold m bits.
// Out-of-range arguments (n <= 0, p outside (0,1)) do not crash: the bit
// count is clamped to a valid range and k is forced to at least 1.
BloomFilter::BloomFilter(double errRate,int sampleNum)
{
    this->errRate=errRate;
    this->sampleNum=sampleNum;

    const double ln2=log(2.0);
    double bits=-((this->sampleNum*log(this->errRate))/(ln2*ln2));
    // Upper bound keeps intNum*INT_SIZE representable in a 32-bit int.
    const double maxBits=2147483647.0-INT_SIZE;
    if(!(bits>=1.0))        // written this way so NaN is also caught
        bits=1.0;
    else if(bits>maxBits)
        bits=maxBits;
    this->bitNum=static_cast<int>(bits);

    // Guard the integer division against sampleNum <= 0.
    this->hashFunNum=(this->sampleNum>0)
        ? static_cast<int>(0.7*(this->bitNum/this->sampleNum))
        : 1;
    // With zero hash functions every lookup would report "present".
    if(this->hashFunNum<1)
        this->hashFunNum=1;

    this->intNum=this->bitNum/INT_SIZE+1;
    // The trailing () value-initialises the array: all bits start cleared.
    array=new int[this->intNum]();
}
// Frees the int array that was allocated with new[] in the constructor.
BloomFilter::~BloomFilter()
{
    delete[] this->array;
}
// Registers the hash functions and checks that enough of them exist for the
// k computed in the constructor.
// If k is larger than the number of registered functions, a warning is
// printed and k is lowered to the number available, so that arrayGenerate()
// and isContain() never index past the end of hashtable. The filter then
// runs with fewer hash functions than the requested error rate called for.
void BloomFilter::filterInit()
{
    const int available=hashtableInit();
    if(this->hashFunNum>available)
    {
        std::cout<<"雜湊函式不足,請新增"<<std::endl;
        this->hashFunNum=available;
    }
}
// Fills hashtable with the eight string hash functions and returns how many
// were registered.
// The table is replaced rather than appended to, so calling this more than
// once does not register duplicates.
// The order is significant: the first k entries are the ones used to set and
// test bits, so reordering invalidates previously stored arrays.
int BloomFilter::hashtableInit()
{
    static unsigned int (* const functions[])(const char*)={
        PJWHash,
        JSHash,
        RSHash,
        SDBMHash,
        APHash,
        DJBHash,
        BKDRHash,
        ELFHash
    };
    const int count=static_cast<int>(sizeof(functions)/sizeof(functions[0]));

    hashtable.assign(functions,functions+count);
    return static_cast<int>(hashtable.size());
}
// Reads the text file at `path` line by line and, for every line, sets the
// bits selected by the first hashFunNum hash functions.
// Prints "open file error" via perror and returns if the file cannot be
// opened. The file is opened read-only; it closes when `fs` goes out of scope.
void BloomFilter::arrayGenerate(std::string path)
{
    std::ifstream fs(path.c_str());
    if(!fs)
    {
        perror("open file error");
        return;
    }

    const int totalBits=this->intNum*INT_SIZE;
    std::string line;
    // Looping on getline's own result processes a final line that has no
    // trailing newline and terminates on any read failure; std::string
    // removes the fixed line-length limit.
    while(std::getline(fs,line))
    {
        for(int i=0;i!=this->hashFunNum;i++)
        {
            int hashval=hashtable[i](line.c_str());
            hashval=hashval%totalBits;
            const int word=hashval/INT_SIZE;
            // Build the mask as unsigned so that bit 31 is not produced by
            // shifting a signed 1 into the sign bit.
            const unsigned int mask=1u<<(hashval%INT_SIZE);
            this->array[word]=static_cast<int>(static_cast<unsigned int>(this->array[word])|mask);
        }
    }
}
// Tests whether `str` may be in the sample set.
// Returns false as soon as one of the first hashFunNum hash functions selects
// a bit that is not set; returns true if every selected bit is set.
// A null `str` is reported as not contained.
bool BloomFilter::isContain(const char* str)
{
    if(!str)
        return false;

    const int totalBits=this->intNum*INT_SIZE;
    for(int i=0;i!=this->hashFunNum;i++)
    {
        int hashval=hashtable[i](str);
        hashval=hashval%totalBits;
        // Unsigned mask: avoids shifting a signed 1 into the sign bit.
        const unsigned int word=static_cast<unsigned int>(array[hashval/INT_SIZE]);
        const unsigned int mask=1u<<(hashval%INT_SIZE);
        if((word&mask)==0)
            return false;
    }
    return true;
}
// Writes the int array to the file at `path`, one decimal value per line.
// The file is created if missing and truncated if it already exists, so no
// lines from an earlier, longer dump are left behind.
// Prints "open file error" via perror and returns if the file cannot be
// opened. The stream is flushed and closed when `fs` goes out of scope.
void BloomFilter::storeArray(std::string path)
{
    std::ofstream fs(path.c_str(),std::ios::out|std::ios::trunc);
    if(!fs)
    {
        perror("open file error");
        return;
    }

    for(int k=0;k<this->intNum;k++)
        fs<<array[k]<<'\n';     // '\n' instead of std::endl: no flush per line
}
// Loads the int array from the file at `path`, which holds one decimal value
// per line (the format storeArray writes).
// At most intNum values are read, so a file with extra lines cannot write
// past the end of the array. Reading stops early at end of file or at the
// first token that is not an int; elements beyond that point are left
// unchanged.
// Prints "open file error" via perror and returns if the file cannot be
// opened.
void BloomFilter::restoreArray(std::string path)
{
    std::ifstream fs(path.c_str());
    if(!fs)
    {
        perror("open file error");
        return;
    }

    int value=0;
    for(int i=0;i<this->intNum&&(fs>>value);i++)
        this->array[i]=value;
}
int BloomFilter::getHashFunNum()
{
return this->hashFunNum;
}
int BloomFilter::getIntNum()
{
return this->intNum;
}
測試:
main.cpp
載入檔案生成array並儲存:
#include "tool/bloomfilter.h"
using namespace std;
// Demo: build a filter from the sample file, print its dimensions, probe two
// host names, then save the bit array to disk.
int main(int argc, char *argv[])
{
    // Filter sized for 100 samples at a 1% expected error rate.
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.arrayGenerate("../tool/redlist.txt");

    const int intCount=mybloom.getIntNum();
    const int hashCount=mybloom.getHashFunNum();
    std::cout<<"intNum:"<<intCount<<",hashFunNum:"<<hashCount<<std::endl;

    const char *dubaiAnswer=mybloom.isContain("www.dubai.com")?"在":"不在";
    std::cout<<"www.dubai.com在我的集合中嗎:"<<dubaiAnswer<<std::endl;

    const char *qqAnswer=mybloom.isContain("www.qq.com")?"在":"不在";
    std::cout<<"www.qq.com在我的集合中嗎:"<<qqAnswer<<std::endl;

    mybloom.storeArray("../tool/bloom_array.txt");
    return 0;
}
從檔案中讀取恢復array:
#include "tool/bloomfilter.h"
using namespace std;
// Demo: reload a previously stored bit array and probe two host names.
int main(int argc, char *argv[])
{
    // Must use the same parameters as the filter that produced the file.
    BloomFilter mybloom(0.01,100);
    mybloom.filterInit();
    mybloom.restoreArray("../tool/bloom_array.txt");

    const char *dubaiAnswer=mybloom.isContain("www.dubai.com")?"在":"不在";
    std::cout<<"www.dubai.com在我的集合中嗎:"<<dubaiAnswer<<std::endl;

    const char *qqAnswer=mybloom.isContain("www.qq.com")?"在":"不在";
    std::cout<<"www.qq.com在我的集合中嗎:"<<qqAnswer<<std::endl;

    return 0;
}