1. 程式人生 > >Python實現布隆過濾器

Python實現布隆過濾器

# -*- coding: utf-8 -*-
import BitVector
import os
import sys

class SimpleHash(object):
    """Seeded polynomial string hash that maps a value to a bit index.

    ``capability`` is the size of the bit vector the index is used with and
    ``seed`` is the multiplier of the polynomial. The final index is obtained
    by masking with ``capability - 1``, so ``capability`` should be a power of
    two for the mask to cover the whole range ``[0, capability)``.
    """

    def __init__(self, capability, seed):
        self.capability = capability
        self.seed = seed

    def hash(self, value):
        """Return the bit index for ``value`` (e.g. a URL string).

        Each character contributes its code point via ``ord``. An empty
        value hashes to 0.
        """
        ret = 0
        for ch in value:
            # Plain assignment, not ``+=``: accumulating would turn the
            # multiplier into ``seed + 1``, which is even for odd seeds, and
            # an even multiplier under a power-of-two mask shifts the leading
            # characters out of the result entirely.
            ret = self.seed * ret + ord(ch)
        # Keep only the bits that index into the bit vector.
        return (self.capability - 1) & ret

class BloomFilter(object):
    """Bloom filter backed by a BitVector and several seeded hash functions.

    A value is reported as present only when every hash function maps it to
    a bit that is already set, so false positives are possible but false
    negatives are not.
    """

    def __init__(self, BIT_SIZE=1<<25):
        """Allocate the bit vector and build one hash function per seed.

        BIT_SIZE is the number of bits in the vector (default
        1 << 25 = 33554432). It is passed to SimpleHash as the capability,
        which masks with ``BIT_SIZE - 1``, so a power of two is expected.
        """
        # Honor the caller-supplied size instead of a hard-coded constant.
        self.BIT_SIZE = BIT_SIZE
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        # Allocate the bit vector.
        self.bitset = BitVector.BitVector(size=self.BIT_SIZE)
        # One hash function per seed (seven in total).
        self.hashFunc = [SimpleHash(self.BIT_SIZE, seed) for seed in self.seeds]

    def insert(self, value):
        """Record ``value`` by setting the bit chosen by every hash function."""
        for f in self.hashFunc:
            self.bitset[f.hash(value)] = 1

    def isContaions(self, value):
        """Return True if ``value`` may have been inserted, False otherwise.

        ``None`` is never contained. The check stops at the first unset bit.
        """
        if value is None:
            return False
        # Every hashed position must be set; one clear bit proves absence.
        return all(self.bitset[f.hash(value)] for f in self.hashFunc)

def main(path="urls.txt"):
    """Report duplicate URLs found in a text file, one URL per line.

    Each line of ``path`` is checked against a BloomFilter: a URL the filter
    already reports as present is printed as a duplicate, any other URL is
    inserted. Reading stops at the end of the file or at a line that is
    exactly ``exit``.
    """
    bloomfilter = BloomFilter()
    # The context manager closes the file even if processing fails.
    with open(path) as fd:
        # Iterating the file ends at EOF; readline() in a ``while True`` loop
        # returns '' forever and never terminates.
        for line in fd:
            # Drop the line terminator so the 'exit' sentinel can match and
            # the same URL hashes identically with or without a trailing
            # newline.
            url = line.rstrip("\r\n")
            if url == 'exit':
                break
            if not url:
                # Skip blank lines instead of treating them as a URL.
                continue
            if bloomfilter.isContaions(url):
                print('url :%s has exist' % url)
            else:
                bloomfilter.insert(url)


if __name__ == "__main__":
    main()

程式根據網頁連結的個數 n(elementNum)與可接受的誤判率 p(error_rate)計算所需的最佳空間大小,即二進位制向量的位數 m = -n·ln(p) / (ln 2)²,以及所需雜湊函式的個數 k = (m / n)·ln 2:

def __init__(self, error_rate, elementNum):
    """Size the filter from a target error rate and an element count.

    Computes the bit count as -elementNum * ln(error_rate) / (ln 2)^2,
    passes it through ``self.align_4byte``, allocates the bit vector, then
    derives the number of hash functions as ln(2) * bit_num / elementNum
    (truncated, plus one) and asks ``self.generate_hashseeds`` for that
    many seeds.
    """
    ln2 = cmath.log(2.0)

    # Number of bits required; cmath yields a complex value, so only the
    # real part is handed on.
    self.bit_num = -1 * elementNum * cmath.log(error_rate) / (ln2 * ln2)
    # Align to 4 bytes (presumably rounds up; helper is not visible here).
    self.bit_num = self.align_4byte(self.bit_num.real)

    # Allocate the bit vector.
    self.bit_array = BitVector(size=self.bit_num)

    # Number of hash functions: real part first, then truncate and add one.
    self.hash_num = (cmath.log(2) * self.bit_num / elementNum).real
    self.hash_num = int(self.hash_num) + 1

    # Generate the seeds for the hash functions.
    self.hash_seeds = self.generate_hashseeds(self.hash_num)

生成從 5 開始的 n 個素數(不包含 2 和 3):

def is_prime(n):
    """Return True if the integer ``n`` is a prime number, False otherwise.

    Numbers below 2 are not prime. Uses trial division by odd numbers up to
    and including the integer square root of ``n``.
    """
    if n < 2:
        return False
    if n < 4:
        # 2 and 3 are prime.
        return True
    if n % 2 == 0:
        return False
    divisor = 3
    # ``divisor * divisor <= n`` includes the square root itself, so perfect
    # squares such as 9, 25 and 49 are rejected without special cases and
    # without floating-point rounding.
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True

def find_prime(n):
    """Return a list of the first ``n`` primes starting from 5, ascending.

    2 and 3 are deliberately excluded from the result. Uses trial division
    (not a sieve): a candidate is prime when it is odd, not divisible by 3
    and not divisible by any previously found prime up to its square root.
    Returns an empty list when ``n`` is zero or negative.
    """
    prime = []
    candidate = 5
    # ``<`` rather than ``!=`` so a negative n cannot loop forever.
    while len(prime) < n:
        # Candidates are always odd (start at 5, step 2), so only 3 and the
        # primes already collected need to be tried as divisors.
        is_candidate_prime = candidate % 3 != 0
        if is_candidate_prime:
            for p in prime:
                if p * p > candidate:
                    # No divisor up to the square root: candidate is prime.
                    break
                if candidate % p == 0:
                    is_candidate_prime = False
                    break
        if is_candidate_prime:
            prime.append(candidate)
        candidate += 2

    return prime