Python實現布隆過濾器
阿新 • • 發佈:2019-02-17
# -*- coding: utf-8 -*-
# Simple Bloom filter used to detect URLs that have already been seen.
import os
import sys


class SimpleHash(object):
    """Seeded polynomial string hash that maps a string to a bit index.

    capability -- number of addressable bits; results lie in
                  ``[0, capability)``.
    seed       -- multiplier of the polynomial hash; use a different seed
                  for every hash function of the filter.
    """

    def __init__(self, capability, seed):
        if capability <= 0:
            raise ValueError("capability must be a positive integer")
        self.capability = capability
        self.seed = seed

    def hash(self, value):
        """Return the bit index for *value* (a string such as a URL).

        Computes ``ret = seed * ret + ord(ch)`` over every character.  The
        accumulator is reduced modulo ``capability`` at each step, which keeps
        the integers small and, for a power-of-two capability, is identical
        to masking with ``capability - 1``; unlike the mask it also stays
        uniform for sizes that are not a power of two.
        """
        ret = 0
        for ch in value:
            # Plain assignment, not "+=": accumulating with "+=" turns the
            # multiplier into seed + 1, which is even for every odd seed and
            # discards the leading characters under a power-of-two modulus.
            ret = (self.seed * ret + ord(ch)) % self.capability
        return ret


class BloomFilter(object):
    """Bloom filter backed by a ``BitVector`` and several ``SimpleHash``es.

    BIT_SIZE -- number of bits in the filter (default ``1 << 25``,
                i.e. 33554432 bits).
    """

    def __init__(self, BIT_SIZE=1 << 25):
        # Imported here so the third-party BitVector package is only required
        # when a filter is actually allocated.
        import BitVector

        # Honour the caller-supplied size instead of a hard-coded constant.
        self.BIT_SIZE = BIT_SIZE
        # One prime seed per hash function (seven in total).
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        # Allocate the bit array.
        self.bitset = BitVector.BitVector(size=self.BIT_SIZE)
        self.hashFunc = [SimpleHash(self.BIT_SIZE, seed)
                         for seed in self.seeds]

    def insert(self, value):
        """Record *value* by setting the bit chosen by every hash function."""
        for f in self.hashFunc:
            self.bitset[f.hash(value)] = 1

    def isContaions(self, value):
        """Return True if *value* may have been inserted, False if not.

        ``None`` is never contained.  A True result can be a false positive;
        a False result is definitive.
        """
        if value is None:
            return False
        # Stops at the first unset bit; True only when every bit is set.
        return all(self.bitset[f.hash(value)] for f in self.hashFunc)


def main(path="urls.txt"):
    """Read URLs from *path*, one per line, and report repeated ones.

    Processing stops at end of file or at a line consisting of ``exit``.
    Blank lines are skipped.
    """
    bloomfilter = BloomFilter()
    with open(path) as fd:
        # Iterating the file ends at EOF; a readline() loop would spin
        # forever on the empty string returned after the last line.
        for line in fd:
            url = line.rstrip('\r\n')
            if url == 'exit':
                break
            if not url:
                continue
            if bloomfilter.isContaions(url):
                print('url :%s has exist' % url)
            else:
                bloomfilter.insert(url)


if __name__ == '__main__':
    main()
程式根據網頁連結的個數 n 與允許的誤判率 p 計算最佳空間大小(即二進位制向量的位數 m = -n·ln p / (ln 2)²),以及所需雜湊函式的個數 k = (m / n)·ln 2:
def __init__(self, error_rate, elementNum):
    """Size the filter for *elementNum* items at false-positive *error_rate*.

    error_rate -- target false-positive probability, strictly between 0 and 1.
    elementNum -- expected number of inserted elements, must be positive.

    Sets ``bit_num``, ``bit_array``, ``hash_num`` and ``hash_seeds``.
    Raises ValueError when either argument is outside its valid range.
    """
    # Real-valued logarithms: math.log rejects non-positive input instead of
    # silently returning a complex number the way cmath.log does.
    import math

    if not 0.0 < error_rate < 1.0:
        raise ValueError("error_rate must be in the open interval (0, 1)")
    if elementNum <= 0:
        raise ValueError("elementNum must be a positive integer")

    ln2 = math.log(2.0)
    # Required number of bits: m = -n * ln(p) / (ln 2)^2.
    self.bit_num = -1 * elementNum * math.log(error_rate) / (ln2 * ln2)
    # Alignment helper is defined elsewhere; presumably rounds the size up to
    # a 4-byte boundary and returns an integer -- TODO confirm.
    self.bit_num = self.align_4byte(self.bit_num)
    # Allocate the bit array.
    self.bit_array = BitVector(size=self.bit_num)
    # Number of hash functions: k = ln 2 * m / n, truncated and then
    # incremented by one.
    self.hash_num = int(ln2 * self.bit_num / elementNum) + 1
    # Seeds for the hash functions; the generator is defined elsewhere.
    self.hash_seeds = self.generate_hashseeds(self.hash_num)
生成從 5 開始的前 n 個素數:
def is_prime(n):
    """Return True if the integer *n* is prime, False otherwise."""
    if n < 2:
        return False
    if n < 4:
        # 2 and 3 are prime.
        return True
    if n % 2 == 0:
        return False
    # Trial division by odd numbers up to and including sqrt(n); the bound
    # must be inclusive or perfect squares such as 9 and 25 slip through.
    divisor = 3
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True


def find_prime(n):
    """Return a list of the first *n* primes, starting from 5.

    Returns an empty list when *n* is zero or negative.
    """
    primes = []
    candidate = 5
    while len(primes) < n:
        if is_prime(candidate):
            primes.append(candidate)
        # Only odd candidates need to be examined.
        candidate += 2
    return primes