BloomFilter布隆過濾器的使用
阿新 • • 發佈:2018-11-26
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.0</version>
</dependency>
/**
 * Small demo of Guava's {@code BloomFilter}: fills a filter with the integers
 * {@code 0 .. capacity-1}, times a single membership lookup, and then measures
 * how often values that were never inserted are reported as possibly present.
 */
public class TestSomething {

    /** Number of elements inserted; also passed to the filter as the expected insertion count. */
    private static final int capacity = 1000000;

    /** A value that was inserted, used for the timed lookup. */
    private static final int key = 999998;

    private static final BloomFilter<Integer> bloomFilter =
            BloomFilter.create(Funnels.integerFunnel(), capacity);

    static {
        // Populate the filter once when the class is loaded.
        for (int i = 0; i < capacity; i++) {
            bloomFilter.put(i);
        }
    }

    public static void main(String[] args) {
        // System.nanoTime() has nanosecond units. Only the lookup itself is timed;
        // the console output happens after the second timestamp is taken.
        long start = System.nanoTime();
        boolean hit = bloomFilter.mightContain(key);
        long end = System.nanoTime();
        if (hit) {
            System.out.println("成功過濾到" + key);
        }
        System.out.println("布隆過濾器消耗時間:" + (end - start));

        // Probe a range of values that were never inserted: every positive answer
        // here is a false positive.
        int probeStart = capacity + 20000;
        int probeEnd = capacity + 30000;
        int falsePositives = 0;
        for (int i = probeStart; i < probeEnd; i++) {
            if (bloomFilter.mightContain(i)) {
                falsePositives++;
            }
        }
        // Report the rate (false positives / probes), not the raw count.
        double falsePositiveRate = (double) falsePositives / (probeEnd - probeStart);
        System.out.println("錯判率為:" + falsePositiveRate);
    }
}
布隆過濾器實際上是一個很長的二進位制向量和一系列隨機對映函式。布隆過濾器可以用於檢索一個元素是否在一個集合中。它的優點是空間效率和查詢時間都遠遠超過一般的演算法,缺點是有一定的誤識別率和刪除困難
當一個元素被加入集合時,通過K個雜湊函式將這個元素對映成一個位數組中的K個點,把它們置為1。檢索時,我們只要看看這些點是不是都是1就(大約)知道集合中有沒有它了:如果這些點有任何一個0,則被檢元素一定不在;如果都是1,則被檢元素很可能在。因為存在雜湊衝突,會導致3%左右的誤判,即不存在的元素可能被誤判為存在,但實際存在的元素一定會被判斷為存在。
因此BloomFilter最理想的應用場景是在一些複雜的查詢時,在DB上做一層BloomFilter判斷,如果BloomFilter判斷不存在,則沒必要到DB去查了。頂多就是出現誤判時,多到DB查詢一下,而這個概率是很低的。利用redis的高效能以及通過pipeline將多條bit操作命令批量提交,實現了多機BloomFilter的bit資料共享,資料限制512M。
在java中實現存在2個問題:1.OOM 2.持久化
整合redis如下
@Component @Scope("prototype") public class BloomFilter<E> { @Autowired private RedisUtil redisUtil; @Value("${bloomfilter.expireDays}") private long expireDays; private String redisKey = "DEFAULT"; /** * 總長度 */ private int sizeOfBloomFilter; /** * 預估過濾數 */ private int expectedNumberOfFilterElements; /** * 雜湊次數 */ private int numberOfHashFunctions; private final Charset charset = Charset.forName("UTF-8"); private static final String hashName = "MD5"; private static final MessageDigest digestFunction; // The digest method is reused between instances static { MessageDigest tmp; try { tmp = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { tmp = null; } digestFunction = tmp; } public BloomFilter() { this(0.0001, 600000); } /** * Constructs an empty Bloom filter. * * @param m is the total length of the Bloom filter. * @param n is the expected number of elements the filter will contain. * @param k is the number of hash functions used. */ public BloomFilter(int m, int n, int k) { this.sizeOfBloomFilter = m; this.expectedNumberOfFilterElements = n; this.numberOfHashFunctions = k; } /** * Constructs an empty Bloom filter with a given false positive probability. * The size of bloom filter and the number of hash functions is estimated * to match the false positive probability. * 給定期望的錯誤率,過濾量 * * @param falsePositiveProbability is the desired false positive probability. * @param expectedNumberOfElements is the expected number of elements in the Bloom filter. */ public BloomFilter(double falsePositiveProbability, int expectedNumberOfElements) { // m = ceil(kn/ln2) k = ceil(-ln(f)/ln2) this((int) Math.ceil((int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) * expectedNumberOfElements / Math.log(2)), expectedNumberOfElements, (int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); } /** * Adds all elements from a Collection to the Bloom filter. * * @param c Collection of elements. 
*/ public void addAll(Collection<? extends E> c) { for (E element : c) { add(element); } } /** * Adds an object to the Bloom filter. The output from the object's * toString() method is used as input to the hash functions. * 新增元素 * * @param element is an element to register in the Bloom filter. */ public void add(E element) { add(element.toString().getBytes(charset)); } /** * Adds an array of bytes to the Bloom filter. * * @param bytes array of bytes to add to the Bloom filter. */ public void add(byte[] bytes) { if (redisUtil.get(redisKey) == null) { redisUtil.setBit(redisKey, 0, false); redisUtil.expire(redisKey, expireDays); } int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { redisUtil.setBit(redisKey, Math.abs(hash % sizeOfBloomFilter), true); } } /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param element element to check. * @return true if the element could have been inserted into the Bloom filter. */ public boolean contains(E element) { return contains(element.toString().getBytes(charset)); } /** * Returns true if the array of bytes could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param bytes array of bytes to check. * @return true if the array could have been inserted into the Bloom filter. */ public boolean contains(byte[] bytes) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { if (!redisUtil.getBit(redisKey, Math.abs(hash % sizeOfBloomFilter))) { return false; } } return true; } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * * @param c elements to check. 
* @return true if all the elements in c could have been inserted into the Bloom filter. */ public boolean containsAll(Collection<? extends E> c) { for (E element : c) { if (!contains(element)) { return false; } } return true; } /** * Generates digests based on the contents of an array of bytes and splits the result into 4-byte int's and store them in an array. The * digest function is called until the required number of int's are produced. For each call to digest a salt * is prepended to the data. The salt is increased by 1 for each call. * * @param data specifies input data. * @param hashes number of hashes/int's to produce. * @return array of int-sized hashes */ public static int[] createHashes(byte[] data, int hashes) { int[] result = new int[hashes]; int k = 0; byte salt = 0; while (k < hashes) { byte[] digest; synchronized (digestFunction) { digestFunction.update(salt); salt++; digest = digestFunction.digest(data); } for (int i = 0; i < digest.length / 4 && k < hashes; i++) { int h = 0; for (int j = (i * 4); j < (i * 4) + 4; j++) { h <<= 8; h |= ((int) digest[j]) & 0xFF; } result[k] = h; k++; } } return result; } public int getSizeOfBloomFilter() { return this.sizeOfBloomFilter; } public int getExpectedNumberOfElements() { return this.expectedNumberOfFilterElements; } public int getNumberOfHashFunctions() { return this.numberOfHashFunctions; } /** * Compares the contents of two instances to see if they are equal. * * @param obj is the object to compare to. * @return True if the contents of the objects are equal. 
*/ @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final BloomFilter<E> other = (BloomFilter<E>) obj; if (this.sizeOfBloomFilter != other.sizeOfBloomFilter) { return false; } if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { return false; } if (this.numberOfHashFunctions != other.numberOfHashFunctions) { return false; } return true; } public String getRedisKey() { return redisKey; } public void setRedisKey(String redisKey) { this.redisKey = redisKey; } /** * Calculates a hash code for this class. * * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + this.sizeOfBloomFilter; hash = 61 * hash + this.expectedNumberOfFilterElements; hash = 61 * hash + this.numberOfHashFunctions; return hash; } }
使用場景:黑名單,URL重複檢查,字典糾錯,垃圾郵件,快取穿透: 將資料庫中所有的查詢條件,放到布隆過濾器中。當一個查詢請求來臨的時候,先經過布隆過濾器進行檢查,如果請求存在這個條件中,那麼繼續執行,如果不在,直接丟棄。