數據存在？-‘布隆過濾器’

布隆過濾器是一種能夠在大量數據中判斷數據是否存在的算法。它實際上是一個很長的二進制向量和一系列隨機映射函數。布隆過濾器可以用於檢索一個元素是否在一個集合中。它的優點是空間效率和查詢時間都遠遠超過一般的算法，缺點是有一定的誤識別率和刪除困難。在介紹‘布隆過濾器’之前，先介紹一下‘位圖’的思想：

這裏有這樣一個問題：給40億個沒有排序、不重複的無符號整數，如何快速地判斷一個數據是否在這40億個數據之中？

--對於40億個數據，如果我們將這40億個數據都放入內存中，我們需要多大的存儲空間呢？假設每個數據都是char類型的，這樣消耗多少的空間？如果是int類型呢？或者是更多的數據呢？不難知道對於大量的數據，如果採用將數據放入內存中，這種方式是很不理智的。這裏介紹一種方法—‘位圖’。

位圖：主要算法思想就是充分的利用bit位，假設數據都是int類型，每個int類型都佔32個bit位。將一個int類型數據的32個bit用來表示32個數據是否存在， 0表示不存在，1表示存在（能夠極大地縮小空間）。先計算出數據在哪一個int類型的空間中，然後計算在這個int類型的第幾個bit位上，然後將此位置更改爲1，表明這個位置上存在數據。

下面是‘位圖’的實現：

// Bitmap that records the presence of unsigned integers, one bit per value.
//
// Value x lives in word (x >> 5) at bit (x % 32); only the low 32 bits of each
// stored word are used. _size counts how many distinct values are currently set.
class BitMap
{
public:
     // Creates a bitmap with (size >> 5) + 1 zero-initialised words, i.e. room
     // for at least the values 0..size.
     BitMap(size_t size = 0)
          : _size(0)
     {
          _a.resize((size >> 5) + 1);
     }

     // Marks x as present. The count only grows when the bit was previously
     // clear. If x lies beyond the current storage, the storage is grown so the
     // write never goes out of bounds.
     void set(size_t x)
     {
          const size_t index = x >> 5;
          if (index >= _a.size())
          {
               _a.resize(index + 1);
          }
          const size_t mask = Mask(x);
          if (!(_a[index] & mask))
          {
               ++_size;
               _a[index] |= mask;
          }
     }

     // Marks x as absent. Does nothing when x is out of range or not set, so
     // the count can never underflow.
     void Reset(size_t x)
     {
          const size_t index = x >> 5;
          if (index >= _a.size())
          {
               return;
          }
          // The bit position comes from x itself (not from the word index).
          const size_t mask = Mask(x);
          if (_a[index] & mask)
          {
               _a[index] &= ~mask;
               --_size;
          }
     }

     // Returns true when x is currently set; values beyond the storage are
     // reported as absent.
     bool Test(size_t x) const
     {
          const size_t index = x >> 5;
          return index < _a.size() && (_a[index] & Mask(x)) != 0;
     }

     // Number of values currently set.
     size_t size() const
     {
          return _size;
     }

     // Resizes the storage to (size >> 5) + 1 words. New words are zeroed.
     // When shrinking, bits in the dropped words are subtracted from the count
     // so that size() stays accurate.
     void Resize(size_t size)
     {
          const size_t words = (size >> 5) + 1;
          for (size_t i = words; i < _a.size(); ++i)
          {
               // Clear the lowest set bit each round: one iteration per set bit.
               for (size_t w = _a[i]; w != 0; w &= w - 1)
               {
                    --_size;
               }
          }
          _a.resize(words);
     }

protected:
     // Single-bit mask for x inside its word. Built from an unsigned size_t so
     // that shifting by 31 never touches a signed int's sign bit.
     static size_t Mask(size_t x)
     {
          return static_cast<size_t>(1) << (x % 32);
     }

     std::vector<size_t> _a;   // bit storage, 32 used bits per element
     size_t _size;             // number of bits currently set
};

‘布隆過濾器’也是利用位圖的思想，它有一個包含m個bit位的空間，每一個bit位都初始化爲0，通過k種不同的hash函數，每個函數都確定出元素所在的不同位置，將這k個位置的bit位置爲1，則將這個元素添加到m個bit的空間中。當需要對數據進行查找時，將k種hash函數得到的k個位置的bit位進行檢查，若k個位置都爲1，則數據可能存在（不同元素可能映射到相同的bit位，因此存在誤判）；只要有一個位置爲0，則數據一定不存在。布隆過濾器是不允許進行刪除數據的，因爲刪除需要將k個位置置爲0，可能會影響其他數據的存在性，從而產生錯誤。

下面是‘布隆過濾器’的實現：

//實現布隆過濾器
// First of the five string-hash functors used as BloomFilter defaults: DJB hash.
// K must provide c_str() yielding a NUL-terminated character sequence.
template <class K>
struct _HashFunc1
{
     // Hashes the characters of str up to (not including) the first NUL.
     // Returns 0 for an empty string, otherwise starts from 5381 and applies
     // hash = hash * 33 + ch per character (written as hash += (hash << 5) + ch).
     //
     // NOTE: each char is converted straight to size_t, so where plain char is
     // signed, bytes >= 0x80 become very large values; results for such bytes
     // depend on the platform's char signedness.
     size_t DJBHash(const char *str) const
     {
          if (!*str)
               return 0;
          size_t hash = 5381;   // 'register' dropped: removed from the language in C++17
          while (size_t ch = static_cast<size_t>(*str++))
          {
               hash += (hash << 5) + ch;
          }
          return hash;
     }

     // Functor entry point: hashes the key's c_str().
     size_t operator()(const K& str) const
     {
          return DJBHash(str.c_str());
     }
};

// SDBM string-hash functor. K must provide c_str() yielding a NUL-terminated
// character sequence.
template <class K>
struct _HashFunc2
{
     // Hashes the characters of str up to the first NUL with
     // hash = 65599 * hash + ch, starting from 0 (so an empty string gives 0).
     //
     // NOTE: each char is converted straight to size_t, so where plain char is
     // signed, bytes >= 0x80 become very large values; results for such bytes
     // depend on the platform's char signedness.
     size_t SDBMHash(const char *str) const
     {
          size_t hash = 0;   // 'register' dropped: removed from the language in C++17
          while (size_t ch = static_cast<size_t>(*str++))
          {
               hash = 65599 * hash + ch;
          }
          return hash;
     }

     // Functor entry point: hashes the key's c_str().
     size_t operator()(const K& str) const
     {
          return SDBMHash(str.c_str());
     }
};

// RS string-hash functor. K must provide c_str() yielding a NUL-terminated
// character sequence.
template <class K>
struct _HashFunc3
{
     // Hashes the characters of str up to the first NUL with
     // hash = hash * magic + ch, where magic starts at 63689 and is multiplied
     // by 378551 after every character. An empty string gives 0. All
     // arithmetic is unsigned and wraps modulo 2^(bits in size_t).
     //
     // NOTE: each char is converted straight to size_t, so where plain char is
     // signed, bytes >= 0x80 become very large values; results for such bytes
     // depend on the platform's char signedness.
     size_t RSHash(const char *str) const
     {
          size_t hash = 0;   // 'register' dropped: removed from the language in C++17
          size_t magic = 63689;
          while (size_t ch = static_cast<size_t>(*str++))
          {
               hash = hash * magic + ch;
               magic *= 378551;
          }
          return hash;
     }

     // Functor entry point: hashes the key's c_str().
     size_t operator()(const K& str) const
     {
          return RSHash(str.c_str());
     }
};

// AP string-hash functor. K must provide c_str() yielding a NUL-terminated
// character sequence.
template <class K>
struct _HashFunc4
{
     // Hashes the characters of str up to the first NUL, starting from 0.
     // Characters at even positions (0, 2, ...) and odd positions are mixed in
     // with two different shift/xor formulas; the odd one also complements the
     // mixed value. An empty string gives 0.
     //
     // NOTE: each char is converted straight to size_t, so where plain char is
     // signed, bytes >= 0x80 become very large values; results for such bytes
     // depend on the platform's char signedness.
     size_t APHash(const char *str) const
     {
          size_t hash = 0;   // 'register' dropped: removed from the language in C++17
          size_t ch;
          // The explicit != 0 makes the assignment-in-condition intentional.
          for (long i = 0; (ch = static_cast<size_t>(*str++)) != 0; i++)
          {
               if ((i & 1) == 0)
               {
                    hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
               }
               else
               {
                    hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
               }
          }
          return hash;
     }

     // Functor entry point: hashes the key's c_str().
     size_t operator()(const K& str) const
     {
          return APHash(str.c_str());
     }
};

// JS string-hash functor. K must provide c_str() yielding a NUL-terminated
// character sequence.
template <class K>
struct _HashFunc5
{
     // Hashes the characters of str up to the first NUL. Returns 0 for an
     // empty string; otherwise starts from 1315423911 and applies
     // hash ^= (hash << 5) + ch + (hash >> 2) per character.
     //
     // NOTE: each char is converted straight to size_t, so where plain char is
     // signed, bytes >= 0x80 become very large values; results for such bytes
     // depend on the platform's char signedness.
     size_t JSHash(const char *str) const
     {
          if (!*str)
               return 0;
          size_t hash = 1315423911;   // 'register' dropped: removed from the language in C++17
          while (size_t ch = static_cast<size_t>(*str++))
          {
               hash ^= ((hash << 5) + ch + (hash >> 2));
          }
          return hash;
     }

     // Functor entry point: hashes the key's c_str().
     size_t operator()(const K& str) const
     {
          return JSHash(str.c_str());
     }
};

// Picks a capacity from a fixed ascending table of 28 primes: returns the first
// table entry that is >= size, or the largest entry (4294967291) when size
// exceeds every entry.
size_t GetPrimeSize(size_t size)
{
     static const unsigned long kPrimes[] =
     {
          53ul, 97ul, 193ul, 389ul, 769ul,
          1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
          49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
          1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
          50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
          1610612741ul, 3221225473ul, 4294967291ul
     };
     static const size_t kCount = sizeof(kPrimes) / sizeof(kPrimes[0]);

     // Walk forward until an entry is large enough; stop on the final entry
     // regardless, since it is the fallback result.
     const unsigned long* const last = kPrimes + (kCount - 1);
     const unsigned long* cur = kPrimes;
     while (cur != last && *cur < size)
     {
          ++cur;
     }
     return *cur;
}

template <class K = string, 
          class HashFunc1 = _HashFunc1<K>,
          class HashFunc2 = _HashFunc2<K>,
          class HashFunc3 = _HashFunc3<K>,
          class HashFunc4 = _HashFunc4<K>,
          class HashFunc5 = _HashFunc5<K>>
class BloomFilter
{
public:
     BloomFilter(size_t size = 0)    //構造
     {
          _capacity = GetPrimeSize(size);
          _bm.Resize(_capacity);
     }
     
     void set(const K& key)
     {
          size_t index1 = HashFunc1()(key);
          size_t index2 = HashFunc2()(key);
          size_t index3 = HashFunc3()(key);
          size_t index4 = HashFunc4()(key);
          size_t index5 = HashFunc5()(key);
          
          _bm.set((index1) % _capacity);
          _bm.set((index2) % _capacity);
          _bm.set((index3) % _capacity);
          _bm.set((index4) % _capacity);
          _bm.set((index5) % _capacity);
     }
     
     bool Test(const K& key)    //測試數據是否存在
     {
          size_t index1 = HashFunc1()(key);
          if (!_bm.Test((index1) % _capacity))
          {
               return false;
          }
          
          size_t index2 = HashFunc2()(key);
          if (!_bm.Test((index2) % _capacity))
          {
               return false;
          }
          
          size_t index3 = HashFunc3()(key);
          if (!_bm.Test((index3) % _capacity))
          {
               return false;
          }
          
          size_t index4 = HashFunc4()(key);
          if (!_bm.Test((index4) % _capacity))
          {
               return false;
          }
          
          size_t index5 = HashFunc5()(key);
          if (!_bm.Test((index5) % _capacity))
          {
               return false;
          }
          return true;
     }
     
protected:
     BitMap _bm;
     size_t _capacity;
};

本文出自 “無心的執着” 博客，轉載請與作者聯繫！

數據存在？-‘布隆過濾器’

.Net 8.0 下的新RPC，IceRPC之試試的新玩法"打洞"

完美替代postman的軟件

Vue mockjs mock.js

關於遊戲付費的一點想法

我通過CKA和CKS啦！

《最新出爐》系列入門篇-Python+Playwright自動化測試-42-強大的可視化追蹤利器Trace Viewer

大數據怎麼學？對大數據開發領域及崗位的詳細解讀，完整理解大數據開發領域技術體系

廣義表（非線性結構）

linux系統中‘find’的詳細用法

平衡搜索樹—AVLTree

數據結構—各類‘排序算法’實現（下）

數據存在？-‘布隆過濾器’

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結