glib庫 hash表實現分析

阿新 • • 發佈：2019-01-22

Hash Table的原理

雜湊表的目的簡單來說是為了儲存多個 key=>value 關係（注意，此處是單向推導，不支援由 value 反向查詢 key）。一個比較簡單的模型是用一個陣列來儲存這些關係，但是在插入資料時，並不是插入到 index 最小的空閒陣列位置，而是直接通過雜湊函式算出這個 key-value 應該儲存的位置，這樣查詢時就不必遍歷整個陣列。

一個比較簡單的實現方法是這樣：

/* One stored entry of the simple chained hash table sketched here.
   Entries that share a hash value are linked together through next. */
typedef struct
{
    T_Val value;   /* the value stored in this entry */
    T_Pos next;    /* position of the next entry in the same hash chain */
}T_Item;
/* Simple chained hash table: a pool of entries plus, for every hash value,
   the position of the first entry of that value's chain.
   Declared with typedef so that T_HashTable names the type (without it the
   trailing identifier would declare a variable). The two arrays are held as
   pointers because a C struct cannot contain two flexible array members. */
typedef struct _HashTable
{
    T_Item *items;               /* storage for all entries */
    T_Pos *hash_list_Headers;    /* indexed by hash value: first entry of that chain */
}T_HashTable;

將具有相同hash值的條目組成一個連結串列，用T_Item::next域表示連結串列中下一個條目的位置。當查詢的時候，首先計算出key的hash值，以該值作為索引，通過T_HashTable::hash_list_Headers找到該雜湊連結串列的第一個位置T_Pos，然後遍歷該連結串列，找到需要的key對應的value。

這種方法有什麼缺點呢？
1. 連結串列在實體記憶體中不連續，造成cache命中減小
2. 如果用單向連結串列，則刪除節點十分麻煩
3. 效率很低

一群牛人們在glib庫中重新寫了hash演算法的模板，本文在此分析一下他的執行過程

大體演算法：

  /* Three parallel arrays of equal length: the elements at the same index
     in keys, hashes and values together describe one key-value entry. */
  gpointer        *keys;
  guint           *hashes;
  gpointer        *values;

這三個陣列長度相同，下標為x的三個元素 keys[x]、hashes[x]、values[x] 共同描述同一個key-value關係。
不會出現x、y、z中任意兩個不相等時，由 keys[x]、hashes[y]、values[z] 共同描述一個key-value關係的情況。當查詢一個
key對應的value時，虛擬碼如下：

/* Pseudo-code: return the value stored for key, or NULL when key is absent.
   Probing starts at the slot selected by the key's hash and moves forward
   one slot at a time until an unused slot is reached.
   Simplification: reducing the hash to a valid index and wrapping around at
   the end of the arrays are omitted here. */
find(key)
{
    hash_val = get_hash_by_key( key);
    for ( i = hash_val; HASH_IS_USED( hashes[i] ); i++ )
    {
        /* A tombstone marks a deleted entry: skip it but keep probing. */
        if ( HASH_VAL_TOMB == hashes[i] )
            continue;
        /* The stored hash and the key itself must both match. */
        if ( hashes[i] == hash_val && keys[i] == key)
            return values[i];
    }
    /* Reached an unused slot without a match: key is not in the table. */
    return NULL;
}

首先計算該key的hash值，以該雜湊值x為起始陣列位置，找到hashes[x]與keys[x]同時匹配時，返回value

先來看一下最重要的資料結構，這個_GHashTable資料結構寫在c檔案中，對外不暴露細節，在glib.h檔案中被
typedef為 GHashTable

/*
 * Open-addressing hash table state. keys, hashes and values are parallel
 * arrays: the elements at one index together describe a single entry.
 */
struct GHashTable
{
  /* Element count used when allocating the keys and hashes arrays. */
  gint             size;
  /* Divisor applied to a key's hash to pick the first slot to probe. */
  gint             mod;
  /* Bit mask applied to the probe index to keep it inside the arrays. */
  guint            mask;
  /* Entry count excluding tombstones (see noccupied); starts at 0. */
  gint             nnodes;
  gint             noccupied;  /* nnodes + tombstones */

  /* Key stored in each slot. */
  gpointer        *keys;
  /* Per-slot hash: 0 = unused, 1 = tombstone (deleted), >= 2 = real entry. */
  guint           *hashes;
  /* Value stored in each slot; initially points at the same memory as keys. */
  gpointer        *values;

  /* Hash function for keys; the constructor substitutes g_direct_hash
     when the caller passes NULL. */
  GHashFunc        hash_func;
  /* Key equality predicate; when NULL, keys are compared as pointers. */
  GEqualFunc       key_equal_func;
  /* Reference count; set to 1 on creation and asserted > 0 on lookup. */
  gint             ref_count;
#ifndef G_DISABLE_ASSERT
  /*
   * Tracks the structure of the hash table, not its contents: is only
   * incremented when a node is added or removed (is not incremented
   * when the key or data of a node is modified).
   */
  int              version;
#endif
  /* Destroy callbacks supplied at construction; presumably invoked when a
     key or value is released (that code is not shown here). */
  GDestroyNotify   key_destroy_func;
  GDestroyNotify   value_destroy_func;
};

/*
 * Creates a hash table with the given hash and key-equality functions and
 * no destroy callbacks for keys or values.
 *
 * Returns whatever g_hash_table_new_full() returns for those arguments.
 */
GHashTable *
g_hash_table_new (GHashFunc  hash_func,
                  GEqualFunc key_equal_func)
{
  GHashTable *table;

  /* Both destroy notifiers are passed as NULL. */
  table = g_hash_table_new_full (hash_func, key_equal_func, NULL, NULL);

  return table;
}

/*
 * Creates an empty hash table with a reference count of 1.
 *
 * hash_func:          hash function for keys; g_direct_hash is used when
 *                     this is NULL.
 * key_equal_func:     key equality predicate; stored as given (may be NULL).
 * key_destroy_func:   stored as given.
 * value_destroy_func: stored as given.
 *
 * Returns the newly allocated table.
 */
GHashTable *
g_hash_table_new_full (GHashFunc      hash_func,
                       GEqualFunc     key_equal_func,
                       GDestroyNotify key_destroy_func,
                       GDestroyNotify value_destroy_func)
{
  GHashTable *table = g_slice_new (GHashTable);

  /* Must precede the array allocations below, which read table->size;
     presumably this call sets size (and mod/mask) for the minimum shift. */
  g_hash_table_set_shift (table, HASH_TABLE_MIN_SHIFT);

  /* The caller owns the first reference. */
  table->ref_count          = 1;
#ifndef G_DISABLE_ASSERT
  table->version            = 0;
#endif

  /* No live entries yet; noccupied counts live entries plus tombstones
     (slots whose entry was deleted). */
  table->nnodes             = 0;
  table->noccupied          = 0;

  if (hash_func)
    table->hash_func        = hash_func;
  else
    table->hash_func        = g_direct_hash;
  table->key_equal_func     = key_equal_func;

  table->key_destroy_func   = key_destroy_func;
  table->value_destroy_func = value_destroy_func;

  /* values starts out pointing at the keys array, so no separate values
     array is allocated here; presumably a memory saving for tables whose
     values equal their keys. */
  table->keys               = g_new0 (gpointer, table->size);
  table->values             = table->keys;

  /* g_new0 is documented by GLib to return zero-filled memory, so every
     slot's hash starts as 0, i.e. unused. */
  table->hashes             = g_new0 (guint, table->size);

  return table;
}

插入過程

/*
 * Inserts the key/value pair into hash_table.
 *
 * Delegates to g_hash_table_insert_internal() with keep_new_key set to
 * FALSE and returns its result.
 */
gboolean
g_hash_table_insert (GHashTable *hash_table,
                     gpointer    key,
                     gpointer    value)
{
  gboolean result;

  result = g_hash_table_insert_internal (hash_table, key, value, FALSE);

  return result;
}

/*
 * Shared insertion path: finds the slot for key, then hands the slot, the
 * key's hash and the pair to g_hash_table_insert_node().
 *
 * Returns FALSE (via g_return_val_if_fail) when hash_table is NULL,
 * otherwise the result of g_hash_table_insert_node().
 */
static gboolean
g_hash_table_insert_internal (GHashTable *hash_table,
                              gpointer    key,
                              gpointer    value,
                              gboolean    keep_new_key)
{
  guint hash_of_key;
  guint slot;

  /* Reject a NULL table before touching it. */
  g_return_val_if_fail (hash_table != NULL, FALSE);

  /* The lookup yields either the index of the entry already holding key or,
     when key is absent, the index of the first tombstone seen or of the
     unused slot that ended the probe. */
  slot = g_hash_table_lookup_node (hash_table, key, &hash_of_key);

  return g_hash_table_insert_node (hash_table, slot, hash_of_key,
                                   key, value, keep_new_key, FALSE);
}

/*
 * Finds the slot index associated with key.
 *
 * hash_table:  table to search; its ref_count is asserted to be > 0.
 * key:         key to look for.
 * hash_return: receives the (adjusted) hash computed for key.
 *
 * Returns the index of the entry whose stored hash and key both match.
 * When no entry matches, returns the index of the first tombstone seen
 * during probing if there was one, otherwise the index of the unused slot
 * that ended the probe.
 */
static inline guint
g_hash_table_lookup_node (GHashTable    *hash_table,
                          gconstpointer  key,
                          guint         *hash_return)
{
  guint wanted_hash;
  guint slot;
  guint slot_hash;
  guint probe = 0;
  guint tombstone_slot = 0;
  gboolean tombstone_seen = FALSE;

  g_assert (hash_table->ref_count > 0);

  /* Stored hash values 0 (unused) and 1 (tombstone) are reserved, so a
     computed hash that is not "real" is replaced by 2. */
  wanted_hash = hash_table->hash_func (key);
  if (G_UNLIKELY (!HASH_IS_REAL (wanted_hash)))
    wanted_hash = 2;

  *hash_return = wanted_hash;

  slot = wanted_hash % hash_table->mod;

  /* Probe until an unused slot is found. Termination relies on the table
     always containing at least one unused slot; presumably the resize
     logic elsewhere guarantees that (not shown here). */
  for (slot_hash = hash_table->hashes[slot];
       !HASH_IS_UNUSED (slot_hash);
       slot_hash = hash_table->hashes[slot])
    {
      if (slot_hash == wanted_hash)
        {
          /* Hashes agree: confirm by comparing the keys themselves, with
             the equality callback if there is one, else by pointer. */
          gpointer candidate = hash_table->keys[slot];
          gboolean same_key;

          if (hash_table->key_equal_func)
            same_key = hash_table->key_equal_func (candidate, key);
          else
            same_key = (candidate == key);

          if (same_key)
            return slot;
        }
      else if (HASH_IS_TOMBSTONE (slot_hash) && !have_no_tombstone_yet_guard (tombstone_seen))
        {
          /* Remember only the first tombstone; it is the slot returned
             when the key turns out to be absent. */
          tombstone_slot = slot;
          tombstone_seen = TRUE;
        }

      /* The step grows by one on every miss; the mask wraps the index. */
      probe++;
      slot = (slot + probe) & hash_table->mask;
    }

  /* No match: prefer the first tombstone over the terminating unused slot. */
  return tombstone_seen ? tombstone_slot : slot;
}

g_hash_table_lookup (GHashTable    *hash_table,
                     gconstpointer  key)
{
  guint node_index;
  guint node_hash;

  g_return_val_if_fail (hash_table != NULL, NULL);

 /*這裡查詢key又呼叫了這個函式*/
  node_index = g_hash_table_lookup_node (hash_table, key, &node_hash);

  return HASH_IS_REAL (hash_table->hashes[node_index])
    ? hash_table->values[node_index]
    : NULL;
}

glib庫 hash表實現分析

Hash Table的原理

glib庫 hash表實現分析

glib庫hash表GHashTable介紹

Hash表分析以及Java實現

跳躍表的分析與實現

數據庫水平切分(拆庫拆表)的實現原理解析(轉)

建庫建表學習心得（知識點誤點分析）

oracle數據庫表實現主鍵自增功能

數據庫多表連接查詢的實現方式

Python之mysql數據庫更新表數據接口實現

【轉】 WordPress數據庫及各表結構分析

MySQL 多表查詢實現分析

【原創】阿裏雲RDS數據庫超大表分區實現

python操作數據庫類。實現建表、插入數據、查詢數據功能

mysql建立觸發器實現相同伺服器下不同庫的表資料同步的錯誤收集

mysql建立TRIGGER觸發器實現相同伺服器下不同庫的表資料同步

mycat1.6實現單庫分表

php 實現hash表

一行Python程式碼實現交叉表資料分析！

26、python資料表透視分析、交叉分析、實現透視表功能

Java實現簡單的資料遷移，從單庫單表到單庫單表（2）

glib庫 hash表實現分析

Hash Table的原理

相關推薦