Redis的字典(dict)rehash過程源碼解析

Redis的內存存儲結構是個大的字典存儲，也就是我們通常說的哈希表。Redis小到可以存儲幾萬記錄的CACHE,大到可以存儲幾千萬甚至上億的記錄（看內存而定），這充分說明Redis作爲緩存的強大。Redis的核心數據結構就是字典(dict),dict在數據量不斷增大的過程中，會遇到HASH(key)碰撞的問題，如果DICT不夠大，碰撞的概率增大，這樣單個hash 桶存儲的元素會越來越多，查詢效率就會變慢。反過來，如果數據量從幾千萬不斷減小到幾萬，DICT仍然佔用原來的桶數組，就會造成不必要的內存浪費。Redis的dict在設計的過程中充分考慮了dict的自動擴大和收縮，實現了一個稱之爲rehash的過程。使dict觸發rehash的條件有兩個:

1）總的元素個數除以DICT桶的個數，得到每個桶平均存儲的元素個數(pre_num)。當已用節點數 >= 桶的個數（即 pre_num >= 1）且 dict_can_resize 爲真時，就會觸發dict 擴大操作；即使 dict_can_resize 爲假，只要 pre_num > dict_force_resize_ratio，也會強制進行擴大。dict_force_resize_ratio = 5。

2）當總元素 * 10 < 桶的個數，也就是填充率低於10%（並且桶的個數大於 DICT_HT_INITIAL_SIZE）時, DICT便會進行收縮，讓total / bk_num 接近 1:1。

dict rehash擴大流程:

源代碼函數調用和解析：

dictAddRaw->_dictKeyIndex->_dictExpandIfNeeded->dictExpand，這個函數調用關係是需要擴大dict的調用關係，
_dictKeyIndex函數代碼：

/*
 * Find the bucket index where `key` could be inserted.
 *
 * Returns -1 if the expansion check fails or if the key is already present
 * in a table that was searched. Otherwise returns the bucket index computed
 * for the last table examined: ht[0] normally, ht[1] while a rehash is in
 * progress.
 */
static int _dictKeyIndex(dict *d, const void *key)
{
    unsigned int hash, slot, t;
    dictEntry *entry;

    /* Give the dictionary a chance to grow before looking for a slot. */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;

    hash = dictHashKey(d, key);

    t = 0;
    do {
        /* Mask the hash down to a bucket index for this table. */
        slot = hash & d->ht[t].sizemask;

        /* Walk the bucket's chain; a match means the key already exists. */
        for (entry = d->ht[t].table[slot]; entry; entry = entry->next) {
            if (dictCompareKeys(d, key, entry->key))
                return -1;
        }

        t++;
        /* ht[1] is only worth searching while a rehash is in progress. */
    } while (t <= 1 && dictIsRehashing(d));

    return slot;
}

_dictExpandIfNeeded函數代碼解析:

/*
 * Grow the hash table when it is needed.
 *
 * Returns DICT_OK when nothing has to be done, otherwise whatever
 * dictExpand() returns.
 */
static int _dictExpandIfNeeded(dict *d)
{
    /* An incremental rehash is already running: nothing to do. */
    if (dictIsRehashing(d))
        return DICT_OK;

    /* Empty table: expand it to the initial size. */
    if (d->ht[0].size == 0)
        return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* Fewer elements than buckets: no expansion. */
    if (d->ht[0].used < d->ht[0].size)
        return DICT_OK;

    /* Resizing is disabled and the elements/buckets ratio has not exceeded
     * the forced-resize threshold: leave the table alone. */
    if (!dict_can_resize &&
        d->ht[0].used/d->ht[0].size <= dict_force_resize_ratio)
    {
        return DICT_OK;
    }

    /* Ask for twice the number of stored elements. */
    return dictExpand(d, d->ht[0].used*2);
}

dict rehash縮小流程:

源代碼函數調用和解析：

serverCron->tryResizeHashTables->dictResize->dictExpand

serverCron函數是個心跳函數,調用tryResizeHashTables段爲:

/* Excerpt of the periodic server callback: only the part that drives
 * hash table resizing and incremental rehashing is shown. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ....
    /* Runs only when both child pids are -1 (presumably: no background
     * RDB/AOF child process is active -- confirm against the full source). */
    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
        // Keep the elements/buckets ratio of the hash tables close to 1:1
        tryResizeHashTables();
        if (server.activerehashing) incrementallyRehash(); // perform a slice of incremental rehashing
    }
    ....
}

tryResizeHashTables函數代碼分析:

/*
 * For every database, resize the key-space dictionary and the expires
 * dictionary when htNeedsResize() reports that they need it.
 */
void tryResizeHashTables(void) {
    int dbid = 0;

    while (dbid < server.dbnum) {
        /* Key-space dictionary */
        if (htNeedsResize(server.db[dbid].dict))
            dictResize(server.db[dbid].dict);

        /* Expiration-time dictionary */
        if (htNeedsResize(server.db[dbid].expires))
            dictResize(server.db[dbid].expires);

        dbid++;
    }
}

htNeedsResize函數用來判斷dict是否需要進行縮小：當填充率低於 REDIS_HT_MINFILL（即10%）時返回1，表示需要縮小；填充率不低於10%時則不縮小。具體代碼如下：

/*
 * Tell whether a dictionary should be shrunk.
 *
 * Returns 1 when the table has buckets and elements, has more than
 * DICT_HT_INITIAL_SIZE buckets, and its fill percentage
 * (elements * 100 / buckets) is below REDIS_HT_MINFILL; returns 0 otherwise.
 */
int htNeedsResize(dict *dict) {
    long long slots = dictSlots(dict);   /* number of buckets */
    long long entries = dictSize(dict);  /* number of stored elements */

    /* An empty table, or one without buckets, never needs shrinking
     * (this also protects the division below). */
    if (!slots || !entries) return 0;

    /* Never shrink below the initial size. */
    if (slots <= DICT_HT_INITIAL_SIZE) return 0;

    return entries*100/slots < REDIS_HT_MINFILL;
}

dictResize函數代碼:

/*
 * Resize the table to the smallest size that still holds all of its
 * elements, but never request fewer than DICT_HT_INITIAL_SIZE buckets.
 *
 * Returns DICT_ERR when resizing is disabled (dict_can_resize is false) or
 * the dictionary is currently rehashing; otherwise returns the result of
 * dictExpand().
 */
int dictResize(dict *d)
{
    /* Use the same type as dictExpand()'s size parameter. Storing the
     * element count in an int would truncate (or turn negative) counts
     * that do not fit in int before they are handed to dictExpand(). */
    unsigned long minimal;

    if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;

    minimal = d->ht[0].used;

    if (minimal < DICT_HT_INITIAL_SIZE)
        minimal = DICT_HT_INITIAL_SIZE;

    return dictExpand(d, minimal);
}

以上兩個過程最終都調用了dictExpand函數，這個函數主要是產生一個新的HASH表（dictht），並將dict.rehashidx設置爲0，表示開始進行rehash動作（如果ht[0]還沒有分配，則直接把新表作爲ht[0]，不需要rehash）。具體的rehash動作是將ht[0]的數據按照hash映射的規則重新映射到 ht[1]上。具體代碼如下:

/*
 * Create a new hash table able to hold `size` elements and install it.
 *
 * The real bucket count is _dictNextPower(size). Returns DICT_ERR if the
 * dictionary is rehashing, if `size` is smaller than the number of stored
 * elements, or if the table already has exactly that bucket count.
 * Otherwise returns DICT_OK.
 */
int dictExpand(dict *d, unsigned long size)
{
    unsigned long buckets = _dictNextPower(size);
    dictht fresh; /* the new hash table that data will be moved into */

    if (dictIsRehashing(d)) return DICT_ERR;
    if (d->ht[0].used > size) return DICT_ERR;
    if (d->ht[0].size == buckets) return DICT_ERR;

    /* Build the new table with a zeroed bucket array. */
    fresh.table = zcalloc(buckets*sizeof(dictEntry*));
    fresh.size = buckets;
    fresh.sizemask = buckets-1;
    fresh.used = 0;

    if (d->ht[0].table != NULL) {
        /* ht[0] already exists: prepare the second table for incremental
         * rehashing and turn the rehash flag on. */
        d->ht[1] = fresh;
        d->rehashidx = 0;
    } else {
        /* First initialization: the new table simply becomes ht[0]. */
        d->ht[0] = fresh;
    }

    return DICT_OK;
}

字典dict的rehashidx被設置成0後，就表示開始rehash動作，在心跳函數執行的過程中，會檢查到這個標誌，如果需要rehash,就會進行漸進式rehash動作。函數調用的過程爲:

serverCron->incrementallyRehash->dictRehashMilliseconds->dictRehash

incrementallyRehash函數代碼：

/*
 * Called from the Redis cron: performs 1 millisecond of incremental
 * rehashing on the first database dictionary found to be rehashing.
 *
 * NOTE: this is a truncated excerpt; the rest of the loop body is elided.
 */
void incrementallyRehash(void) {
    int j;

    for (j = 0; j < server.dbnum; j++) {
        /* Keys dictionary */
        if (dictIsRehashing(server.db[j].dict)) {
            dictRehashMilliseconds(server.db[j].dict,1);
            break; /* the CPU millisecond budget has already been used up */
        }
	...
}

dictRehashMilliseconds函數是按照指定的CPU運算的毫秒數，執行rehash動作，每次以100步爲單位執行。代碼如下:

/*
 * Rehash the dictionary in batches of 100 steps until either the rehash
 * finishes or more than `ms` milliseconds have elapsed.
 *
 * Returns the number of steps accounted for (a multiple of 100).
 */
int dictRehashMilliseconds(dict *d, int ms) {
    long long began = timeInMilliseconds();
    int moved = 0;

    for (;;) {
        /* dictRehash() returns 0 once there is nothing left to migrate. */
        if (!dictRehash(d,100)) break;
        moved += 100;
        /* Time budget exhausted: pause the rehash. */
        if (timeInMilliseconds()-began > ms) break;
    }
    return moved;
}

/*
 * Perform up to N steps of incremental rehashing.
 *
 * Returns 0 if the dictionary is not rehashing, or once ht[0] is found
 * empty at the start of a step (the tables are then swapped and rehashing
 * is switched off). Returns 1 after N steps have been performed, meaning
 * the caller should call again.
 *
 * Each step moves the whole chain stored at one index of ht[0]'s bucket
 * array, so a single step may migrate more than one key to ht[1].
 */
int dictRehash(dict *d, int n) {
    if (!dictIsRehashing(d)) return 0;

    while(n--) {
        dictEntry *de, *nextde;
        // If ht[0] has no elements left, the migration is complete:
        // ht[1] takes the place of the old ht[0]
        if (d->ht[0].used == 0) {

            // Release the bucket array of ht[0]
            zfree(d->ht[0].table);

            // ht[0] now takes over the contents of ht[1]
            d->ht[0] = d->ht[1];

            // Reset ht[1] (presumably clears its fields; _dictReset is defined elsewhere)
            _dictReset(&d->ht[1]);
            // Turn the rehash flag off
            d->rehashidx = -1;
            // Tell the caller that rehashing is finished
            return 0;
        }
        assert(d->ht[0].size > (unsigned)d->rehashidx);
        // Advance to the first index whose chain is not NULL
        // (ht[0].used != 0 here, so at least one non-empty bucket remains)
        while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
        // Point at the head of the chain
        de = d->ht[0].table[d->rehashidx];
        // Move every element of this chain from ht[0] to ht[1]
        while(de) {
            unsigned int h;

            // Save the successor first: de->next is overwritten below
            nextde = de->next;

            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;

            // Insert the node at the head of the target chain in ht[1]
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;

            // Update the element counters of both tables
            d->ht[0].used--;
            d->ht[1].used++;

            de = nextde;
        }

        // Clear the migrated bucket so later steps skip over it
        d->ht[0].table[d->rehashidx] = NULL;

        // Move on to the next index
        d->rehashidx++;
    }

    // Tell the caller there may still be elements waiting to be rehashed
    return 1;
}

總結，Redis的rehash動作是內存管理和數據管理的一個核心操作。由於Redis主要使用單線程做數據管理和消息響應，它的rehash數據遷移過程採用的是漸進式的數據遷移模式，這樣做是爲了防止rehash過程太長而堵塞數據處理線程，並沒有採用memcached的多線程遷移模式。關於memcached的rehash過程，以後再做介紹。redis的rehash過程設計得很巧妙，也很優雅。在這裏值得注意的是，redis在find數據的時候，會同時查找正在遷移的ht[0]和被遷移到的ht[1]，防止遷移過程中出現數據未命中的問題。

zerok775

發佈了39 篇原創文章 · 獲贊 76 · 訪問量 18萬+

私信關注

Redis的字典(dict)rehash過程源碼解析

工作中用到的腳本合集

24-5-18 X

C++高性能服務框架revolver:base結構分析

C++高性能服務框架revolver:同時支持100萬個的定時事件的定時器

linux和windows下UDP發送效率的有趣比較

一種服務器的負載均衡選取算法

關於C函數memcpy的實現細節思考

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結