Redis的rdb格式學習

rdb格式背景

在redis中，rdb格式是經過壓縮之後，保存redis的數據的一種格式，該格式主要就是通過一定的壓縮算法，將redis服務端中的內存數據落盤到文件中，本文主要就是分析一下該協議的具體格式，並解析一下。

rdb格式

rdb的格式的詳細格式可參考官網，其中最主要的格式如下所示，

----------------------------# RDB is a binary format. There are no new lines or spaces in the file.
52 45 44 49 53              # Magic String "REDIS"
30 30 30 37                 # 4 digit ASCII RDB Version Number. In this case, version = "0007" = 7
----------------------------
FE 00                       # FE = code that indicates database selector. db number = 00
----------------------------# Key-Value pair starts
FD $unsigned int            # FD indicates "expiry time in seconds". After that, expiry time is read as a 4 byte unsigned int
$value-type                 # 1 byte flag indicating the type of value - set, map, sorted set etc.
$string-encoded-key         # The key, encoded as a redis string
$encoded-value              # The value. Encoding depends on $value-type
----------------------------
FC $unsigned long           # FC indicates "expiry time in ms". After that, expiry time is read as a 8 byte unsigned long
$value-type                 # 1 byte flag indicating the type of value - set, map, sorted set etc.
$string-encoded-key         # The key, encoded as a redis string
$encoded-value              # The value. Encoding depends on $value-type
----------------------------
$value-type                 # This key value pair doesn't have an expiry. $value_type guaranteed != to FD, FC, FE and FF
$string-encoded-key
$encoded-value
----------------------------
FE $length-encoding         # Previous db ends, next db starts. Database number read using length encoding.
----------------------------
...                         # Key value pairs for this database, additional databases
                            
FF                          ## End of RDB file indicator
8 byte checksum             ## CRC 64 checksum of the entire file.

看了這個圖之後，大致知道了rdb格式的過程，

首先，寫入魔術字符串"REDIS"，然後接下來四個字節就是rdb的版本號。
如果讀到的是FE，則是數據庫編號。
解析數據庫中每一個的key-value，每一對的key-value的形式可能有三種形式，第一，沒有過期時間的，開頭直接就是value-type，然後再就是編碼的key，接着就是編碼的value，第二，有過期時間爲秒的（以FD開頭），FD之後的四個字節是過期時間，接下來是value-type，然後是key，最後是value，第三，有過期時間爲毫秒的（以FC開頭），FC之後的八個字節是過期時間，接下來是value-type，然後是key，最後是value。
如果還有其他數據庫則繼續重複從第二步開始。
最後讀到的是FF，這標誌着rdb文件結束，最後八個字節就是整個文件的CRC64校驗和（checksum）。

看了文檔之後，大致給的說明是這樣，那我們深入查看一下redis是如何寫rdb文件的。

redis寫rdb文件的過程

首先查看redis源碼中的rdb.c文件

/* Write the dataset to `rdb` in RDB format, in this order: the "REDIS" magic
 * plus 4-digit version, the aux fields, then for every non-empty database a
 * SELECTDB opcode, a RESIZEDB hint and each key-value pair; after that the
 * Lua scripts (only when `rsi` is set), the EOF opcode and the 8-byte
 * checksum.
 *
 * Returns C_OK on success. On a failed write it jumps to `werr`, which stores
 * errno in *error (when `error` is non-NULL), releases any iterator still
 * held and returns C_ERR. */
int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
    dictIterator *di = NULL;
    dictEntry *de;
    char magic[10];
    int j;
    uint64_t cksum;
    size_t processed = 0;

    if (server.rdb_checksum)                                                // Checksumming enabled: install the checksum-update callback
        rdb->update_cksum = rioGenericUpdateChecksum;
    snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);                  // Build the magic: "REDIS" followed by the 4-digit version
    if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;                          // Write the 9 magic bytes (no terminating NUL)
    if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr;               // Write the aux fields

    for (j = 0; j < server.dbnum; j++) {                                    // Dump every database in turn
        redisDb *db = server.db+j;
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;                                     // Skip empty databases
        di = dictGetSafeIterator(d);                                        // Safe iterator over the keyspace dict

        /* Write the SELECT DB opcode */
        if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;          // Database selector marker
        if (rdbSaveLen(rdb,j) == -1) goto werr;                             // Database index, length-encoded

        /* Write the RESIZE DB opcode. We trim the size to UINT32_MAX, which
         * is currently the largest type we are able to represent in RDB sizes.
         * However this does not limit the actual size of the DB to load since
         * these sizes are just hints to resize the hash tables. */
        uint64_t db_size, expires_size;
        db_size = dictSize(db->dict);
        expires_size = dictSize(db->expires);
        if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;          // RESIZEDB marker
        if (rdbSaveLen(rdb,db_size) == -1) goto werr;                       // Number of keys in the database
        if (rdbSaveLen(rdb,expires_size) == -1) goto werr;                  // Number of entries in the expires dict

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {                                // Walk every entry of this database
            sds keystr = dictGetKey(de);                                    // Key as an sds string
            robj key, *o = dictGetVal(de);                                  // Stack-allocated key object and the stored value
            long long expire;

            initStaticStringObject(key,keystr);
            expire = getExpire(db,&key);                                    // Expire time; rdbSaveKeyValuePair() only emits it when it is not -1
            if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr;    // Write the expire (if any), type, key and value

            /* When this RDB is produced as part of an AOF rewrite, move
             * accumulated diff from parent to child while rewriting in
             * order to have a smaller final write. */
            if (flags & RDB_SAVE_AOF_PREAMBLE &&
                rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES)      // Only after another AOF_READ_DIFF_INTERVAL_BYTES were processed
            {
                processed = rdb->processed_bytes;
                aofReadDiffFromParent();
            }
        }
        dictReleaseIterator(di);                                            // Done with this database's iterator
        di = NULL; /* So that we don't release it again on error. */
    }

    /* If we are storing the replication information on disk, persist
     * the script cache as well: on successful PSYNC after a restart, we need
     * to be able to process any EVALSHA inside the replication backlog the
     * master will send us. */
    if (rsi && dictSize(server.lua_scripts)) {
        di = dictGetIterator(server.lua_scripts);
        while((de = dictNext(di)) != NULL) {
            robj *body = dictGetVal(de);
            if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1)   // One "lua" aux field per cached script body
                goto werr;
        }
        dictReleaseIterator(di);
        di = NULL; /* So that we don't release it again on error. */
    }

    /* EOF opcode */
    if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;          // End-of-file marker

    /* CRC64 checksum. It will be zero if checksum computation is disabled, the
     * loading code skips the check in this case. */
    cksum = rdb->cksum;                                            // Checksum accumulated while writing
    memrev64ifbe(&cksum);
    if (rioWrite(rdb,&cksum,8) == 0) goto werr;                    // Append the 8 checksum bytes; rioWrite() returning 0 means failure
    return C_OK;

werr:
    if (error) *error = errno;
    if (di) dictReleaseIterator(di);
    return C_ERR;
}

從rdbSaveRio的函數執行流程來看，跟文檔描述的基本吻合，我們着重先查看一下rdbSaveLen和rdbSaveRawString兩個函數；

/* Write `len` to `rdb` using the shortest of four length encodings and return
 * the number of bytes written (1, 2, 5 or 9), or -1 if a write fails. */
int rdbSaveLen(rio *rdb, uint64_t len) {                // Save a length
    unsigned char buf[2];
    size_t nwritten;

    if (len < (1<<6)) {                                     // len < 64: fits in 6 bits
        /* Save a 6 bit len */
        buf[0] = (len&0xFF)|(RDB_6BITLEN<<6);               // Encoding type in the top two bits, length in the low six
        if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
        nwritten = 1;
    } else if (len < (1<<14)) {                             // 64 <= len < 16384: fits in 14 bits
        /* Save a 14 bit len */
        buf[0] = ((len>>8)&0xFF)|(RDB_14BITLEN<<6);         // Encoding type in the top two bits plus the high 6 bits of the length
        buf[1] = len&0xFF;                                  // Low 8 bits of the length
        if (rdbWriteRaw(rdb,buf,2) == -1) return -1;
        nwritten = 2;
    } else if (len <= UINT32_MAX) {                         // 16384 <= len <= UINT32_MAX: marker byte followed by a 32-bit value
        /* Save a 32 bit len */
        buf[0] = RDB_32BITLEN;
        if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
        uint32_t len32 = htonl(len);                        // Stored in network byte order
        if (rdbWriteRaw(rdb,&len32,4) == -1) return -1;
        nwritten = 1+4;
    } else {
        /* Save a 64 bit len */
        buf[0] = RDB_64BITLEN;                              // Larger than UINT32_MAX: marker byte followed by a 64-bit value
        if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
        len = htonu64(len);                                 // NOTE(review): presumably the 64-bit counterpart of htonl - confirm
        if (rdbWriteRaw(rdb,&len,8) == -1) return -1;
        nwritten = 1+8;
    }
    return nwritten;
}

從該函數保存長度來看，通過不同的長度選擇不同的位數來保存該長度信息從而優化rdb減少rdb文件的大小，接下來我們着重查看一下rdbSaveRawString函數，該函數主要就是在保存完長度之後，保存接下來的string的內容；

/* Save the `len` bytes at `s` as an RDB string. Three encodings are tried in
 * order: integer encoding (only for len <= 11), LZF compression (only when
 * server.rdb_compression is set and len > 20), and finally a length prefix
 * followed by the raw bytes.
 *
 * Returns the number of bytes written, or -1 if a write fails. */
ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len) {
    int enclen;
    ssize_t n, nwritten = 0;

    /* Try integer encoding */
    if (len <= 11) {                                                            // Only strings of at most 11 bytes are candidates
        unsigned char buf[5];
        if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {           // > 0: encoded into buf, enclen bytes long
            if (rdbWriteRaw(rdb,buf,enclen) == -1) return -1;                   // Write the encoded integer
            return enclen;
        }
    }

    /* Try LZF compression - under 20 bytes it's unable to compress even
     * aaaaaaaaaaaaaaaaaa so skip it */
    if (server.rdb_compression && len > 20) {                                   // Compression enabled and string longer than 20 bytes
        n = rdbSaveLzfStringObject(rdb,s,len);                                  // Attempt to save it LZF-compressed
        if (n == -1) return -1;
        if (n > 0) return n;
        /* Return value of 0 means data can't be compressed, save the old way */
    }

    /* Store verbatim */
    if ((n = rdbSaveLen(rdb,len)) == -1) return -1;                             // Fallback for everything not handled above: length prefix first
    nwritten += n;
    if (len > 0) {
        if (rdbWriteRaw(rdb,s,len) == -1) return -1;                            // Then the raw bytes
        nwritten += len;
    }
    return nwritten;
}

從該函數的保存方式來看，保存的格式分成了三種：長度不超過11時先嚐試整形方式編碼；如果開啓了壓縮並且長度超過20則嘗試使用lzf方式壓縮；其餘情況（包括整形編碼或壓縮不成功時）則先寫入長度再直接保存原始內容。

在rdb保存的過程中，保存key-value類型的處理函數是rdbSaveKeyValuePair，

/* Write one key-value pair to `rdb`. The pieces are emitted in this order:
 * the expire time in milliseconds (only when `expiretime` is not -1), the
 * LRU idle time and/or LFU counter (depending on server.maxmemory_policy),
 * then the value type, the key and the value itself.
 *
 * Returns 1 on success, -1 as soon as any write fails. */
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) {
    int savelru = server.maxmemory_policy & MAXMEMORY_FLAG_LRU;             // Non-zero when the maxmemory policy has the LRU flag
    int savelfu = server.maxmemory_policy & MAXMEMORY_FLAG_LFU;             // Non-zero when the maxmemory policy has the LFU flag

    /* Save the expire time */
    if (expiretime != -1) {                                                 // -1 means the key has no expire
        if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1;     // Millisecond-expire marker
        if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1;        // The expire time itself
    }

    /* Save the LRU info. */
    if (savelru) {
        uint64_t idletime = estimateObjectIdleTime(val);
        idletime /= 1000; /* Using seconds is enough and requires less space.*/
        if (rdbSaveType(rdb,RDB_OPCODE_IDLE) == -1) return -1;
        if (rdbSaveLen(rdb,idletime) == -1) return -1;
    }

    /* Save the LFU info. */
    if (savelfu) {
        uint8_t buf[1];
        buf[0] = LFUDecrAndReturn(val);
        /* We can encode this in exactly two bytes: the opcode and an 8
         * bit counter, since the frequency is logarithmic with a 0-255 range.
         * Note that we do not store the halving time because to reset it
         * a single time when loading does not affect the frequency much. */
        if (rdbSaveType(rdb,RDB_OPCODE_FREQ) == -1) return -1;
        if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
    }

    /* Save type, key, value */
    if (rdbSaveObjectType(rdb,val) == -1) return -1;                        // Type of the value
    if (rdbSaveStringObject(rdb,key) == -1) return -1;                      // The key
    if (rdbSaveObject(rdb,val) == -1) return -1;                        // The value's contents
    return 1;
}

其中最主要的就是rdbSaveObject，該函數就是保存了redis的各種的對應的數據結構的數據，大家有興趣可以自行翻閱一下該函數的流程。

總結

Python相關的rdb解析工具現在用的比較多的是rdbtools，查看了協議格式可以看出，格式的解析確實相對有些繁瑣並沒有redis協議那麼容易去實現，大家可看一下rdbtools有關協議解析的核心代碼，位於rdbtools/parser.py中，主要的解析邏輯都位於其中，跟redis寫rdb格式的邏輯對接起來就可以大致知道協議的生成與解析。

Redis的rdb格式學習

rdb格式背景

rdb格式

redis寫rdb文件的過程

總結

Redis的rdb格式學習

遍歷百萬級Redis的鍵值的大結局

租約-代碼實踐

golang源碼分析：調度器chan調度

兩階段提交實際項目V1

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結