redis replication主從複製的源碼分析(1)

        對於replication.c的源碼分析,我將分兩部分介紹:主從複製的建立過程(本文),以及主從同步的複製過程(見《redis replication主從複製的源碼分析(2)》)。

本文主要分析slave連接master進行主從複製的過程實現。

     redis-cli通過向從服務器發送slaveof命令,可以使從服務器去複製一個主服務器:

      slaveof <master_ip> <master_port>

     主從複製的詳細的步驟如下:

    1、設置主服務器的地址和端口

    2、建立套接字連接

    3、發送ping命令

    4、身份驗證

    5、發送端口信息

    6、同步

    7、命令傳播

    replicationSetMaster() 設置主服務器的地址和端口,初始化replication狀態

/* Point this instance at a new master and reset its replication state.
 *
 * ip / port: address of the master to replicate from; ip is copied with
 * sdsnew() into server.masterhost.
 *
 * The previous master client (if any) is freed, blocked clients and attached
 * slaves are disconnected, the cached master and the replication backlog are
 * dropped, any handshake in progress is cancelled, and finally repl_state is
 * set to REPL_STATE_CONNECT with the offsets zeroed. */
void replicationSetMaster(char *ip, int port) {
    sdsfree(server.masterhost);
    server.masterhost = sdsnew(ip);
    server.masterport = port;
    if (server.master) freeClient(server.master);
    disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    replicationDiscardCachedMaster(); /* Don't try a PSYNC. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
    cancelReplicationHandshake();
    server.repl_state = REPL_STATE_CONNECT;//set repl_state: ready to start replication
    server.master_repl_offset = 0;//reset the replication offset
    server.repl_down_since = 0;
}

    connectWithMaster()建立套接字連接,設置事件回調syncWithMaster()

/* Open the socket to the configured master and hook it into the event loop.
 *
 * The socket is obtained from anetTcpNonBlockBestEffortBindConnect() using
 * server.masterhost / server.masterport, and syncWithMaster() is registered
 * as the handler for both readable and writable events on it.
 *
 * Returns C_OK on success, after recording the socket in
 * server.repl_transfer_s and moving repl_state to REPL_STATE_CONNECTING.
 * Returns C_ERR if the socket could not be obtained, or if the event could
 * not be registered (in which case the socket is closed first). */
int connectWithMaster(void) {
    int master_fd = anetTcpNonBlockBestEffortBindConnect(NULL,
        server.masterhost, server.masterport, NET_FIRST_BIND_ADDR);
    if (master_fd == -1) return C_ERR;

    /* syncWithMaster() is called when the socket becomes readable/writable. */
    int registered = aeCreateFileEvent(server.el, master_fd,
        AE_READABLE|AE_WRITABLE, syncWithMaster, NULL);
    if (registered == AE_ERR) {
        close(master_fd);
        return C_ERR;
    }

    /* Connection attempt is under way: remember the socket and the time. */
    server.repl_state = REPL_STATE_CONNECTING;
    server.repl_transfer_s = master_fd;
    server.repl_transfer_lastio = server.unixtime;
    return C_OK;
}

    syncWithMaster()發送ping命令,身份驗證,發送端口信息,通知master自己可以解析rdb,進行同步。基本流程如下:

/* File-event handler that drives the replica side of the replication
 * handshake with the master over socket fd.
 *
 * Each call inspects server.repl_state and performs the matching step:
 * PING -> (optional) AUTH -> REPLCONF listening-port -> REPLCONF capa eof ->
 * PSYNC. Every "send" step returns right after writing; the matching
 * "receive" step reads the reply and falls through to the next send step.
 * Once the PSYNC reply calls for a transfer, readSyncBulkPayload is installed
 * as the readable handler and the state becomes REPL_STATE_TRANSFER.
 *
 * Lines shown as "……" are elided in this excerpt (including the declarations
 * of err, psync_result, dfd and tmpfile and the bodies of the error labels).
 *
 * NOTE(review): in the visible lines, the replies read in the RECEIVE_PONG,
 * RECEIVE_AUTH and RECEIVE_PORT steps are not passed to sdsfree() on the
 * success path, unlike RECEIVE_CAPA — confirm against the full source. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {    
    ……
    //No replication is active (REPL_STATE_NONE): close the socket and return
    if (server.repl_state == REPL_STATE_NONE) {
        close(fd);
        return;
    }
    ……
    //State CONNECTING: stop watching for writability, then send PING to the master
    if (server.repl_state == REPL_STATE_CONNECTING) {
        aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
        server.repl_state = REPL_STATE_RECEIVE_PONG;//now waiting for the PING reply
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
        if (err) goto write_error;
        return;
    }//Read the master's reply to PING
    if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        //Anything other than a '+' reply, -NOAUTH, or
        //"-ERR operation not permitted" is treated as an error
        if (err[0] != '+' &&strncmp(err,"-NOAUTH",7) != 0 &&
            strncmp(err,"-ERR operation not permitted",28) != 0)
        {
            ……
            goto error;
        } 
        server.repl_state = REPL_STATE_SEND_AUTH;//next step: send AUTH
    }
    //Authentication: send AUTH with server.masterauth to the master
    if (server.repl_state == REPL_STATE_SEND_AUTH) {
        if (server.masterauth) {
            err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
            if (err) goto write_error;
            server.repl_state = REPL_STATE_RECEIVE_AUTH;
            return;
        } else {//No masterauth configured: go straight to REPL_STATE_SEND_PORT
            server.repl_state = REPL_STATE_SEND_PORT;
        }
    }//Read the reply to AUTH; a reply starting with '-' aborts the handshake
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {        
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        server.repl_state = REPL_STATE_SEND_PORT;
    }//Send our listening port to the master with "REPLCONF listening-port"
    if (server.repl_state == REPL_STATE_SEND_PORT) {
        sds port = sdsfromlonglong(server.port);
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
                "listening-port",port, NULL);
        ……
        server.repl_state = REPL_STATE_RECEIVE_PORT;
        return;
    }  
    if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
     //Read the reply to "REPLCONF listening-port"
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            ……
        }
        server.repl_state = REPL_STATE_SEND_CAPA;
    }
    //Send "REPLCONF capa eof" (original note: tells the master which RDB
    //format this replica can parse)
    if (server.repl_state == REPL_STATE_SEND_CAPA) {           
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
                "capa","eof",NULL);
        if (err) goto write_error;
        sdsfree(err);
        server.repl_state = REPL_STATE_RECEIVE_CAPA;
        return;
    }//Read the reply to "REPLCONF capa eof"
    if (server.repl_state == REPL_STATE_RECEIVE_CAPA) {       
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        if (err[0] == '-') {
            ……
        }
        sdsfree(err);
        server.repl_state = REPL_STATE_SEND_PSYNC;
    }
    //Send PSYNC to the master. Per the original note, a partial resync is
    //attempted when a cached master exists, otherwise a full resync
    //(the helper is not shown here — confirm)
    if (server.repl_state == REPL_STATE_SEND_PSYNC) {
        if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
            err = sdsnew("Write error sending the PSYNC command.");
            goto write_error;
        }
        server.repl_state = REPL_STATE_RECEIVE_PSYNC;
        return;
    }
    /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC. */
    if (server.repl_state != REPL_STATE_RECEIVE_PSYNC) {
        ……
        goto error;
    }
    //Read the master's reply to PSYNC and continue with the synchronization
    psync_result = slaveTryPartialResynchronization(fd,1);
    if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
    if (psync_result == PSYNC_CONTINUE) {
        serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
        return;
    }
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
    //If the master does not support PSYNC, fall back to the SYNC command
    //(original note: the older synchronization mechanism)
    if (psync_result == PSYNC_NOT_SUPPORTED) {
        serverLog(LL_NOTICE,"Retrying with SYNC...");
        if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
            ……
            goto error;
        }
    }
    ……
    //Register readSyncBulkPayload as the readable-event handler for the
    //synchronization data sent back by the master
    if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
            == AE_ERR)
    {
        ……
    }
    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return;
error:
    ……
write_error: /* Handle sendSynchronousCommand(SYNC_CMD_WRITE) errors. */
    ……
}
       接下來看看主從複製的調度中心replicationCron,主要負責監控主從複製過程中的各個狀態,

並根據不同情況作出不同處理。

// replicationCron is the replication scheduler. It is invoked from serverCron
// through run_with_period(1000) (original note: serverCron is Redis's only
// time-event callback and runs replicationCron once per second — the rest of
// serverCron is elided here as "……").
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ……
	run_with_period(1000) replicationCron();
	……
}
/* Periodic replication housekeeping, covering both the replica role
 * (timeouts, connecting to the master, sending ACKs) and the master role
 * (pinging replicas, dropping timed-out replicas, releasing the backlog and
 * script cache, starting a BGSAVE for waiting replicas).
 *
 * Takes no arguments and returns nothing; it works on the global server
 * state. replication_cron_loops counts how many times it has run. */
void replicationCron(void) {
    static long long replication_cron_loops = 0;
    //Replica: timeout while connecting or during the handshake
    if (server.masterhost &&
        (server.repl_state == REPL_STATE_CONNECTING ||
         slaveIsInHandshakeState()) &&
         (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
    {
        cancelReplicationHandshake();
    }
    //Replica: timeout in REPL_STATE_TRANSFER (original note: while receiving the .rdb)
    if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER &&
        (time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
    {
        cancelReplicationHandshake();
    }
    //Replica: connected, but no interaction with the master for longer than repl_timeout
    if (server.masterhost && server.repl_state == REPL_STATE_CONNECTED &&
        (time(NULL)-server.master->lastinteraction) > server.repl_timeout)
    {
        freeClient(server.master);
    }
    //Replica: check whether a connection to the master has to be started
    if (server.repl_state == REPL_STATE_CONNECT) {
        serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
            server.masterhost, server.masterport);
        //Open the socket connection to the master
        if (connectWithMaster() == C_OK) {
            serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
        }
    }// Replica: send an ACK to the master, unless it is flagged CLIENT_PRE_PSYNC
    if (server.masterhost && server.master &&
        !(server.master->flags & CLIENT_PRE_PSYNC))
        replicationSendAck();
    listIter li;
    listNode *ln;
    robj *ping_argv[1];
    //Master: send PING to the replicas every repl_ping_slave_period iterations
    //NOTE(review): modulo by zero if repl_ping_slave_period is 0 — confirm it
    //is validated elsewhere
    if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
        ping_argv[0] = createStringObject("PING",4);
        replicationFeedSlaves(server.slaves, server.slaveseldb,
            ping_argv, 1);
        decrRefCount(ping_argv[0]);
    }	
    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        client *slave = ln->value;
		//Master: write a single newline to every replica matching one of these
		//conditions (original note: this refreshes the replica's last-io timer):
		//1. the replica is waiting for an RDB save to start
		//2. the replica is waiting for the RDB save to end and the RDB child
		//   is not of type RDB_CHILD_TYPE_SOCKET
        if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
            (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
             server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
        {
            if (write(slave->fd, "\n", 1) == -1) {
                /* Don't worry, it's just a ping. */
            }
        }
    }//Master: disconnect online replicas whose last ACK is older than repl_timeout
    if (listLength(server.slaves)) {
        /* These declarations shadow the function-scope li / ln above. */
        listIter li;
        listNode *ln;
        listRewind(server.slaves,&li);
        while((ln = listNext(&li))) {
            client *slave = ln->value;
            if (slave->replstate != SLAVE_STATE_ONLINE) continue;
            if (slave->flags & CLIENT_PRE_PSYNC) continue;
            if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout)
            {
                freeClient(slave);
            }
        }
    }//Master with no replicas: free the backlog once idle longer than repl_backlog_time_limit
    if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
        server.repl_backlog)
    {
        time_t idle = server.unixtime - server.repl_no_slaves_since;

        if (idle > server.repl_backlog_time_limit) {
            freeReplicationBacklog();
        }
    }//Master with AOF off and no replicas: flush the replication script cache
    if (listLength(server.slaves) == 0 &&
        server.aof_state == AOF_OFF &&
        listLength(server.repl_scriptcache_fifo) != 0)
    {
        replicationScriptCacheFlush();
    }//Both rdb_child_pid and aof_child_pid are -1 (original note: no persistence in progress)
    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
        time_t idle, max_idle = 0;
        int slaves_waiting = 0;
        int mincapa = -1;
        /* These declarations shadow the function-scope ln / li above. */
        listNode *ln;
        listIter li;
        listRewind(server.slaves,&li);
        //Over replicas in WAIT_BGSAVE_START: count them, track the largest idle
        //time, and AND together their slave_capa values
        while((ln = listNext(&li))) {
            client *slave = ln->value;
            if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
                idle = server.unixtime - slave->lastinteraction;
                if (idle > max_idle) max_idle = idle;
                slaves_waiting++;
                mincapa = (mincapa == -1) ? slave->slave_capa :
                                            (mincapa & slave->slave_capa);
            }
        }		
        if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
            //At least one waiting replica has been idle longer than
            //repl_diskless_sync_delay: start the BGSAVE for replication
            startBgsaveForReplication(mincapa);
        }
    }
    //Refresh the count of good replicas (original note: those whose lag is below the threshold)
    refreshGoodSlavesCount();
    replication_cron_loops++; /* Incremented with frequency 1 HZ. */
}
replication.c的主要函數
 /* ---------------------------------- MASTER -------------------------------- */  
void createReplicationBacklog(void) /* 創建複製積壓緩衝區 */  
void resizeReplicationBacklog(long long newsize) /* 調整複製積壓緩衝區的大小*/  
void freeReplicationBacklog(void) /* 釋放複製積壓緩衝區*/  
void feedReplicationBacklog(void *ptr, size_t len) /* 將寫命令添加到複製積壓緩衝區*/  
void feedReplicationBacklogWithObject(robj *o) /*將寫命令添加到複製積壓緩衝區,但以對象的格式作爲參數 */ 
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) /* 將主數據庫複製到從數據庫 */  
void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, 
robj **argv, int argc) /* 發送數據給monitor監聽者 */  
long long addReplyReplicationBacklog(redisClient *c, long long offset) 
/* 將複製積壓緩衝區的offset到end的添加client的reply*/
int masterTryPartialResynchronization(redisClient *c) /* 主服務器嘗試部分重同步 */  
void syncCommand(redisClient *c) /* 同步命令函數 */  
void replconfCommand(redisClient *c) /* 此函數用於從服務器進行配置複製進程中的執行參數設置 */  
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) /* 給slave發送BULK數據 */  
void updateSlavesWaitingBgsave(int bgsaveerr, int type) /* 此方法將用於後臺保存進程快結束時調用,更新slave */        
/* ----------------------------------- SLAVE -------------------------------- */  
void replicationAbortSyncTransfer(void) /* 中止與master的同步操作 */  
void replicationSendNewlineToMaster(void)  
void replicationEmptyDbCallback(void *privdata)  
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) 
/* 從服務器讀取同步的Sync的BULK數據 */  
char *sendSynchronousCommand(int flags, int fd, ...)  /* 從服務器給主服務器進行同步數據的命令和接收相應的回覆 */  
int slaveTryPartialResynchronization(int fd) /* 從服務器嘗試部分重同步操作 */  
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) 
/* 與主服務器保持同步,期間包括發送ping命令,身份驗證,發送端口信息 */  
int connectWithMaster(void) /* 連接服務器,設置事件回調 syncWithMaster*/  
void undoConnectWithMaster(void) /* 斷開與主服務器的連接 */  
int cancelReplicationHandshake(void) /* 當已經存在一個複製進程時,中止一個非阻塞的replication複製的嘗試 */  
void replicationSetMaster(char *ip, int port) /* 設置主服務器的ip地址和端口號 */  
void replicationUnsetMaster(void)  
void slaveofCommand(redisClient *c)  
void roleCommand(redisClient *c)  
void replicationSendAck(void) /* 發送ACK包給主服務器 ,告知當前的進程偏移量 */       
/* ---------------------- MASTER CACHING FOR PSYNC -------------------------- */  
void replicationCacheMaster(redisClient *c) /* 緩存主服務器信息 */  
void replicationDiscardCachedMaster(void) /* 當某個從服務器將不會再回復的時候,可以釋放掉緩存的主服務器信息 */  
void replicationResurrectCachedMaster(int newfd) /* 將緩存主服務器復活 */       
/* ------------------------- MIN-SLAVES-TO-WRITE  --------------------------- */  
void refreshGoodSlavesCount(void) /*刷新延遲小於閾值的slave的數量*/  
void replicationScriptCacheInit(void)  
void replicationScriptCacheFlush(void)  
void replicationScriptCacheAdd(sds sha1)  
int replicationScriptCacheExists(sds sha1)  
void replicationCron(void) //主從複製的調度中心

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章