對於replication.c的源碼分析,我將會分兩部分介紹主從複製的過程和主從同步的複製《redis replication主
從複製的源碼分析(2)》。本文主要分析slave連接master進行主從複製的過程實現。
redis-cli通過向從服務器發送slaveof命令,可以使從服務器去複製一個主服務器:
slaveof <master_ip> <master_port>
主從複製的詳細的步驟如下:
1、設置主服務器的地址和端口
2、建立套接字連接
3、發送ping命令
4、身份驗證
5、發送端口信息
6、同步
7、命令傳播
replicationSetMaster() 設置主服務器的地址和端口,初始化replication狀態
/* Point this server at a new master.
 *
 * Records the master's address and port in the global server state, drops
 * the current master client (if any), disconnects blocked clients and our
 * own slaves, discards the cached master and the replication backlog,
 * cancels any handshake in progress, and finally sets repl_state to
 * REPL_STATE_CONNECT.
 *
 * ip   - master address; stored as a new sds via sdsnew(ip).
 * port - master port. */
void replicationSetMaster(char *ip, int port) {
sdsfree(server.masterhost);
server.masterhost = sdsnew(ip);
server.masterport = port;
if (server.master) freeClient(server.master);
disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
disconnectSlaves(); /* Force our slaves to resync with us as well. */
replicationDiscardCachedMaster(); /* Don't try a PSYNC. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
cancelReplicationHandshake();
server.repl_state = REPL_STATE_CONNECT;//Set repl_state: ready to start replication
server.master_repl_offset = 0;//Reset the replication offset
server.repl_down_since = 0;
}
connectWithMaster()建立套接字連接,設置事件回調syncWithMaster()
/* Start a non-blocking TCP connection to the configured master and register
 * syncWithMaster() as the handler for readable/writable events on it.
 *
 * On success the socket is stored in server.repl_transfer_s, the last-I/O
 * timestamp is set to server.unixtime, and repl_state becomes
 * REPL_STATE_CONNECTING.
 *
 * Returns C_OK on success, or C_ERR if the connect call or the event
 * registration fails (the socket is closed in the latter case). */
int connectWithMaster(void) {
    /* Socket towards the master. */
    int sock = anetTcpNonBlockBestEffortBindConnect(NULL, server.masterhost,
                                                    server.masterport,
                                                    NET_FIRST_BIND_ADDR);
    if (sock == -1) return C_ERR;

    /* syncWithMaster() will be invoked on socket events. */
    int registered = aeCreateFileEvent(server.el, sock,
                                       AE_READABLE | AE_WRITABLE,
                                       syncWithMaster, NULL);
    if (registered == AE_ERR) {
        close(sock);
        return C_ERR;
    }

    /* Connection attempt is now in progress. */
    server.repl_state = REPL_STATE_CONNECTING;
    server.repl_transfer_s = sock;
    server.repl_transfer_lastio = server.unixtime;
    return C_OK;
}
syncWithMaster()發送ping命令,身份驗證,發送端口信息,通知master自己可以解析rdb,進行同步。基本流程如下:
/* File-event callback (registered on the master socket by
 * connectWithMaster()) that advances the slave-side handshake according to
 * server.repl_state: PING -> AUTH -> REPLCONF listening-port ->
 * REPLCONF capa eof -> PSYNC, and finally registers readSyncBulkPayload()
 * to read the synchronization payload.
 *
 * After each command is written the function returns; the matching
 * RECEIVE_* branch reads the reply on a later invocation.
 *
 * This is an abridged excerpt: "……" marks code elided by the article.
 * The locals err, psync_result, dfd and tmpfile are used below but their
 * declarations are not part of the visible excerpt. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
……
//repl_state says no replication is active: close the socket and return
if (server.repl_state == REPL_STATE_NONE) {
close(fd);
return;
}
……
//Socket connected: send PING to the master
if (server.repl_state == REPL_STATE_CONNECTING) {
aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
server.repl_state = REPL_STATE_RECEIVE_PONG;//now waiting for the PING reply
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
if (err) goto write_error;
return;
}//Read the reply to PING
if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
//Accepted replies: anything starting with '+', "-NOAUTH", or
//"-ERR operation not permitted"; everything else is treated as an error.
//NOTE(review): no sdsfree(err) is shown on the success path of this
//excerpt; it may have been dropped by the article - confirm against the
//full source.
if (err[0] != '+' &&strncmp(err,"-NOAUTH",7) != 0 &&
strncmp(err,"-ERR operation not permitted",28) != 0)
{
……
goto error;
}
server.repl_state = REPL_STATE_SEND_AUTH;//next step: send AUTH
}
//Authentication: send AUTH to the master when masterauth is set
if (server.repl_state == REPL_STATE_SEND_AUTH) {
if (server.masterauth) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
if (err) goto write_error;
server.repl_state = REPL_STATE_RECEIVE_AUTH;
return;
} else {//No auth configured: go straight to REPL_STATE_SEND_PORT
server.repl_state = REPL_STATE_SEND_PORT;
}
}//Read the reply to AUTH
if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
sdsfree(err);
goto error;
}
//NOTE(review): err is not freed here when AUTH succeeds (in this excerpt) -
//confirm against the full source.
server.repl_state = REPL_STATE_SEND_PORT;
}//Send our listening port to the master
if (server.repl_state == REPL_STATE_SEND_PORT) {
sds port = sdsfromlonglong(server.port);
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"listening-port",port, NULL);
……
server.repl_state = REPL_STATE_RECEIVE_PORT;
return;
}
if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
//Read the reply to "REPLCONF listening-port"
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
……
}
server.repl_state = REPL_STATE_SEND_CAPA;
}
//Send "REPLCONF capa eof" to the master (the original note describes this
//as telling the master that this slave can parse the RDB format)
if (server.repl_state == REPL_STATE_SEND_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"capa","eof",NULL);
if (err) goto write_error;
//err is NULL here: a non-NULL err took the goto above
sdsfree(err);
server.repl_state = REPL_STATE_RECEIVE_CAPA;
return;
}//Read the reply to "REPLCONF capa eof"
if (server.repl_state == REPL_STATE_RECEIVE_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
……
}
sdsfree(err);
server.repl_state = REPL_STATE_SEND_PSYNC;
}
//Send PSYNC to the master. Per the original note: partial resync if a
//cached master exists, full resync otherwise; that decision is made inside
//slaveTryPartialResynchronization(), which is not shown here.
if (server.repl_state == REPL_STATE_SEND_PSYNC) {
if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
err = sdsnew("Write error sending the PSYNC command.");
goto write_error;
}
server.repl_state = REPL_STATE_RECEIVE_PSYNC;
return;
}
/* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC. */
if (server.repl_state != REPL_STATE_RECEIVE_PSYNC) {
……
goto error;
}
//Read the master's PSYNC reply and act on the result
psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
if (psync_result == PSYNC_CONTINUE) {
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
return;
}
disconnectSlaves(); /* Force our slaves to resync with us as well. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
//Master does not support PSYNC: fall back to the SYNC command (the older
//synchronization mechanism)
if (psync_result == PSYNC_NOT_SUPPORTED) {
serverLog(LL_NOTICE,"Retrying with SYNC...");
if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
……
goto error;
}
}
……
//Register readSyncBulkPayload() to read the incoming synchronization data
if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
== AE_ERR)
{
……
}
//Enter the transfer state and reset the transfer bookkeeping fields
server.repl_state = REPL_STATE_TRANSFER;
server.repl_transfer_size = -1;
server.repl_transfer_read = 0;
server.repl_transfer_last_fsync_off = 0;
server.repl_transfer_fd = dfd;
server.repl_transfer_lastio = server.unixtime;
server.repl_transfer_tmpfile = zstrdup(tmpfile);
return;
error:
……
write_error: /* Handle sendSynchronousCommand(SYNC_CMD_WRITE) errors. */
……
}
接下來看看主從複製的調度中心replicationCron,主要負責監控主從複製過程中的各個狀態,
並根據不同情況作出不同處理。
//replicationCron is the replication scheduler. serverCron invokes it through
//run_with_period(1000), which the article describes as once per second.
//"……" marks code elided from this excerpt.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
……
run_with_period(1000) replicationCron();
……
}
/* Periodic replication housekeeping, covering both roles of this server.
 *
 * As a slave: detects connect/handshake, transfer and connected-master
 * timeouts, starts a connection when repl_state is REPL_STATE_CONNECT, and
 * sends an ACK to the master.
 * As a master: periodically pings slaves, writes a newline to slaves that are
 * waiting for an RDB, drops online slaves whose ACK is too old, frees the
 * backlog and the script cache when there are no slaves, and starts a
 * BGSAVE for slaves waiting in SLAVE_STATE_WAIT_BGSAVE_START.
 *
 * replication_cron_loops counts invocations and is used to pace the PING. */
void replicationCron(void) {
static long long replication_cron_loops = 0;
//Slave: the non-blocking connect or the handshake timed out
if (server.masterhost &&
(server.repl_state == REPL_STATE_CONNECTING ||
slaveIsInHandshakeState()) &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
cancelReplicationHandshake();
}
//Slave: timed out while receiving the .rdb (REPL_STATE_TRANSFER)
if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
cancelReplicationHandshake();
}
//Slave: connected to the master but no interaction for more than
//repl_timeout; free the master client
if (server.masterhost && server.repl_state == REPL_STATE_CONNECTED &&
(time(NULL)-server.master->lastinteraction) > server.repl_timeout)
{
freeClient(server.master);
}
//Slave: check whether we need to connect to the master
if (server.repl_state == REPL_STATE_CONNECT) {
serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
server.masterhost, server.masterport);
//Open the socket connection to the master
if (connectWithMaster() == C_OK) {
serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
}
}//Slave: send an ACK to the master (skipped for a CLIENT_PRE_PSYNC master)
if (server.masterhost && server.master &&
!(server.master->flags & CLIENT_PRE_PSYNC))
replicationSendAck();
listIter li;
listNode *ln;
robj *ping_argv[1];
//Master: send PING to the slaves once every repl_ping_slave_period calls
if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
ping_argv[0] = createStringObject("PING",4);
replicationFeedSlaves(server.slaves, server.slaveseldb,
ping_argv, 1);
decrRefCount(ping_argv[0]);
}
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
//Master: write a single "\n" to every slave that is either
//1. in SLAVE_STATE_WAIT_BGSAVE_START (an RDB still has to be produced), or
//2. in SLAVE_STATE_WAIT_BGSAVE_END while the RDB child type is not
//   RDB_CHILD_TYPE_SOCKET (RDB being produced, not yet sent).
//Per the original note this refreshes the slave's last-I/O timer.
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
(slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
{
if (write(slave->fd, "\n", 1) == -1) {
/* Don't worry, it's just a ping. */
}
}
}//Master: disconnect online slaves whose last ACK is older than repl_timeout
if (listLength(server.slaves)) {
listIter li;
listNode *ln;
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate != SLAVE_STATE_ONLINE) continue;
if (slave->flags & CLIENT_PRE_PSYNC) continue;
if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout)
{
freeClient(slave);
}
}
}//Master: no slaves, a backlog exists and a time limit is configured;
//free the backlog once it has been idle longer than the limit
if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
server.repl_backlog)
{
time_t idle = server.unixtime - server.repl_no_slaves_since;
if (idle > server.repl_backlog_time_limit) {
freeReplicationBacklog();
}
}//Master: no slaves, AOF is off and the script cache FIFO is not empty;
//flush the replication script cache
if (listLength(server.slaves) == 0 &&
server.aof_state == AOF_OFF &&
listLength(server.repl_scriptcache_fifo) != 0)
{
replicationScriptCacheFlush();
}//Master: no RDB or AOF child is running (both child pids are -1)
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
time_t idle, max_idle = 0;
int slaves_waiting = 0;
int mincapa = -1;
listNode *ln;
listIter li;
listRewind(server.slaves,&li);
//Over the slaves in SLAVE_STATE_WAIT_BGSAVE_START, collect: how many there
//are, the largest idle time, and the bitwise AND of their slave_capa
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
idle = server.unixtime - slave->lastinteraction;
if (idle > max_idle) max_idle = idle;
slaves_waiting++;
mincapa = (mincapa == -1) ? slave->slave_capa :
(mincapa & slave->slave_capa);
}
}
if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
//At least one waiting slave has been idle longer than
//repl_diskless_sync_delay: start a BGSAVE for replication
startBgsaveForReplication(mincapa);
}
}
//Refresh the count of slaves whose lag is below the threshold
refreshGoodSlavesCount();
replication_cron_loops++; /* Incremented with frequency 1 HZ. */
}
replication.c的主要函數
/* ---------------------------------- MASTER -------------------------------- */
void createReplicationBacklog(void) /* 創建複製積壓緩衝區 */
void resizeReplicationBacklog(long long newsize) /* 調整複製積壓緩衝區的大小*/
void freeReplicationBacklog(void) /* 釋放複製積壓緩衝區*/
void feedReplicationBacklog(void *ptr, size_t len) /* 將寫命令添加到複製積壓緩衝區*/
void feedReplicationBacklogWithObject(robj *o) /*將寫命令添加到複製積壓緩衝區,但以對象的格式作爲參數 */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) /* 將主數據庫複製到從數據庫 */
void replicationFeedMonitors(redisClient *c, list *monitors, int dictid,
robj **argv, int argc) /* 發送數據給monitor監聽者 */
long long addReplyReplicationBacklog(redisClient *c, long long offset)
/* 將複製積壓緩衝區的offset到end的添加client的reply*/
int masterTryPartialResynchronization(redisClient *c) /* 主服務器嘗試部分重同步 */
void syncCommand(redisClient *c) /* 同步命令函數 */
void replconfCommand(redisClient *c) /* 此函數用於從服務器進行配置複製進程中的執行參數設置 */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) /* 給slave發送BULK數據 */
void updateSlavesWaitingBgsave(int bgsaveerr, int type) /* 此方法將用於後臺保存進程快結束時調用,更新slave */
/* ----------------------------------- SLAVE -------------------------------- */
void replicationAbortSyncTransfer(void) /* 中止與master的同步操作 */
void replicationSendNewlineToMaster(void)
void replicationEmptyDbCallback(void *privdata)
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask)
/* 從服務器讀取同步的Sync的BULK數據 */
char *sendSynchronousCommand(int flags, int fd, ...) /* 從服務器給主服務器進行同步數據的命令和接收相應的回覆 */
int slaveTryPartialResynchronization(int fd, int read_reply) /* 從服務器嘗試部分重同步操作;read_reply爲0時發送PSYNC,爲1時讀取master的回覆 */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask)
/* 與主服務器保持同步,期間包括發送ping命令,身份驗證,發送端口信息 */
int connectWithMaster(void) /* 連接服務器,設置事件回調 syncWithMaster*/
void undoConnectWithMaster(void) /* 斷開與主服務器的連接 */
int cancelReplicationHandshake(void) /* 當已經存在一個複製進程時,中止一個非阻塞的replication複製的嘗試 */
void replicationSetMaster(char *ip, int port) /* 設置主服務器的ip地址和端口號 */
void replicationUnsetMaster(void)
void slaveofCommand(redisClient *c)
void roleCommand(redisClient *c)
void replicationSendAck(void) /* 發送ACK包給主服務器 ,告知當前的進程偏移量 */
/* ---------------------- MASTER CACHING FOR PSYNC -------------------------- */
void replicationCacheMaster(redisClient *c) /* 緩存主服務器信息 */
void replicationDiscardCachedMaster(void) /* 當某個從服務器將不會再回復的時候,可以釋放掉緩存的主服務器信息 */
void replicationResurrectCachedMaster(int newfd) /* 將緩存主服務器復活 */
/* ------------------------- MIN-SLAVES-TO-WRITE --------------------------- */
void refreshGoodSlavesCount(void) /*刷新延遲小於閾值的slave的數量*/
void replicationScriptCacheInit(void)
void replicationScriptCacheFlush(void)
void replicationScriptCacheAdd(sds sha1)
int replicationScriptCacheExists(sds sha1)
void replicationCron(void) //主從複製的調度中心