monitor選主的下半段就是真正的選主
發送OP_PROPOSE請求
// Excerpt: starts a monitor election in which this node proposes itself.
// NOTE(review): braces are elided and the statements after elector.call_election()
// look like the pasted-in bodies of the callees (call_election -> start -> init);
// the real nesting is not visible here -- confirm against the full source.
void Monitor::start_election()
_reset();
elector.call_election();
start();
acked_me.clear();
init();
// Load the persisted election epoch from the store.
epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
if (!epoch)
epoch = 1;
// An odd epoch means an election is in progress; add 1 to step past that round
else if (epoch % 2)
++epoch;
// Write the epoch back to the store.
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
// If the epoch is even, add 1 to mark that an election is in progress
if (epoch % 2 == 0)
bump_epoch(epoch+1);
// This node is now proposing itself and has not acked any other candidate.
electing_me = true;
leader_acked = -1;
// Send OP_PROPOSE to every rank in the monmap except our own.
for (unsigned i=0; i<mon->monmap->size(); ++i)
if ((int)i == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
mon->messenger->send_message(m, mon->monmap->get_inst(i));
reset_timer();
主動啓動選舉的節點向monmap中的其它節點發送OP_PROPOSE請求,並將選舉的epoch加1,置爲奇數。
處理OP_PROPOSE請求
// Excerpt: handles an OP_PROPOSE election message received from another monitor.
// NOTE(review): braces are elided and the bodies of bump_epoch() and defer() appear
// to be pasted inline beneath their calls (they use the names `e` and `who`, which
// are not declared in this excerpt) -- confirm nesting against the full source.
void Elector::handle_propose(MonOpRequestRef op)
MMonElection *m = static_cast<MMonElection*>(op->get_req());
// Rank number of the sender.
int from = m->get_source().num();
// The peer's election epoch is newer than ours
if (m->epoch > epoch)
bump_epoch(m->epoch);
// NOTE(review): the lines below look like the inlined body of bump_epoch(e).
epoch = e;
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
mon->join_election();
electing_me = false;
acked_me.clear();
// The peer's election epoch is older than ours
else if (m->epoch < epoch)
// Our epoch is even and the sender is not in our quorum set: start a new election.
if (epoch % 2 == 0 && mon->quorum.count(from) == 0)
mon->start_election();
else
dout(5) << " ignoring old propose" << dendl;
return;
// Even if an election is held, this node would win
if (mon->rank < from)
if (leader_acked >= 0)
// We already acked someone; that rank must be lower than the sender's.
assert(leader_acked < from);
else
// We have acked nobody; start our own election unless we are already proposing.
if (!electing_me)
mon->start_election();
else
// We have acked nobody yet, or the one we acked has a rank >= the sender's.
if (leader_acked < 0 || leader_acked > from || leader_acked == from)
// The peer will win the election
defer(from);
// NOTE(review): the lines below look like the inlined body of defer(who).
if (electing_me)
acked_me.clear();
electing_me = false;
// ack them
leader_acked = who;
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
m->mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&m->metadata);
mon->messenger->send_message(m, mon->monmap->get_inst(who));
// set a timer
reset_timer(1.0);
else
dout(5) << "no, we already acked " << leader_acked << dendl;
其它節點收到OP_PROPOSE請求後,
(1) 如果對方的選舉版本大於自己,則將自己的選舉版本設置爲對方的選舉版本。
(2) 對方的選舉版本小於自己,並且滿足自己不處於選舉階段和對方不處於本節點的quorum緩存裏,則說明對方可能是新加入的節點,這種情況下自己主動開啓選舉,以便讓其加入到quorum。
如果沒有忽略該請求,繼續採取如下行爲:
(1) 如果自己的rank小於對方,則一定不會選舉對方爲主節點,如果這時本節點沒有迴應過其它節點,並且自己還沒有發起選舉(electing_me爲false),則自己會發起選舉。
(2) 如果對方的rank小於自己,並且自己還沒有迴應過任何節點(leader_acked < 0),或者對方的rank小於等於自己已經迴應過的節點的rank,則迴應對方(發送OP_ACK),支持對方成爲主節點。
處理ACK
// Excerpt: handles an OP_ACK election message.
// NOTE(review): `m` and `from` are used but their declarations are not part of this
// excerpt, and braces are elided -- confirm the nesting (in particular which branch
// the final assert belongs to) against the full source.
void Elector::handle_ack(MonOpRequestRef op)
// This node started the election and asked to be elected
if (electing_me)
// Record the acker's features and metadata, keyed by its rank.
acked_me[from].cluster_features = m->get_connection()->get_features();
acked_me[from].mon_features = m->mon_features;
acked_me[from].metadata = m->metadata;
// Every node in the monmap must agree to us being leader before we can win
if (acked_me.size() == mon->monmap->size())
victory();
assert(leader_acked >= 0);
收到OP_ACK後,將回應的節點插入到acked_me中,如果acked_me的大小和monmap的大小相同,則說明全部節點都同意我作爲主節點。
// Excerpt: the winner declares victory, announces it to the quorum and becomes leader.
// NOTE(review): braces are elided (the `{` on the first loop is never closed here)
// and the statements after mon->win_election(...) look like the inlined body of
// that callee (they use `active` and `features`, which are not declared in this
// excerpt) -- confirm against the full source.
victory()
bump_epoch(epoch+1); // is over! the election ends on an even epoch
// Build the quorum from the ranks recorded in acked_me.
for (map<int, elector_info_t>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
quorum.insert(p->first);
// Send OP_VICTORY, carrying the quorum, to every quorum member except ourselves.
for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
if (*p == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
m->quorum = quorum;
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
mon->win_election(epoch, quorum, cluster_features, mon_features, metadata);
// NOTE(review): presumably the body of Monitor::win_election from here on.
state = STATE_LEADER;
leader_since = ceph_clock_now();
leader = rank;
quorum = active;
quorum_con_features = features;
quorum_mon_features = mon_features;
pending_metadata = metadata;
outside_quorum.clear();
paxos->leader_init();
(1) 將選舉版本加1,變爲偶數。
(2) 將acked_me的節點賦值給quorum。
(3) 對quorum的每個節點,發送OP_VICTORY消息。
(4) 初始化paxos,主要是提交上次沒有提交的消息。
// Excerpt: Paxos initialization on the newly elected leader; enters the recovering
// state and starts a collect round.
// NOTE(review): braces are elided and the statements after collect(0) look like the
// inlined body of collect(oldpn) (`oldpn` is not declared in this excerpt) --
// confirm nesting against the full source.
leader_init
pending_proposal.reset();
state = STATE_RECOVERING;
lease_expire = utime_t();
collect(0); // a function in Paxos
// There is an unconfirmed (uncommitted) entry
if (get_store()->exists(get_name(), last_committed+1))
version_t v = get_store()->get(get_name(), "pending_v");
version_t pn = get_store()->get(get_name(), "pending_pn");
// Use the stored pending_pn only if pending_v matches last_committed + 1.
if (v && pn && v == last_committed + 1)
uncommitted_pn = pn;
else
uncommitted_pn = accepted_pn;
uncommitted_v = last_committed+1;
// Load the value stored at version last_committed+1 into uncommitted_value.
get_store()->get(get_name(), last_committed+1, uncommitted_value);
// Generate a new accepted_pn; a new one is produced only after each successful election
accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
accepted_pn_from = last_committed;
// Send OP_COLLECT to every quorum member except ourselves.
for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p)
if (*p == mon->rank) continue;
MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, ceph_clock_now());
collect->last_committed = last_committed;
collect->first_committed = first_committed;
collect->pn = accepted_pn;
mon->messenger->send_message(collect, mon->monmap->get_inst(*p));
(1) 如果存在last_committed+1版本的日誌,則說明存在沒有完成同步的消息,則獲取pending_v(未提交日誌的版本號)、pending_pn(未提交日誌的accepted_pn)和last_committed+1的日誌
(2) 生成新的accepted_pn,該值只在每次選舉完成後重新生成。
(3) 向quorum其它節點發送OP_COLLECT消息。
副節點處理OP_COLLECT消息
// Excerpt: a non-leader handles OP_COLLECT and answers with an OP_LAST message.
// NOTE(review): `collect`, `t` and `bl` are used but not declared in this excerpt,
// braces are elided, and the lines after share_state(...) look like the inlined
// body of that callee (they use `m` and a scalar `peer_last_committed`) --
// confirm against the full source.
void Paxos::handle_collect(MonOpRequestRef op)
state = STATE_RECOVERING;
// Build the OP_LAST reply carrying our committed range.
MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, ceph_clock_now());
last->last_committed = last_committed;
last->first_committed = first_committed;
// Remember the pn we had accepted before looking at this collect.
version_t previous_pn = accepted_pn;
// Accept the sender's accepted_pn
if (collect->pn > accepted_pn)
accepted_pn = collect->pn;
accepted_pn_from = collect->pn_from;
t->put(get_name(), "accepted_pn", accepted_pn);
get_store()->apply_transaction(t);
last->pn = accepted_pn;
last->pn_from = accepted_pn_from;
// The sender's last_committed is behind ours: share state with it.
if (collect->last_committed < last_committed)
share_state(last, collect->first_committed, collect->last_committed);
// NOTE(review): presumably the body of share_state: copies versions
// peer_last_committed+1 .. last_committed into the message.
version_t v = peer_last_committed + 1;
for ( ; v <= last_committed; v++)
get_store()->get(get_name(), v, m->values[v]);
m->last_committed = last_committed;
// We hold an entry at last_committed+1: include it and its pn in the reply.
if (collect->last_committed <= last_committed && get_store()->exists(get_name(), last_committed+1))
get_store()->get(get_name(), last_committed+1, bl);
last->values[last_committed+1] = bl;
version_t v = get_store()->get(get_name(), "pending_v");
version_t pn = get_store()->get(get_name(), "pending_pn");
// Use the stored pending_pn only if pending_v matches last_committed + 1.
if (v && pn && v == last_committed + 1)
last->uncommitted_pn = pn;
else
last->uncommitted_pn = previous_pn;
// Reply on the connection the collect arrived on.
collect->get_connection()->send_message(last);
副節點收到OP_COLLECT消息後,如果主節點的last_committed小於自己的,說明主節點缺失部分日誌,就將缺失的這段日誌分享給主節點,如果自己有未確認的日誌,則一併發送給對方。
主節點處理OP_LAST消息
// Excerpt: the leader handles an OP_LAST reply from a quorum member.
// NOTE(review): `last`, `from` and `need_refresh` are used but not declared in this
// excerpt and braces are elided -- confirm nesting against the full source.
void Paxos::handle_last(MonOpRequestRef op)
// Record the sender's committed range.
peer_first_committed[from] = last->first_committed;
peer_last_committed[from] = last->last_committed;
// If the peer's log is newer than ours, persist the entries we are missing
need_refresh = store_state(last);
// peer_last_committed is walked on every incoming LAST because each new LAST may change
// this node's log, so every time we compare against the peers whose LAST has already
// been received and update any peer whose log is too old
for (map<int,version_t>::iterator p = peer_last_committed.begin(); p != peer_last_committed.end(); ++p)
// When the peer receives OP_PROBE and finds its own version too far behind, it will bootstrap
if (p->second + 1 < first_committed && first_committed > 1)
mon->bootstrap();
return;
// The peer's log is not too far behind, so bring it up to date right here
if (p->second < last_committed)
MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
share_state(commit, peer_first_committed[p->first], p->second);
mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
// The peer reports a higher pn than ours: start a new collect round from it.
if (last->pn > accepted_pn)
collect(last->pn);
// The peer reports the same pn as ours: count its reply.
else if (last->pn == accepted_pn)
num_last++;
// The peer reported an uncommitted entry; adopt it if it passes the checks below.
if (last->uncommitted_pn)
if (last->uncommitted_pn >= uncommitted_pn && last->last_committed >= last_committed && last->last_committed + 1 >= uncommitted_v)
uncommitted_v = last->last_committed+1;
uncommitted_pn = last->uncommitted_pn;
uncommitted_value = last->values[uncommitted_v];
// The count of matching replies equals the quorum size.
if (num_last == mon->get_quorum().size())
// A non-empty uncommitted value exists for version last_committed+1: propose it.
if (uncommitted_v == last_committed+1 && uncommitted_value.length())
state = STATE_UPDATING_PREVIOUS;
begin(uncommitted_value);
else
extend_lease();
主節點收到OP_LAST消息後,如果副節點的日誌比自己新,則更新自己的日誌。並遍歷peer_last_committed,對比本節點更新後的日誌和其它節點的日誌,如果其它節點的日誌過於落後(對方的last_committed+1小於本節點的first_committed),本節點重新調用bootstrap,bootstrap會發送OP_PROBE消息,在對方收到OP_PROBE消息時,檢測到自己的日誌太過落後,就會主動調用bootstrap。如果peer_last_committed中其它節點的日誌稍微落後於本節點,就主動將缺失的日誌發送給對方,不需要重新調用bootstrap。
如果副節點的accepted_pn大於主節點的accepted_pn,則主節點重新collect,會重新生成新的accepted_pn。
如果副節點的accepted_pn等於本節點的accepted_pn,則說明副節點接受了本節點的accepted_pn。判斷副節點是否有未提交併且版本大於主節點未提交的日誌,如果有,則將這個未提交的日誌廣播出去,這是通過正常的paxos過程實現。如果沒有未提交的日誌,就調用extend_lease擴展副本的租約。