Ceph Monitor選主(下)

monitor選主的下半段就是真正的選主

發送OP_PROPOSE請求

void Monitor::start_election()
	_reset();
	elector.call_election();	
		start();
			acked_me.clear();
			init();
				epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
				if (!epoch) 
					epoch = 1;
				// 如果epoch爲奇數,則說明正處在選舉階段,將epoch+1,以跨過這個階段
				else if (epoch % 2)
					++epoch;
					t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
					mon->store->apply_transaction(t);
			// 如果爲偶數,則加1,表示處於選舉階段
			if (epoch % 2 == 0)
				bump_epoch(epoch+1);
			electing_me = true;
			leader_acked = -1;
			for (unsigned i=0; i<mon->monmap->size(); ++i)
				if ((int)i == mon->rank) continue;
				MMonElection *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
				mon->messenger->send_message(m, mon->monmap->get_inst(i));
			reset_timer();

主動啓動選舉的節點向monmap中的其它節點發送OP_PROPOSE請求,並將選舉的epoch加1,置爲奇數。

處理OP_PROPOSE請求

void Elector::handle_propose(MonOpRequestRef op)
	MMonElection *m = static_cast<MMonElection*>(op->get_req());
	int from = m->get_source().num();
	// 對方的選舉版本大於自己
	if (m->epoch > epoch)
		bump_epoch(m->epoch);
			epoch = e;
			t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
			mon->store->apply_transaction(t);
			mon->join_election();
			electing_me = false;
			acked_me.clear();
	// 對方的選舉版本小於自己
	else if (m->epoch < epoch)
		if (epoch % 2 == 0 && mon->quorum.count(from) == 0)
			mon->start_election();
		else
			dout(5) << " ignoring old propose" << dendl;
			return;
	// 即使選舉,也是本節點贏
	if (mon->rank < from)
		if (leader_acked >= 0)
			assert(leader_acked < from); 
		else
			if (!electing_me)
				mon->start_election();
	else
		if (leader_acked < 0 || leader_acked > from || leader_acked == from)
			// 對方會贏得選舉
			defer(from);
				if (electing_me)
					acked_me.clear();
					electing_me = false;
				// ack them
				leader_acked = who;
				MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
				m->mon_features = ceph::features::mon::get_supported();
				mon->collect_metadata(&m->metadata);
				mon->messenger->send_message(m, mon->monmap->get_inst(who));
				// set a timer
				reset_timer(1.0); 
		else
			dout(5) << "no, we already acked " << leader_acked << dendl;

其它節點收到OP_PROPOSE請求後,
(1) 如果對方的選舉版本大於自己,則將自己的選舉版本設置爲對方的選舉版本。
(2) 如果對方的選舉版本小於自己,並且自己不處於選舉階段(epoch爲偶數)、對方也不在本節點的quorum緩存裏,則說明對方可能是新加入的節點,這種情況下自己主動開啓選舉,以便讓其加入到quorum;否則忽略這個過期的請求,直接返回。

如果沒有忽略該請求,繼續採取如下行爲:
(1) 如果自己的rank小於對方,則一定不會選舉對方爲主節點;如果這時本節點沒有回應過其它節點,並且自己還沒有發起選舉,則自己會發起選舉。
(2) 如果對方的rank小於自己,並且自己還沒有回應過任何節點,或者對方的rank小於等於自己已經回應過的節點的rank,則向對方回應OP_ACK,支持對方成爲主節點。

處理ACK

void Elector::handle_ack(MonOpRequestRef op)
	// 本節點發起的選舉請求,要求選本節點
	if (electing_me) 
		acked_me[from].cluster_features = m->get_connection()->get_features();
		acked_me[from].mon_features = m->mon_features;
		acked_me[from].metadata = m->metadata;
		// 要求monmap中的全部節點都同意我作爲leader纔可以
		if (acked_me.size() == mon->monmap->size()) 
			victory();	
		assert(leader_acked >= 0);

收到OP_ACK後,將回應的節點插入到acked_me中,如果acked_me的大小和monmap的大小相同,則說明全部節點都同意我作爲主節點。

victory()
	bump_epoch(epoch+1);     // is over!  偶數結束
	for (map<int, elector_info_t>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
		quorum.insert(p->first);
	for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) 
		if (*p == mon->rank) continue;
		MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
		m->quorum = quorum;
		mon->messenger->send_message(m, mon->monmap->get_inst(*p));
	mon->win_election(epoch, quorum, cluster_features, mon_features, metadata);
		state = STATE_LEADER;
		leader_since = ceph_clock_now();
		leader = rank;
		quorum = active;
		quorum_con_features = features;
		quorum_mon_features = mon_features;
		pending_metadata = metadata;
		outside_quorum.clear();		
		paxos->leader_init();	

(1) 將選舉版本加1,變爲偶數。
(2) 將acked_me的節點賦值給quorum。
(3) 對quorum的每個節點,發送OP_VICTORY消息。
(4) 初始化paxos,主要是提交上次沒有提交的消息。

leader_init	
	pending_proposal.reset();
	state = STATE_RECOVERING;
	lease_expire = utime_t();
	collect(0); // paxos裏的函數
		// 有未確認的消息
		if (get_store()->exists(get_name(), last_committed+1))
			version_t v = get_store()->get(get_name(), "pending_v");
			version_t pn = get_store()->get(get_name(), "pending_pn");
			if (v && pn && v == last_committed + 1)
				uncommitted_pn = pn;
			else
				uncommitted_pn = accepted_pn;
			uncommitted_v = last_committed+1;
			get_store()->get(get_name(), last_committed+1, uncommitted_value);
		// 生成新的accepted_pn,只有在每次選舉成功才能生成新的
		accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
		accepted_pn_from = last_committed;
		for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p)
			if (*p == mon->rank) continue;
			MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT,  ceph_clock_now());
			collect->last_committed = last_committed;
			collect->first_committed = first_committed;
			collect->pn = accepted_pn;
			mon->messenger->send_message(collect, mon->monmap->get_inst(*p));	

(1) 如果存在last_committed+1版本的日誌,則說明存在沒有完成同步的消息,則獲取pending_v(未提交日誌的版本號)、pending_pn(未提交日誌的accepted_pn)和last_committed+1的日誌。
(2) 生成新的accepted_pn,該值只在每次選舉完成後重新生成。
(3) 向quorum其它節點發送OP_COLLECT消息。

副節點處理OP_COLLECT消息

void Paxos::handle_collect(MonOpRequestRef op)
	state = STATE_RECOVERING;
	MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST,  ceph_clock_now());
	last->last_committed = last_committed;
	last->first_committed = first_committed;
	version_t previous_pn = accepted_pn;
	// 接受對方的accepted_pn
	if (collect->pn > accepted_pn)
		accepted_pn = collect->pn;
		accepted_pn_from = collect->pn_from;
		t->put(get_name(), "accepted_pn", accepted_pn);
		get_store()->apply_transaction(t);
	last->pn = accepted_pn;
	last->pn_from = accepted_pn_from;
	
	if (collect->last_committed < last_committed)
		share_state(last, collect->first_committed, collect->last_committed);
			version_t v = peer_last_committed + 1;
			for ( ; v <= last_committed; v++) 
				get_store()->get(get_name(), v, m->values[v]);
			m->last_committed = last_committed;
	
	if (collect->last_committed <= last_committed && get_store()->exists(get_name(), last_committed+1))
		get_store()->get(get_name(), last_committed+1, bl);
		last->values[last_committed+1] = bl;
		version_t v = get_store()->get(get_name(), "pending_v");
		version_t pn = get_store()->get(get_name(), "pending_pn");
		if (v && pn && v == last_committed + 1)
			last->uncommitted_pn = pn;
		else
			last->uncommitted_pn = previous_pn;
	collect->get_connection()->send_message(last);

副節點收到OP_COLLECT消息後,如果主節點的last_committed小於自己的,說明主節點缺失部分日誌,就將缺失的這段日誌分享給主節點;如果自己有未確認的日誌,則一併發送給對方。

主節點處理OP_LAST消息

void Paxos::handle_last(MonOpRequestRef op)
	peer_first_committed[from] = last->first_committed;
	peer_last_committed[from] = last->last_committed;
	// 如果對方的日誌更新於本節點,則在本節點持久化缺失的日誌
	need_refresh = store_state(last);
	// 之所以每次副節點的LAST消息到來都要遍歷peer_last_committed,是因爲每次有新的LAST消息到來都可能會改變
	// 本節點的日誌,所以每次都需要和LAST消息已被接收的節點比較,如果對方的日誌太舊,則更新
	for (map<int,version_t>::iterator p = peer_last_committed.begin(); p != peer_last_committed.end(); ++p) 
		// 對方收到OP_PROBE時,檢測到自己的版本太落後,會bootstrap
		if (p->second + 1 < first_committed && first_committed > 1)
			mon->bootstrap();
			return;
		// 對方的日誌不是太落後,則直接在此更新
		if (p->second < last_committed)
			MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
			share_state(commit, peer_first_committed[p->first], p->second);
			mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
	if (last->pn > accepted_pn)
		collect(last->pn);
	else if (last->pn == accepted_pn)
		num_last++;
		if (last->uncommitted_pn)
			if (last->uncommitted_pn >= uncommitted_pn && last->last_committed >= last_committed && last->last_committed + 1 >= uncommitted_v)
				uncommitted_v = last->last_committed+1;
				uncommitted_pn = last->uncommitted_pn;
				uncommitted_value = last->values[uncommitted_v];
		if (num_last == mon->get_quorum().size())
			if (uncommitted_v == last_committed+1 && uncommitted_value.length())
				state = STATE_UPDATING_PREVIOUS;
				begin(uncommitted_value);
			else
				extend_lease();

主節點收到OP_LAST消息後,如果副節點的日誌比自己新,則更新自己的日誌。然後遍歷peer_last_committed,對比本節點更新後的日誌和其它節點的日誌:如果其它節點的日誌過於落後(對方的last_committed+1小於本節點的first_committed),本節點重新調用bootstrap,bootstrap會發送OP_PROBE消息,對方收到OP_PROBE消息時,檢測到自己的日誌太過落後,就會主動調用bootstrap;如果peer_last_committed中其它節點的日誌只是稍微落後於本節點,就主動通過OP_COMMIT消息將缺失的日誌發送給對方,不需要重新調用bootstrap。

如果副節點的accepted_pn大於主節點的accepted_pn,則主節點重新collect,並生成新的accepted_pn。
如果副節點的accepted_pn等於本節點的accepted_pn,則說明副節點接受了本節點的accepted_pn。此時判斷副節點是否帶有未提交、並且不舊於主節點已記錄的未提交日誌,如果有,則記錄下這個未提交的日誌。等到收到的OP_LAST數量(num_last)等於quorum的大小後,如果存在版本爲last_committed+1的未提交日誌,則將它廣播出去,這是通過正常的paxos過程實現的;如果沒有未提交的日誌,就調用extend_lease擴展副本的租約。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章