Ceph Monitor選主（下）

monitor選主的下半段就是真正的選主

發送OP_PROPOSE請求

void Monitor::start_election()
	_reset();
	elector.call_election();	
		start();
			acked_me.clear();
			init();
				epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
				if (!epoch) 
					epoch = 1;
				// 如果epoch爲奇數，則說明正處在選舉階段，將epoch+1，以跨過這個階段
				else if (epoch % 2)
					++epoch;
					t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
					mon->store->apply_transaction(t);
			// 如果爲偶數，則加1，表示處於選舉階段
			if (epoch % 2 == 0)
				bump_epoch(epoch+1);
			electing_me = true;
			leader_acked = -1;
			for (unsigned i=0; i<mon->monmap->size(); ++i)
				if ((int)i == mon->rank) continue;
				MMonElection *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
				mon->messenger->send_message(m, mon->monmap->get_inst(i));
			reset_timer();

主動啓動選舉的節點先將選舉的epoch置爲奇數（若當前爲偶數則加1），表示進入選舉階段，然後向monmap中的其它節點發送OP_PROPOSE請求。

處理OP_PROPOSE請求

void Elector::handle_propose(MonOpRequestRef op)
	MMonElection *m = static_cast<MMonElection*>(op->get_req());
	int from = m->get_source().num();
	// 對方的選舉版本大於自己
	if (m->epoch > epoch)
		bump_epoch(m->epoch);
			epoch = e;
			t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
			mon->store->apply_transaction(t);
			mon->join_election();
			electing_me = false;
			acked_me.clear();
	// 對方的選舉版本小於自己
	else if (m->epoch < epoch)
		if (epoch % 2 == 0 && mon->quorum.count(from) == 0)
			mon->start_election();
		else
			dout(5) << " ignoring old propose" << dendl;
			return;
	// 即使選舉，也是本節點贏
	if (mon->rank < from)
		if (leader_acked >= 0)
			assert(leader_acked < from); 
		else
			if (!electing_me)
				mon->start_election();
	else
		if (leader_acked < 0 || leader_acked > from || leader_acked == from)
			// 對方會贏得選舉
			defer(from);
				if (electing_me)
					acked_me.clear();
					electing_me = false;
				// ack them
				leader_acked = who;
				MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
				m->mon_features = ceph::features::mon::get_supported();
				mon->collect_metadata(&m->metadata);
				mon->messenger->send_message(m, mon->monmap->get_inst(who));
				// set a timer
				reset_timer(1.0); 
		else
			dout(5) << "no, we already acked " << leader_acked << dendl;

其它節點收到OP_PROPOSE請求後，
(1) 如果對方的選舉版本大於自己，則將自己的選舉版本設置爲對方的選舉版本。
(2) 對方的選舉版本小於自己，並且滿足自己不處於選舉階段和對方不處於本節點的quorum緩存裏，則說明對方可能是新加入的節點，這種情況下自己主動開啓選舉，以便讓其加入到quorum。

如果沒有忽略該請求，繼續採取如下行爲：
(1) 如果自己的rank小於對方，則一定不會選舉對方爲主節點；如果這時本節點沒有迴應過其它節點，並且自己也尚未發起選舉，則自己會發起選舉。
(2) 如果對方的rank小於自己，並且自己尚未迴應過任何節點，或者對方的rank小於等於自己已經迴應過的節點的rank，則迴應對方（發送OP_ACK），選舉對方爲主節點。

處理ACK

void Elector::handle_ack(MonOpRequestRef op)
	// 本節點發起的選舉請求，要求選本節點
	if (electing_me) 
		acked_me[from].cluster_features = m->get_connection()->get_features();
		acked_me[from].mon_features = m->mon_features;
		acked_me[from].metadata = m->metadata;
		// 要求monmap中的全部節點都同意我作爲leader纔可以
		if (acked_me.size() == mon->monmap->size()) 
			victory();	
		assert(leader_acked >= 0);

收到OP_ACK後，將回應的節點插入到acked_me中，如果acked_me的大小和monmap的大小相同，則說明全部節點都同意我作爲主節點。

victory()
	bump_epoch(epoch+1);     // is over!  偶數結束
	for (map<int, elector_info_t>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
		quorum.insert(p->first);
	for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) 
		if (*p == mon->rank) continue;
		MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
		m->quorum = quorum;
		mon->messenger->send_message(m, mon->monmap->get_inst(*p));
	mon->win_election(epoch, quorum, cluster_features, mon_features, metadata);
		state = STATE_LEADER;
		leader_since = ceph_clock_now();
		leader = rank;
		quorum = active;
		quorum_con_features = features;
		quorum_mon_features = mon_features;
		pending_metadata = metadata;
		outside_quorum.clear();		
		paxos->leader_init();

(1) 將選舉版本加1，變爲偶數。
(2) 將acked_me的節點賦值給quorum。
(3) 對quorum的每個節點，發送OP_VICTORY消息。
(4) 初始化paxos，主要是提交上次沒有提交的消息。

leader_init	
	pending_proposal.reset();
	state = STATE_RECOVERING;
	lease_expire = utime_t();
	collect(0); // paxos裏的函數
		// 有未確認的消息
		if (get_store()->exists(get_name(), last_committed+1))
			version_t v = get_store()->get(get_name(), "pending_v");
			version_t pn = get_store()->get(get_name(), "pending_pn");
			if (v && pn && v == last_committed + 1)
				uncommitted_pn = pn;
			else
				uncommitted_pn = accepted_pn;
			uncommitted_v = last_committed+1;
			get_store()->get(get_name(), last_committed+1, uncommitted_value);
		// 生成新的accepted_pn，只有在每次選舉成功才能生成新的
		accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
		accepted_pn_from = last_committed;
		for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p)
			if (*p == mon->rank) continue;
			MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT,  ceph_clock_now());
			collect->last_committed = last_committed;
			collect->first_committed = first_committed;
			collect->pn = accepted_pn;
			mon->messenger->send_message(collect, mon->monmap->get_inst(*p));

(1) 如果存在last_committed+1版本的日誌，則說明存在沒有完成同步的消息，則獲取pending_v(未提交日誌的版本號)、pending_pn(未提交日誌的accepted_pn)和last_committed+1的日誌。
(2) 生成新的accepted_pn，該值在每次選舉完成後重新生成；若之後發現副節點的accepted_pn更大，主節點重新collect時也會再次生成。
(3) 向quorum其它節點發送OP_COLLECT消息。

副節點處理OP_COLLECT消息

void Paxos::handle_collect(MonOpRequestRef op)
	state = STATE_RECOVERING;
	MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST,  ceph_clock_now());
	last->last_committed = last_committed;
	last->first_committed = first_committed;
	version_t previous_pn = accepted_pn;
	// 接受對方的accepted_pn
	if (collect->pn > accepted_pn)
		accepted_pn = collect->pn;
		accepted_pn_from = collect->pn_from;
		t->put(get_name(), "accepted_pn", accepted_pn);
		get_store()->apply_transaction(t);
	last->pn = accepted_pn;
	last->pn_from = accepted_pn_from;
	
	if (collect->last_committed < last_committed)
		share_state(last, collect->first_committed, collect->last_committed);
			version_t v = peer_last_committed + 1;
			for ( ; v <= last_committed; v++) 
				get_store()->get(get_name(), v, m->values[v]);
			m->last_committed = last_committed;
	
	if (collect->last_committed <= last_committed && get_store()->exists(get_name(), last_committed+1))
		get_store()->get(get_name(), last_committed+1, bl);
		last->values[last_committed+1] = bl;
		version_t v = get_store()->get(get_name(), "pending_v");
		version_t pn = get_store()->get(get_name(), "pending_pn");
		if (v && pn && v == last_committed + 1)
			last->uncommitted_pn = pn;
		else
			last->uncommitted_pn = previous_pn;
	collect->get_connection()->send_message(last);

副節點收到OP_COLLECT消息後，如果主節點的last_committed小於自己的，說明主節點缺失部分日誌，就將缺失的這段日誌分享給主節點。如果自己有未確認的日誌，則一併發送給對方。

主節點處理OP_LAST消息

void Paxos::handle_last(MonOpRequestRef op)
	peer_first_committed[from] = last->first_committed;
	peer_last_committed[from] = last->last_committed;
	// 如果對方的日誌更新於本節點，則在本節點持久化缺失的日誌
	need_refresh = store_state(last);
	// 之所以每次副節點的LAST消息到來都要遍歷peer_last_committed，是因爲每次有新的LAST消息到來都可能會改變
	// 本節點的日誌，所以每次都需要和LAST消息已被接收的節點比較，如果對方的日誌太舊，則更新
	for (map<int,version_t>::iterator p = peer_last_committed.begin(); p != peer_last_committed.end(); ++p) 
		// 對方收到OP_PROBE時，檢測到自己的版本太落後，會bootstrap
		if (p->second + 1 < first_committed && first_committed > 1)
			mon->bootstrap();
			return;
		// 對方的日誌不是太落後，則直接在此更新
		if (p->second < last_committed)
			MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
			share_state(commit, peer_first_committed[p->first], p->second);
			mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
	if (last->pn > accepted_pn)
		collect(last->pn);
	else if (last->pn == accepted_pn)
		num_last++;
		if (last->uncommitted_pn)
			if (last->uncommitted_pn >= uncommitted_pn && last->last_committed >= last_committed && last->last_committed + 1 >= uncommitted_v)
				uncommitted_v = last->last_committed+1;
				uncommitted_pn = last->uncommitted_pn;
				uncommitted_value = last->values[uncommitted_v];
		if (num_last == mon->get_quorum().size())
			if (uncommitted_v == last_committed+1 && uncommitted_value.length())
				state = STATE_UPDATING_PREVIOUS;
				begin(uncommitted_value);
			else
				extend_lease();

主節點收到OP_LAST消息後，如果副節點的日誌比自己新，則更新自己的日誌。並遍歷peer_last_committed，對比本節點更新後的日誌和其它節點的日誌，如果其它節點的日誌過於落後(對方的last_committed+1小於本節點的first_committed)，本節點重新調用bootstrap，bootstrap會發送OP_PROBE消息，在對方收到OP_PROBE消息時，檢測到自己的日誌太過落後，就會主動調用bootstrap。如果peer_last_committed中其它節點的日誌稍微落後於本節點，就主動將缺失的日誌發送給對方，不需要重新調用bootstrap。

如果副節點的accepted_pn大於主節點的accepted_pn，則主節點重新collect，會重新生成新的accepted_pn。
如果副節點的accepted_pn等於本節點的accepted_pn，則說明副節點接受了本節點的accepted_pn。判斷副節點是否有未提交並且版本大於主節點未提交的日誌，如果有，則將這個未提交的日誌廣播出去，這是通過正常的paxos過程實現。如果沒有未提交的日誌，就調用extend_lease擴展副本的租約。

Ceph Monitor選主（下）

發送OP_PROPOSE請求

處理OP_PROPOSE請求

處理ACK

副節點處理OP_COLLECT消息

主節點處理OP_LAST消息

工作中用到的腳本合集

通過f-string編寫簡潔高效的Python格式化輸出代碼

24-5-18 X

Ceph Monitor選主（下）

Ceph Monitor選主（上）

Linux設備驅動模型——總線、設備、驅動

非連續內存區管理

Linux設備模型初始化——PCI子系統初始化

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結