1. Overview
Before NAPI, the NIC raised an interrupt to notify the CPU every time a packet arrived. Under heavy traffic, the sheer number of interrupts inevitably hurts CPU performance, so Linux introduced the NAPI mechanism: after an interrupt is received, the NIC is switched into poll mode for packet reception, and only when polling is done does it return to interrupt mode. This section analyzes how Linux implements NAPI.
The main NAPI flow is shown in the figure below. When the physical NIC receives a packet, it raises an IRQ to notify the CPU (once raised, that interrupt is disabled by default). The top half of the interrupt handler adds the device's napi->poll_list to softnet_data->poll_list and then raises the RX softirq; the softirq handler in turn invokes the device's own poll function (ixgbe_poll) via napi_poll.
In NAPI mode the system assigns one budget to the softirq pass and one to each napi (the softirq budget is netdev_budget, default 300, shared by all napis; each napi's budget is weight_p, default 64). Within one poll pass, ixgbe_poll consumes one unit of budget per received packet. If ixgbe_poll uses up the napi's entire budget, the NIC is receiving packets at a high rate and another poll pass is needed. The budget consumed by each napi_poll call is accumulated, and once it exceeds the softirq pass's budget, the current softirq pass exits. If ixgbe_poll consumes less than the napi's budget, few packets are left, so the queue interrupt is re-enabled and the device returns to interrupt mode.
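The two-level budget accounting described above can be sketched as a small userspace simulation. This is a minimal sketch with illustrative names (napi_poll_sim and net_rx_action_sim are not kernel functions), assuming a single napi and ignoring the 2-tick time limit:

```c
#include <assert.h>

#define NETDEV_BUDGET 300   /* softirq-wide budget (net.core.netdev_budget) */
#define NAPI_WEIGHT    64   /* per-napi budget (weight) */

/* One napi poll round: the driver consumes up to `weight` packets.
 * Returns the work actually done; work == weight means the ring may
 * still hold packets, so the napi must be polled again. */
static int napi_poll_sim(int backlog, int weight, int *need_repoll)
{
	int work = backlog < weight ? backlog : weight;

	*need_repoll = (work == weight);
	return work;
}

/* One softirq pass: keep polling until the napi completes or the
 * softirq-wide budget is exhausted. Returns packets processed and sets
 * *budget_exhausted when packets were still pending at exit. */
static int net_rx_action_sim(int backlog, int *budget_exhausted)
{
	int budget = NETDEV_BUDGET;
	int total = 0, repoll = 1;

	while (repoll && budget > 0) {
		int work = napi_poll_sim(backlog - total, NAPI_WEIGHT,
					 &repoll);
		total += work;
		budget -= work;   /* accumulate against the softirq budget */
	}
	*budget_exhausted = repoll;
	return total;
}
```

With a backlog of 100 packets the napi completes within the softirq budget; with 1000 packets the pass stops after the budget is spent (the last poll round may overshoot slightly, just as the kernel only checks `budget <= 0` after subtracting), leaving the rest to the next softirq pass.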
2. Detailed flow analysis
ixgbe_msix_clean_rings
The driver registers ixgbe_msix_clean_rings as the MSI-X interrupt handler entry; when the NIC raises an IRQ, execution enters ixgbe_msix_clean_rings:
static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)
{
	struct ixgbe_q_vector *q_vector = data;

	/* EIAM disabled interrupts (on this vector) for us */
	if (q_vector->rx.ring || q_vector->tx.ring)
		napi_schedule_irqoff(&q_vector->napi);

	return IRQ_HANDLED;
}
The interrupt handler eventually calls napi_schedule, which adds napi->poll_list to sd->poll_list and then raises the RX softirq:
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
net_rx_action
After the interrupt flow raises the softirq, the top half ends and the bottom half takes over. The RX softirq handler is net_rx_action. It first assigns the softirq pass its budget (netdev_budget: 300), then calls napi_poll. The budget consumed by each napi_poll call is accumulated; if it exceeds netdev_budget, or polling has run for more than 2 ticks, the softirq pass exits, splicing napi->poll_list back into sd->poll_list before returning so it is rescheduled next time.
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	//budget for one softirq pass
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		//exit the softirq pass if its budget is used up or polling has run past 2 ticks
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	__kfree_skb_flush();

	local_irq_disable();
	//splice unfinished napis back onto the head of sd->poll_list for the next softirq pass
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		//if poll_list is not empty, raise the softirq again
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
}
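The pair of exit conditions in the loop above can be isolated into a small sketch. Function names here are illustrative; time_after_eq_sim mirrors the kernel's time_after_eq macro, whose signed-difference comparison stays correct across jiffies wrap-around:

```c
#include <assert.h>
#include <stdbool.h>

/* Kernel-style wrap-safe comparison: "has a reached (or passed) b?" */
static bool time_after_eq_sim(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

/* The two conditions under which net_rx_action gives up its pass:
 * the softirq-wide budget is spent, or 2 jiffies have elapsed. */
static bool should_quit_softirq(int budget, unsigned long now,
				unsigned long time_limit)
{
	return budget <= 0 || time_after_eq_sim(now, time_limit);
}
```

Either condition alone is enough to end the pass; the kernel also increments sd->time_squeeze at that point, which is what the "squeezed" column of /proc/net/softnet_stat counts.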
napi_poll
napi_poll mainly calls the device's own poll function, e.g. ixgbe_poll. Each napi_poll call also has its own budget (weight_p: 64); ixgbe_poll returns how much of that budget this call used. At its entry, napi_poll first removes napi->poll_list from the list, then uses the budget ixgbe_poll reports as consumed to decide whether to re-add the napi to the repoll list.
If ixgbe_poll did not use up its budget (in that case ixgbe_poll has already pushed all polled packets up the protocol stack and switched back to interrupt mode), the napi need not be re-added to repoll. If the budget was used up, the NIC still has packets to process: with GRO enabled, napi_poll first pushes gro_skbs whose age exceeds 1 tick up the protocol stack, then re-adds the napi to repoll. Back in net_rx_action, the repoll list is spliced into sd->poll_list, and on exit net_rx_action checks whether sd->poll_list is empty; if not, the RX softirq is raised again.
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	//remove napi->poll_list first
	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	//budget for one napi poll
	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi(). Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call. Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n);
	}

	WARN_ON_ONCE(work > weight);

	//this napi poll did not use up its budget; move on to the next napi
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight. In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	//budget fully used: push gro-list skbs older than one tick up the protocol stack
	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	//budget used up and further polling is needed: re-add napi->poll_list to repoll
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
ixgbe_poll
ixgbe_poll splits the budget assigned to the napi evenly across the rx queues, then polls each rx queue for packets in turn; if any rx queue uses up its share of the budget, this poll pass is marked incomplete:
int ixgbe_poll(struct napi_struct *napi, int budget)
{
	struct ixgbe_q_vector *q_vector =
				container_of(napi, struct ixgbe_q_vector, napi);
	struct ixgbe_adapter *adapter = q_vector->adapter;
	struct ixgbe_ring *ring;
	int per_ring_budget, work_done = 0;
	bool clean_complete = true;

#ifdef CONFIG_IXGBE_DCA
	if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
		ixgbe_update_dca(q_vector);
#endif

	ixgbe_for_each_ring(ring, q_vector->tx) {
		if (!ixgbe_clean_tx_irq(q_vector, ring, budget))
			clean_complete = false;
	}

	/* Exit if we are called by netpoll or busy polling is active */
	if ((budget <= 0) || !ixgbe_qv_lock_napi(q_vector))
		return budget;

	/* attempt to distribute budget to each queue fairly, but don't allow
	 * the budget to go below 1 because we'll exit polling */
	//split the budget evenly across the rx queues
	if (q_vector->rx.count > 1)
		per_ring_budget = max(budget/q_vector->rx.count, 1);
	else
		per_ring_budget = budget;

	ixgbe_for_each_ring(ring, q_vector->rx) {
		int cleaned = ixgbe_clean_rx_irq(q_vector, ring,
						 per_ring_budget);

		work_done += cleaned;
		//if any ring used up its budget share, mark clean_complete as false
		if (cleaned >= per_ring_budget)
			clean_complete = false;
	}

	ixgbe_qv_unlock_napi(q_vector);

	/* If all work not completed, return budget and keep polling */
	//if any rx queue used up its share of the napi budget, there are still packets
	//to receive, so cleaning is not finished; return to napi_poll, which pushes
	//gro_list skbs older than 1 tick up the stack (to bound packet latency) and
	//re-adds napi->poll_list to the repoll list; before exiting, the softirq
	//handler splices repoll back into sd->poll_list and raises the softirq again
	if (!clean_complete)
		return budget;

	//no rx queue used up its budget share, so no packets are left to process;
	//force all gro_list skbs up the protocol stack
	/* all work done, exit the polling mode */
	napi_complete_done(napi, work_done);
	if (adapter->rx_itr_setting & 1)
		ixgbe_set_itr(q_vector);
	if (!test_bit(__IXGBE_DOWN, &adapter->state))
		//re-enable the rx queue interrupt
		ixgbe_irq_enable_queues(adapter, BIT_ULL(q_vector->v_idx));

	return min(work_done, budget - 1);
}
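The fair split can be checked in isolation. per_ring_budget_sim below is a standalone sketch of the computation, not a driver function:

```c
#include <assert.h>

/* Fair per-ring share of the napi budget, clamped so a ring never
 * polls with a budget below 1 (which would end polling immediately). */
static int per_ring_budget_sim(int budget, int rx_count)
{
	if (rx_count > 1) {
		int per = budget / rx_count;  /* integer division */

		return per > 0 ? per : 1;     /* never below 1 */
	}
	return budget;
}
```

For example, the default weight of 64 split over 4 rx rings gives each ring a budget of 16; with more rings than budget units, each ring still gets at least 1.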
If none of the queues used up its budget share, execution enters napi_complete_done: with GRO enabled, all gro_skbs are pushed up the protocol stack, after which ixgbe_irq_enable_queues re-enables the rx queue interrupts, returning to interrupt-driven reception.
void napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	if (n->gro_list) {
		unsigned long timeout = 0;

		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		//timeout defaults to 0, so all gro skbs are pushed up the protocol stack here
		if (timeout && NAPI_STRUCT_HAS(n, timer))
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (likely(list_empty(&n->poll_list))) {
		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
	} else {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		//remove napi->poll_list from sd->poll_list and clear the napi's SCHED state
		__napi_complete(n);
		local_irq_restore(flags);
	}
}
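The GRO flush decision in napi_complete_done reduces to the following sketch (flush_gro_now_sim is an illustrative name, not a kernel function): with the default gro_flush_timeout of 0, held GRO skbs are always flushed up the stack immediately; only a non-zero timeout, combined with work having been done, defers the flush to the hrtimer.

```c
#include <assert.h>
#include <stdbool.h>

/* Returns true when napi_complete_done would call napi_gro_flush()
 * immediately rather than arming the deferred-flush hrtimer. */
static bool flush_gro_now_sim(int work_done, unsigned long gro_flush_timeout)
{
	unsigned long timeout = 0;

	if (work_done)
		timeout = gro_flush_timeout;

	return timeout == 0;  /* 0 -> flush now; non-zero -> hrtimer */
}
```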
3. Open questions
When does reception switch from poll mode back to interrupt mode?
1. In poll mode, once all packets have been processed, it actively switches back to interrupt mode;
2. ????