linux內核arp協議實現詳解

linux內核arp協議實現詳解

 

(neighbour主要用於獲取目標主機的MAC地址,記錄的是ARP表中一條記錄的信息,以下與ARP表項在一定程度上表示一個意思。由於代碼排版問題,部分註釋需要移動滾動條查看。)

1、ARP狀態轉移

1.1 ARP狀態轉移圖

(在收到發給自己的ARP應答時,有些地方沒有做狀態判斷,直接認爲目的主機處於可達狀態。)

1.2 ARP狀態解釋

NUD_INCOMPLETE 未完成狀態(新ARP表項,正在發送ARP請求,尚未獲取ARP應答)
NUD_REACHABLE 目標可達狀態(收到ARP應答/請求,根據協議可以確定目標主機可達)
NUD_STALE ARP過期狀態(reachable時間超時並且最近使用時間超時,很久時間沒有使用該表項)
NUD_DELAY 延遲狀態(reachable時間超時,但是最近使用時間未超時,該ARP表項可能還可以使用)
NUD_PROBE ARP檢測狀態(ARP很久時間沒有更新且後續有可能使用該表項,需要再檢測ARP表項是否有效)
NUD_FAILED ARP無效(獲取不到目標MAC地址,定時器超時後回收該表項內存空間)

1.3 ARP中的時間

neighbour結構體中含有幾個時間,如下表所示:

confirmed ARP表項最近確認時間,在收到ARP報文等情況下可以確認該表項有效時,才更新,用於判斷該表項是否可能有效
updated ARP狀態最近更新時間,僅是狀態更新,狀態轉換都會更新
used ARP最近使用時間,僅表示該ARP表項最近使用過,在發送數據時更新,用來確定該表項多久沒有使用了

2、ARP源碼解釋

2.1 neighbour結構體

neighbour除了之前介紹的幾個時間成員變量之外,還有arp_queue、timer等;arp_queue用於ARP請求時緩存上層協議需要發送的數據,在ARP獲取到目標MAC地址時發送該數據;timer主要對各種狀態計時;成員變量介紹查看如下注釋:

/*
 * One neighbour-table entry (for IPv4: one ARP cache record).
 * It tracks the NUD state of the entry, the timestamps used to age it,
 * the resolved link-layer address and the packets waiting for resolution.
 */
struct neighbour {
	struct neighbour __rcu	*next;
	struct neigh_table	*tbl;
	struct neigh_parms	*parms;
	unsigned long		confirmed; // time the entry was last confirmed to be valid
	unsigned long		updated; // time of the most recent NUD state update
	rwlock_t		lock;
	atomic_t		refcnt;
	struct sk_buff_head	arp_queue; // upper-layer packets held back while the address is unresolved; neigh_update() re-sends them once the entry becomes valid
	unsigned int		arp_queue_len_bytes; // bytes currently held in arp_queue
	struct timer_list	timer; // drives the state machine (see neigh_timer_handler): re-sends unanswered probes and, once the probe limit is reached, moves the entry to NUD_FAILED
	unsigned long		used; // time the entry was last used for sending (set in neigh_event_send)
	atomic_t		probes; // probe counter, compared against neigh_max_probes() by the timer
	__u8			flags;
	__u8			nud_state; // current NUD (ARP) state
	__u8			type;
	__u8			dead;
	seqlock_t		ha_lock;
	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
	struct hh_cache		hh; // cached link-layer header; dst_neigh_output() uses it when hh_len is non-zero
	int			(*output)(struct neighbour *, struct sk_buff *); // transmit handler called by dst_neigh_output(); per the article this is neigh_resolve_output while the neighbour is not connected
	const struct neigh_ops	*ops;
	struct rcu_head		rcu;
	struct net_device	*dev; // network device this entry belongs to
	u8			primary_key[0];
};

2.2 ARP請求發送過程

2.2.1 ip_finish_output

ip報文通過ip_finish_output發送,如下是ip報文發送的過程,網絡層發送到鏈路層再到物理層函數調用順序如下:

ip_finish_output -> ip_finish_output2 -> dst_neigh_output
-> neigh_resolve_output -> neigh_event_send -> __neigh_event_send
-> neigh_probe -> arp_solicit -> arp_send_dst -> arp_xmit_finish
-> dev_queue_xmit -> __dev_queue_xmit -> __dev_xmit_skb
-> sch_direct_xmit -> dev_hard_start_xmit -> xmit_one
-> netdev_start_xmit -> __netdev_start_xmit -> smsc911x_hard_start_xmit

2.2.2 dst_neigh_output

網絡層的數據發送到鏈路層,通過dst_neigh_output添加目的地址、源地址及協議等進行封裝,然後通過網卡發送出去;

dst_neigh_output先檢測鄰居是否處於鏈接狀態,是否已經緩存鏈路層頭,即到目標主機的ARP表是否可用;如果不可用,則調用neigh_resolve_output發送ARP請求,獲取目標主機的物理地址信息(下一跳),函數代碼如下:

/*
 * Hand a packet from the network layer to the link layer.
 *
 * @dst: route entry; a pending confirmation on it refreshes n->confirmed
 * @n:   neighbour (next hop) the packet is sent through
 * @skb: packet to transmit
 *
 * Returns whatever the selected output routine returns.
 */
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *cached_hdr = &n->hh;

	if (dst->pending_confirm) {
		unsigned long stamp = jiffies;

		dst->pending_confirm = 0;
		/* Write only when the value changes, to avoid dirtying the neighbour. */
		if (n->confirmed != stamp)
			n->confirmed = stamp;
	}

	/*
	 * Slow path: the neighbour is not in a connected state or no hardware
	 * header has been cached yet, so let n->output() deal with it (for an
	 * unresolved entry that is the address-resolution path).
	 */
	if (!(n->nud_state & NUD_CONNECTED) || !cached_hdr->hh_len)
		return n->output(n, skb);

	/* Fast path: connected neighbour with a cached hardware header. */
	return neigh_hh_output(cached_hdr, skb);
}


/*
 * Output routine used while the neighbour may still need resolution.
 *
 * @neigh: neighbour entry the packet is routed through
 * @skb:   packet to transmit
 *
 * Returns 0 when neigh_event_send() reports that the packet must not be
 * sent now, -EINVAL (after freeing @skb) when the link-layer header cannot
 * be built, otherwise the result of dev_queue_xmit().
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
	struct net_device *dev;
	unsigned int seq;
	int err;

	/* Non-zero: nothing to transmit from here right now. */
	if (neigh_event_send(neigh, skb))
		return 0;

	dev = neigh->dev;
	if (dev->header_ops->cache && !neigh->hh.hh_len)
		neigh_hh_init(neigh);

	/* Build the hardware header; retry if neigh->ha changed underneath us. */
	do {
		__skb_pull(skb, skb_network_offset(skb));
		seq = read_seqbegin(&neigh->ha_lock);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
				      neigh->ha, NULL, skb->len);
	} while (read_seqretry(&neigh->ha_lock, seq));

	if (err < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* Queue the (data) packet on the device for transmission. */
	return dev_queue_xmit(skb);
}

/*
 * Record that the neighbour is being used and, if needed, kick off
 * address resolution.
 *
 * @neigh: neighbour entry about to be used for transmission
 * @skb:   packet that triggered the event
 *
 * Returns 0 when the entry is already connected, delayed or being probed;
 * otherwise returns the result of __neigh_event_send().
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	const unsigned long stamp = jiffies;

	/* Refresh the last-used time, writing only when it actually changes. */
	if (neigh->used != stamp)
		neigh->used = stamp;

	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb);
}

2.2.3 neigh_resolve_output

arp請求通過neigh_resolve_output函數實現,該函數先調用neigh_event_send,neigh_event_send會更新之前講過的used時間,更新neighbour的ARP狀態,如果發送前已經發送過ARP請求之類的報文(發送此報文之前可能有其他ip報文已經發送了,之前的ip報文觸發了ARP請求,之前的ARP請求還在進行中),則不需要重新ARP請求,避免網絡擁塞,如果沒有發送過則立即發送ARP請求。

neigh_resolve_output代碼如下:

/*
 * Resolve-and-send output routine of a neighbour entry.
 *
 * @neigh: neighbour entry the packet is routed through
 * @skb:   packet to transmit
 *
 * The return value of neigh_event_send() decides whether this function
 * transmits the packet itself: non-zero means it must not, and 0 is
 * returned. Otherwise the link-layer header is built and the packet is
 * queued on the device; a header failure frees @skb and yields -EINVAL.
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
	struct net_device *dev;
	unsigned int seq;
	int err;

	if (neigh_event_send(neigh, skb))
		return 0;

	dev = neigh->dev;
	if (dev->header_ops->cache && !neigh->hh.hh_len)
		neigh_hh_init(neigh);

	/* Re-read neigh->ha under the seqlock until a consistent copy was used. */
	do {
		__skb_pull(skb, skb_network_offset(skb));
		seq = read_seqbegin(&neigh->ha_lock);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
				      neigh->ha, NULL, skb->len);
	} while (read_seqretry(&neigh->ha_lock, seq));

	if (err < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 * The target MAC address is valid, or at least possibly valid, so try
	 * sending with the address we currently have.
	 */
	return dev_queue_xmit(skb);
}

__neigh_event_send代碼如下:

/*
 * Slow path of neigh_event_send(): advance the NUD state machine when a
 * packet is about to be sent through a neighbour that is not known to be
 * reachable.
 *
 * Returns 0 when the caller may go on and transmit the packet, 1 when the
 * packet was queued on arp_queue or freed here and must not be sent by the
 * caller.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	int rc;
	bool immediate_probe = false;

	write_lock_bh(&neigh->lock);

	rc = 0;
	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) // connected, delayed or probing
		// A request has already been sent, or will be triggered by another
		// event, so there is nothing to solicit from here.
		goto out_unlock_bh;
	if (neigh->dead)
		goto out_dead;

	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { // e.g. a freshly created entry: resolution has to be started
		if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
		    NEIGH_VAR(neigh->parms, APP_PROBES)) {
			unsigned long next, now = jiffies;

			atomic_set(&neigh->probes,
				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
			neigh->nud_state     = NUD_INCOMPLETE; // enter NUD_INCOMPLETE
			neigh->updated = now; // record the time of this state change
			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
					 HZ/2); // expiry time for this request
			neigh_add_timer(neigh, next); // arm the timer; neigh_timer_handler re-probes
						      // when it expires
			immediate_probe = true; // no request is in flight yet, so send one
						// right away (see out_unlock_bh)
		} else {
			// Both probe counts are zero: resolution is impossible, fail the
			// entry and drop the packet.
			neigh->nud_state = NUD_FAILED;
			neigh->updated = jiffies;
			write_unlock_bh(&neigh->lock);

			kfree_skb(skb);
			return 1;
		}
	} else if (neigh->nud_state & NUD_STALE) { // the entry is stale
		// The cached address may be outdated but may also still be right.
		// Move to NUD_DELAY and let the caller send with the current
		// address (rc stays 0); if nothing confirms the entry before the
		// delay timer fires, the timer handler switches to NUD_PROBE and
		// starts probing.
		neigh_dbg(2, "neigh %p is delayed\n", neigh);
		neigh->nud_state = NUD_DELAY; // enter NUD_DELAY
		neigh->updated = jiffies; // record the time of this state change
		neigh_add_timer(neigh, jiffies +
				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); // arm the delay-probe timer
	}

	if (neigh->nud_state == NUD_INCOMPLETE) { // address still unresolved: hold the packet back
		if (skb) {
			// Make room: while adding this packet would exceed the
			// QUEUE_LEN_BYTES limit, drop packets from the head of the queue.
			while (neigh->arp_queue_len_bytes + skb->truesize >
			       NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
				struct sk_buff *buff;

				buff = __skb_dequeue(&neigh->arp_queue); // oldest queued packet
				if (!buff)
					break;
				neigh->arp_queue_len_bytes -= buff->truesize; // account for the removed packet
				kfree_skb(buff); // discard it
				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
			}
			skb_dst_force(skb);
			__skb_queue_tail(&neigh->arp_queue, skb); // append the new packet to the queue
			neigh->arp_queue_len_bytes += skb->truesize; // account for the added packet
		}
		rc = 1; // tell the caller not to transmit: the hardware address is not
			// known yet (the earlier branches leave rc at 0 because the
			// address is valid or possibly valid)
	}
out_unlock_bh:
	if (immediate_probe)
		neigh_probe(neigh); // send the first request immediately.
				    // NOTE(review): neigh_probe() appears to release
				    // neigh->lock itself, since only the else-branch
				    // unlocks - confirm against its definition.
	else
		write_unlock(&neigh->lock);
	local_bh_enable();
	return rc;

out_dead:
	// A dead entry that is still STALE takes the normal unlock path with
	// rc == 0; otherwise the packet is dropped.
	if (neigh->nud_state & NUD_STALE)
		goto out_unlock_bh;
	write_unlock_bh(&neigh->lock);
	kfree_skb(skb);
	return 1;
}

2.3 ARP應答過程

2.3.1 報文接收解析過程

arp_process用於處理收到的ARP報文,包括ARP請求和ARP應答。函數調用省略了中間部分函數,主要過程是網卡中斷觸發irq,irq中斷函數做些簡單的處理然後觸發軟中斷,在軟中斷中調用網卡驅動讀取報文數據,解析報文;調用關係如下:

smsc911x_poll -> __netif_receive_skb -> arp_rcv -> arp_process

2.3.2 arp_process

arp_process解析ARP報文,判斷報文是否可靠,並更新ARP狀態等,僅是發給自己的應答報文才可靠,才認爲目標是可達的。

arp_process調用neigh_update更新ARP狀態及統計時間,如果之前有已經緩存的報文並且目標主機可達,neigh_update會將緩存數據發送出去。

/*
 * Handle one received ARP packet (request or reply): validate it, answer
 * requests addressed to us or to an address we proxy for, and update the
 * neighbour table from the sender's addresses.
 *
 * Always returns 0. The skb is consumed here unless it was queued for a
 * delayed proxy reply.
 */
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb->dev; // device the packet was received on
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct arphdr *arp;
	unsigned char *arp_ptr;
	struct rtable *rt;
	unsigned char *sha;
	__be32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct neighbour *n;
	struct dst_entry *reply_dst = NULL;
	bool is_garp = false;

	/* arp_rcv below verifies the ARP header and verifies the device
	 * is ARP'able.
	 */

	if (!in_dev)
		goto out;

	arp = arp_hdr(skb); // ARP header of the packet

	switch (dev_type) {
	default:
		if (arp->ar_pro != htons(ETH_P_IP) ||
		    htons(dev_type) != arp->ar_hrd)
			goto out;
		break;
	case ARPHRD_ETHER:
	case ARPHRD_FDDI:
	case ARPHRD_IEEE802:
		/*
		 * ETHERNET, and Fibre Channel (which are IEEE 802
		 * devices, according to RFC 2625) devices will accept ARP
		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
		 * This is the case also of FDDI, where the RFC 1390 says that
		 * FDDI devices should accept ARP hardware of (1) Ethernet,
		 * however, to be more robust, we'll accept both 1 (Ethernet)
		 * or 6 (IEEE 802.2)
		 */
		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
		    arp->ar_pro != htons(ETH_P_IP))
			goto out;
		break;
	case ARPHRD_AX25:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_AX25))
			goto out;
		break;
	case ARPHRD_NETROM:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_NETROM))
			goto out;
		break;
	}

	/* Understand only these message types */

	if (arp->ar_op != htons(ARPOP_REPLY) &&
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out; // neither an ARP request nor an ARP reply: ignore

/*
 *	Extract fields
 */
	arp_ptr = (unsigned char *)(arp + 1);
	sha	= arp_ptr;
	arp_ptr += dev->addr_len;
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4;
	switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
	case ARPHRD_IEEE1394:
		break;
#endif
	default:
		arp_ptr += dev->addr_len;
	}
	memcpy(&tip, arp_ptr, 4);
/*
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	if (ipv4_is_multicast(tip) ||
	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
		goto out;

/*
 *     Special case: We must set Frame Relay source Q.922 address
 */
	if (dev_type == ARPHRD_DLCI)
		sha = dev->broadcast;

/*
 *  Process entry.  The idea here is we want to send a reply if it is a
 *  request for us or if it is a request for someone else that we hold
 *  a proxy for.  We want to add an entry to our cache if it is a reply
 *  to us or if it is a request for our address.
 *  (The assumption for this last is that if someone is requesting our
 *  address, they are probably intending to talk to us, so it saves time
 *  if we cache their address.  Their address is also probably not in
 *  our cache, since ours is not in their cache.)
 *
 *  Putting this another way, we only care about replies if they are to
 *  us, in which case we add them to the cache.  For requests, we care
 *  about those for us and those for our proxies.  We reply to both,
 *  and in the case of requests for us we add the requester to the arp
 *  cache.
 */

	if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
		reply_dst = (struct dst_entry *)
			    iptunnel_metadata_reply(skb_metadata_dst(skb),
						    GFP_ATOMIC);

	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) {
		if (arp->ar_op == htons(ARPOP_REQUEST) &&
		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
		    !arp_ignore(in_dev, sip, tip))
			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
				     sha, dev->dev_addr, sha, reply_dst);
		goto out;
	}

	// ARP request for which an input route from sip to tip can be found.
	// NOTE(review): the original annotation states that ip_route_input_noref
	// creates the input route when none exists (input handler
	// ip_local_deliver, output handler ip_rt_bug) and that this route is
	// used for input only - not verifiable from this snippet, confirm
	// against the routing code.
	if (arp->ar_op == htons(ARPOP_REQUEST) &&
	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {

		rt = skb_rtable(skb);
		addr_type = rt->rt_type;

		// The route type says the target address is local to this host.
		if (addr_type == RTN_LOCAL) {
			int dont_send;

			dont_send = arp_ignore(in_dev, sip, tip); // should this request be left unanswered?
			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
				dont_send = arp_filter(sip, tip, dev);
			if (!dont_send) {
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev); // neighbour entry for the sender (sip/sha) on dev; presumably created or refreshed by this call - confirm
				if (n) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst); // we have an entry for the sender: send the ARP reply
					neigh_release(n);
				}
			}
			goto out;
		} else if (IN_DEV_FORWARD(in_dev)) { // forwarding is enabled on the receiving device (proxy ARP and similar)
			if (addr_type == RTN_UNICAST  &&
			    (arp_fwd_proxy(in_dev, dev, rt) ||
			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
			     (rt->dst.dev != dev &&
			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
				    skb->pkt_type == PACKET_HOST ||
				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst); // proxy reply: answer for tip (not one of our own addresses) back to sip
				} else {
					pneigh_enqueue(&arp_tbl,
						       in_dev->arp_parms, skb); // delayed reply: queue the skb; per the original annotation it is answered when the proxy timer expires
					goto out_free_dst;
				}
				goto out;
			}
		}
	}

	/* Update our ARP tables */

	n = __neigh_lookup(&arp_tbl, &sip, dev, 0); // look up the entry for sip; the final 0 presumably means "do not create" (the call below passes 1 when an entry should be created) - confirm

	if (IN_DEV_ARP_ACCEPT(in_dev)) {
		unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);

		/* Unsolicited ARP is not accepted by default.
		   It is possible, that this option should be enabled for some
		   devices (strip is candidate)
		 */
		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
			  addr_type == RTN_UNICAST;

		if (!n &&
		    ((arp->ar_op == htons(ARPOP_REPLY)  &&
				addr_type == RTN_UNICAST) || is_garp))
			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
	}

	if (n) {
		int state = NUD_REACHABLE; // assume reachable unless downgraded below
		int override;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		override = time_after(jiffies,
				      n->updated +
				      NEIGH_VAR(n->parms, LOCKTIME)) ||
			   is_garp; // the entry was last updated more than LOCKTIME ago (or this is a gratuitous ARP): allow the new address to replace the cached one

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		if (arp->ar_op != htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE; // a request, or a packet not addressed to this host, is not reliable proof of reachability
		// With NUD_STALE the entry can still be used for the next
		// transmission; that use moves it through DELAY to PROBE, and the
		// unicast reply to the probe comes back through this function as a
		// reliable NUD_REACHABLE update.
		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
		neigh_release(n); // drop our reference to the entry
	}

out:
	consume_skb(skb);
out_free_dst:
	dst_release(reply_dst);
	return 0;
}

2.3.3 neigh_update

neigh_update更新ARP表項狀態及物理地址信息,獲取到的物理地址有可能前後不一致,代碼註釋如下:

/*
 * Update the state and the link-layer address of a neighbour entry.
 *
 * @neigh:  entry to update
 * @lladdr: newly learned link-layer address, or NULL when none is supplied
 * @new:    requested NUD state
 * @flags:  NEIGH_UPDATE_F_* flags (override, weak override, admin, router)
 *
 * Returns 0 on success, -EPERM when a NOARP/PERMANENT entry is touched
 * without NEIGH_UPDATE_F_ADMIN or the entry is dead, -EINVAL when no
 * address is supplied and none is cached. When the entry turns valid, the
 * packets waiting on arp_queue are sent.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
		 u32 flags)
{
	u8 old;
	int err;
	int notify = 0;
	struct net_device *dev;
	int update_isrouter = 0;

	write_lock_bh(&neigh->lock);

	dev    = neigh->dev;
	old    = neigh->nud_state;
	err    = -EPERM;

	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
	    (old & (NUD_NOARP | NUD_PERMANENT)))
		goto out;
	if (neigh->dead)
		goto out;

	// The requested state is not a valid one: stop the timer, switch the
	// state, and invalidate the entry if resolution or probing just failed.
	if (!(new & NUD_VALID)) {
		neigh_del_timer(neigh);
		if (old & NUD_CONNECTED)
			neigh_suspect(neigh);
		neigh->nud_state = new;
		err = 0;
		notify = old & NUD_VALID;
		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
		    (new & NUD_FAILED)) {
			neigh_invalidate(neigh);
			notify = 1;
		}
		goto out;
	}

	/* Compare new lladdr with cached one */
	if (!dev->addr_len) {
		/* First case: device needs no address. */
		lladdr = neigh->ha;
	} else if (lladdr) {
		/* The second case: if something is already cached
		   and a new address is proposed:
		   - compare new & old
		   - if they are different, check override flag
		 */
		if ((old & NUD_VALID) &&
		    !memcmp(lladdr, neigh->ha, dev->addr_len)) // the new address equals the cached one
			lladdr = neigh->ha;
	} else {
		/* No address is supplied; if we know something,
		   use it, otherwise discard the request.
		 */
		err = -EINVAL;
		if (!(old & NUD_VALID))
			goto out;
		lladdr = neigh->ha; // fall back to the cached address
	}

	// From here on, lladdr == neigh->ha means "address unchanged".
	if (new & NUD_CONNECTED)
		neigh->confirmed = jiffies; // refresh the confirmation time
	neigh->updated = jiffies;

	/* If entry was valid and address is not changed,
	   do not change entry state, if new one is STALE.
	 */
	err = 0;
	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
	if (old & NUD_VALID) {
		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
			update_isrouter = 0;
			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
			    (old & NUD_CONNECTED)) {
				lladdr = neigh->ha; // the target was reachable: keep the cached address instead of the new one
				new = NUD_STALE; // but mark the entry stale, so the next use leads (after a delay) to a probe; until then the old address is used
			} else
				goto out; // address differs and overriding is not allowed: reject the new address
		} else {
			if (lladdr == neigh->ha && new == NUD_STALE &&
			    ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
			     (old & NUD_CONNECTED))
			    )
				new = old; // the caller asked for STALE, but the address is unchanged and the
					   // entry was connected (or weak override is set): keep the old state
		}
	}

	if (new != old) {
		neigh_del_timer(neigh); // stop the timer of the old state
		if (new & NUD_PROBE)
			atomic_set(&neigh->probes, 0); // entering NUD_PROBE: restart the probe counter
		if (new & NUD_IN_TIMER)
			neigh_add_timer(neigh, (jiffies +
						((new & NUD_REACHABLE) ?
						 neigh->parms->reachable_time :
						 0))); // arm the timer for the new state
		neigh->nud_state = new;
		notify = 1;
	}

	if (lladdr != neigh->ha) {
		write_seqlock(&neigh->ha_lock);
		memcpy(&neigh->ha, lladdr, dev->addr_len); // address changed and was accepted above: store it (rejected changes already jumped to out)
		write_sequnlock(&neigh->ha_lock);
		neigh_update_hhs(neigh); // refresh the cached hardware header(s)
		if (!(new & NUD_CONNECTED)) // new address without proof of reachability: back-date confirmed so that the timer logic ends up probing the entry
			neigh->confirmed = jiffies -
				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
		notify = 1;
	}
	if (new == old)
		goto out; // state unchanged: nothing more to do
	if (new & NUD_CONNECTED)
		neigh_connect(neigh); // now connected: packets can be sent directly without further ARP traffic
	else
		neigh_suspect(neigh);
	if (!(old & NUD_VALID)) { // the entry was not valid before this update
		struct sk_buff *skb;

		/* Again: avoid dead loop if something went wrong */

		while (neigh->nud_state & NUD_VALID &&
		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
			// The entry went from invalid to valid: send the packets that
			// were queued while the address was unknown. The lock is
			// dropped around each transmission.
			struct dst_entry *dst = skb_dst(skb);
			struct neighbour *n2, *n1 = neigh;
			write_unlock_bh(&neigh->lock);

			rcu_read_lock();

			/* Why not just use 'neigh' as-is?  The problem is that
			 * things such as shaper, eql, and sch_teql can end up
			 * using alternative, different, neigh objects to output
			 * the packet in the output path.  So what we need to do
			 * here is re-lookup the top-level neigh in the path so
			 * we can reinject the packet there.
			 */
			n2 = NULL;
			if (dst) {
				n2 = dst_neigh_lookup_skb(dst, skb); // look the neighbour up through the packet's dst
				if (n2)
					n1 = n2; // prefer the neighbour found via dst; normally it is the same entry
			}
			n1->output(n1, skb); // transmit the queued packet
			if (n2)
				neigh_release(n2);
			rcu_read_unlock();

			write_lock_bh(&neigh->lock);
		}
		__skb_queue_purge(&neigh->arp_queue);
		neigh->arp_queue_len_bytes = 0;
	}
out:
	if (update_isrouter) {
		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
			(neigh->flags | NTF_ROUTER) :
			(neigh->flags & ~NTF_ROUTER);
	}
	write_unlock_bh(&neigh->lock);

	if (notify)
		neigh_update_notify(neigh);

	return err;
}

 

2.4 ARP定時器

2.4.1 run_timer_softirq

ARP狀態更新都會設置定時器,同時定時也會觸發ARP狀態的改變,目的就是避免頻繁發送ARP報文並保持ARP數據有效,即使ARP數據過期也儘可能在一定機制下獲取有效的ARP數據。

ARP定時器處理函數爲neigh_timer_handler,定時器使用系統時鐘中斷處理,在時鐘中斷裏面計數,對超時的定時器調用相應的處理函數,調用過程主要如下:

gic_handle_irq -> irq_exit -> run_timer_softirq -> call_timer_fn
-> neigh_timer_handler

2.4.2 neigh_timer_handler

在ARP表項更新時通常都會激活定時器,有些狀態需要重傳數據,有些狀態僅起延遲作用(從一個狀態到另外一個狀態的過渡),有些狀態則需要判斷自己的狀態是否有效,最終都是通過neigh_timer_handler來處理;neigh_timer_handler主要根據當前狀態及之前介紹的幾個時間變量來決策該進入哪一個狀態,可以參考前面的狀態圖理解該函數,代碼及解釋如下:

/*
 * Timer callback of a neighbour entry. Depending on the current NUD state
 * and on the confirmed/used timestamps it keeps the entry reachable, moves
 * it to DELAY, STALE, PROBE or FAILED, re-arms the timer and re-sends
 * probes.
 *
 * @arg: the struct neighbour pointer, cast to unsigned long
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned int state;
	int notify = 0;

	write_lock(&neigh->lock);

	state = neigh->nud_state; // snapshot of the current state
	now = jiffies;
	next = now + HZ;

	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		// Has reachability expired? The reference is 'confirmed', the last
		// time the entry was confirmed valid. If not expired, just schedule
		// the next check for the expiry time.
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			// Reachability expired but the entry was used recently, so it is
			// likely to be needed again. Move to NUD_DELAY: wait for
			// DELAY_PROBE_TIME, and if nothing else changes the state in the
			// meantime, the DELAY branch below starts probing.
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			neigh->nud_state = NUD_DELAY;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
		} else {
			// Neither confirmed nor used for a long time: mark the entry
			// NUD_STALE. The timer is not re-armed on this path (the re-arm
			// below is guarded by NUD_IN_TIMER), so the entry simply stays as
			// it is until some other event touches it.
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			neigh->nud_state = NUD_STALE;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) {
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			// 'confirmed' was old when DELAY was entered, but it has been
			// refreshed since, so the target is known to be reachable and no
			// probe is needed.
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			neigh->nud_state = NUD_REACHABLE; // recently confirmed: back to reachable
			neigh->updated = jiffies;
			neigh_connect(neigh); // per the original annotation this switches the entry's output routine away from the resolving one
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else {
			// No confirmation arrived during the delay: start probing.
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			neigh->nud_state = NUD_PROBE;
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0);
			notify = 1;
			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
		}
	} else {
		/* NUD_PROBE|NUD_INCOMPLETE */
		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); // only the retransmit interval is scheduled here; the probe itself is sent further down
	}

	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		neigh->nud_state = NUD_FAILED; // probe limit reached without learning the address: treat the target as unreachable
		// NOTE(review): the original annotation says the memory of a FAILED
		// entry is reclaimed later by another timer - not visible here.
		notify = 1;
		neigh_invalidate(neigh);
		goto out;
	}

	if (neigh->nud_state & NUD_IN_TIMER) {
		// Only states that need a timer are re-armed (STALE, for example,
		// is not); the next expiry is at least HZ/2 away.
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		if (!mod_timer(&neigh->timer, next))
			neigh_hold(neigh);
	}
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		neigh_probe(neigh); // the previous probe timed out: send another one.
				    // NOTE(review): neigh_probe() appears to release
				    // neigh->lock, since only the else-branch unlocks - confirm.
	} else {
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh);

	neigh_release(neigh); // drop a reference on the entry
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章