tcp_v4_rcv

 

The holiday is over; let's pick up from ip_local_deliver, which we covered last time.

int ip_local_deliver(struct sk_buff *skb)
{
	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))/* reassemble fragments into a full datagram first */
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

 

 

static int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	__skb_pull(skb, ip_hdrlen(skb)); /* strip the L3 (IP) header */

	/* Point into the IP datagram, just past the header. */
	skb_reset_transport_header(skb);

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		int hash, raw;
		const struct net_protocol *ipprot;

	resubmit:
		raw = raw_local_deliver(skb, protocol);/* hand it to a raw-socket handler first, if one exists */
//.....

		hash = protocol & (MAX_INET_PROTOS - 1);
//.....
			ret = ipprot->handler(skb);/* the key call: for TCP this is tcp_v4_rcv */
//.....
			kfree_skb(skb);
		}
	}
 out:
	rcu_read_unlock();

	return 0;
}
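For context, the part elided above is the protocol dispatch: the protocol number indexes the inet_protos[] table under RCU, and a negative return from the handler asks for a resubmit under a different protocol number. Roughly (paraphrased from kernels of this era, not quoted verbatim):

		ipprot = rcu_dereference(inet_protos[hash]);
		if (ipprot != NULL) {
			int ret;

			/* ... netns check and xfrm policy check elided ... */
			ret = ipprot->handler(skb);	/* for TCP this is tcp_v4_rcv */
			if (ret < 0) {
				protocol = -ret;	/* negative return: resubmit as another protocol */
				goto resubmit;
			}
			IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
		} else {
			/* no kernel handler: if no raw socket took it either, send an
			 * ICMP "protocol unreachable", then kfree_skb() */
		}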

 

 

The handler registered here for TCP over IPv4 is tcp_v4_rcv(struct sk_buff *skb).
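For reference, the registration lives in net/ipv4/af_inet.c and looks roughly like this (paraphrased from kernels of this era; the struct has a few more fields such as the GSO/GRO hooks, and the exact error message may differ):

static const struct net_protocol tcp_protocol = {
	.handler	= tcp_v4_rcv,	/* called through ipprot->handler() above */
	.err_handler	= tcp_v4_err,
	.no_policy	= 1,
	.netns_ok	= 1,
};

/* during inet_init(): */
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
	printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");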

Now let's look at today's protagonist.

 

 

int tcp_v4_rcv(struct sk_buff *skb)
{
		const struct iphdr *iph;
		struct tcphdr *th;
		struct sock *sk;
		int ret;
		struct net *net = dev_net(skb->dev);
	
		if (skb->pkt_type != PACKET_HOST)/* only handle packets addressed to this host */
			goto discard_it;
	
		/* Count it even if it's bad */
		TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);/* SNMP MIB counter, discussed earlier */
	
		if (!pskb_may_pull(skb, sizeof(struct tcphdr)))/* drop if there are not even enough bytes for a basic TCP header */
			goto discard_it;
	
		th = tcp_hdr(skb);
		/* doff is a 4-bit count of 32-bit words (header can be at most 60 bytes), hence the /4 */
		if (th->doff < sizeof(struct tcphdr) / 4)
			goto bad_packet;
		if (!pskb_may_pull(skb, th->doff * 4))
			goto discard_it;
	
		/* An explanation is required here, I think.
		 * Packet length and doff are validated by header prediction,
		 * provided case of th->doff==0 is eliminated.
		 * So, we defer the checks. *//* checksum validation */
		if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
			goto bad_packet;
	
		th = tcp_hdr(skb);/* below, header fields are cached in the sk_buff's 48-byte cb[] area */
		iph = ip_hdr(skb);
		TCP_SKB_CB(skb)->seq = ntohl(th->seq);
		TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
						skb->len - th->doff * 4);/* one past this segment, i.e. the next seq we expect */
		TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);/* acknowledgment number */
		TCP_SKB_CB(skb)->when	 = 0;/* 'when' is used for RTT/retransmit timing on the send side; just cleared here */
		TCP_SKB_CB(skb)->flags	 = iph->tos;/* surprisingly the IP TOS makes it all the way up to L4 */
		TCP_SKB_CB(skb)->sacked  = 0;
			/* find the struct sock for this connection; __inet_lookup_established is covered later */
		sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
		if (!sk)
			goto no_tcp_socket;
	
	process:
		if (sk->sk_state == TCP_TIME_WAIT)/* the connection is already in TIME_WAIT, so handle it separately */
			goto do_time_wait;
		/* the user set IP_MINTTL via do_ip_setsockopt; if the packet's TTL is below that minimum, drop it */
		if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
			NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
			goto discard_and_relse;
		}
		/* the xfrm security framework again: check the IPsec policy */
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			goto discard_and_relse;
		nf_reset(skb);
		/* run the socket's BPF filter, the mechanism programs like tcpdump rely on;
		   installed via sock_setsockopt: SO_ATTACH_FILTER */
		if (sk_filter(sk, skb))
			goto discard_and_relse;
	
		skb->dev = NULL;
		/* grab the socket spinlock (we are already running in softirq context here) */
		bh_lock_sock_nested(sk);
		ret = 0;/* the key decision: if the sock is not currently locked by a user context (e.g. a syscall in progress) */
		if (!sock_owned_by_user(sk)) {/* then try the prequeue path first (the common case) */
#ifdef CONFIG_NET_DMA
			struct tcp_sock *tp = tcp_sk(sk);
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
			if (tp->ucopy.dma_chan)
				ret = tcp_v4_do_rcv(sk, skb);
			else
#endif
			{	/* this function is shown below; quite interesting */
				if (!tcp_prequeue(sk, skb))
					ret = tcp_v4_do_rcv(sk, skb);/* the main receive path; analysed later */
			}
		} else if (unlikely(sk_add_backlog(sk, skb))) {/* besides ucopy.prequeue there is of course also the sk_backlog queue */
			bh_unlock_sock(sk);
			NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
			goto discard_and_relse;
		}
		bh_unlock_sock(sk);
	
		sock_put(sk);
	
		return ret;
	
	no_tcp_socket:
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto discard_it;
	
		if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
	bad_packet:
			TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		} else {
			tcp_v4_send_reset(NULL, skb);/* no socket for this segment, so answer with a RST; the function is discussed later */
		}
	
	discard_it:
		/* Discard frame. */
		kfree_skb(skb);
		return 0;
	
	discard_and_relse:
		sock_put(sk);
		goto discard_it;
	
	do_time_wait:
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
			inet_twsk_put(inet_twsk(sk));
			goto discard_it;
		}
	
		if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
			TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
			inet_twsk_put(inet_twsk(sk));
			goto discard_it;
		}/* the key is the next function: it handles every sub-state of TIME_WAIT; feel free to jump ahead and read it first */
		switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
		case TCP_TW_SYN: {
			struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),/* accept this connection; the lookup is covered later */
								&tcp_hashinfo,
								iph->daddr, th->dest,
								inet_iif(skb));
			if (sk2) {
				inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
				inet_twsk_put(inet_twsk(sk));
				sk = sk2;
				goto process;/* this connection goes back up to process the packet */
			}
			/* Fall through to ACK */
		}
		case TCP_TW_ACK:
			tcp_v4_timewait_ack(sk, skb);/* answer with an ACK */
			break;
		case TCP_TW_RST:
			goto no_tcp_socket;/* I rather like this case */
		case TCP_TW_SUCCESS:;
		}
		goto discard_it;
	}
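A quick user-space aside on the min_ttl check above: it is driven by the IP_MINTTL socket option (added to the kernel around this era). A minimal sketch of setting it; the value 255 follows the GTSM idea that directly connected peers send TTL 255:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_MINTTL
#define IP_MINTTL 21		/* from <linux/in.h>; older libc headers may not define it */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int min_ttl = 255;	/* GTSM-style: only accept segments arriving with TTL 255 */

	/* after this, tcp_v4_rcv drops segments whose iph->ttl < inet_sk(sk)->min_ttl */
	if (setsockopt(fd, IPPROTO_IP, IP_MINTTL, &min_ttl, sizeof(min_ttl)) < 0)
		perror("setsockopt(IP_MINTTL)");
	return 0;
}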

 

You are welcome to jump ahead and read tcp_timewait_state_process first.

 

A few words first: while reading the source, you should be quite pleased whenever you see an inet_twsk_put() call, because it means the TIME_WAIT sock is being released and will stop occupying kernel memory.
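To be precise, inet_twsk_put() only drops one reference; the timewait bucket is actually freed when the last reference goes away. Roughly (paraphrased, not the verbatim kernel source):

void inet_twsk_put(struct inet_timewait_sock *tw)
{
	if (atomic_dec_and_test(&tw->tw_refcnt))
		inet_twsk_free(tw);	/* run the twsk destructor and kmem_cache_free() the bucket */
}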

 

Likewise, whenever you see return TCP_TW_SUCCESS you should also be pleased, because everything has run its course; any other return value branches off into further processing.

 

For now it is enough to know what each value means; the detailed analysis follows.

enum tcp_tw_status {
	TCP_TW_SUCCESS = 0,	/* a delayed segment or a duplicate ACK: just drop it */
	TCP_TW_RST = 1,		/* answer with a RST */
	TCP_TW_ACK = 2,		/* answer the received segment with an ACK */
	TCP_TW_SYN = 3		/* reopen the connection; see the analysis below */
};

 

One more function worth noting:

inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
		   TCP_TIMEWAIT_LEN);

It schedules how long the TIME_WAIT socket stays alive; more on this below.
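For reference, the timeout constant comes from include/net/tcp.h; the third argument of inet_twsk_schedule() is the actual timeout and the fourth is the nominal TIME_WAIT length used when picking the timer slot (my reading of this kernel era):

#define TCP_TIMEWAIT_LEN (60*HZ)	/* how long to wait to destroy TIME-WAIT
					 * state, about 60 seconds */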

 

enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	/* tcp_timewait_sock extends the minimal inet_timewait_sock */
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	int paws_reject = 0;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {/* the header carries extra TCP options */
		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);/* PAWS: decide whether the segment's timestamp is stale */
		}
	}
	/* clearly still waiting for the peer's FIN; experienced readers know the peer's program is probably stuck in CLOSE_WAIT */
	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* case F.1: if the segment's seq is not inside the receive window, just send an ACK (a bare acknowledgment) and we are done */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		if (th->rst)/* case F.2: a RST is the happiest outcome! (I remember using this RST trick on HP-UX) */
			goto kill;
		/* case F.3: a new segment carrying SYN; per the protocol we answer it with a RST (remembering every RST case pays off) */
		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		if (!th->ack ||/* case F.4: not an ACK, or a segment we have already received (old seq): meaningless, we are done */
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);/* drop our reference */
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||/* case F.5: if the segment is not a FIN after all, or turns out to carry new data, reset the connection */
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);/* done */
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state. */
		tw->tw_substate	  = TCP_TIME_WAIT;/* case F.6: the FIN finally arrived; move into real TIME_WAIT */
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;/* record the end of the final segment */
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
		}
		/* /proc: tcp_tw_recycle enables fast recycling */
		if (tcp_death_row.sysctl_tw_recycle &&/* tcp_tw_remember_stamp() re-sets inet_peer->dtime */
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,/* timeout derived from the RTO; analysed later */
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state. 
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it: (explained below when we reach this case)
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */
	/* below: handling for a sock that is truly in the TIME_WAIT state */
	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&/* the segment's seq must be exactly the one we expect */
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (th->rst) {/* handle RST first */
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 *//* whether a RST in TIME_WAIT tears the sock down depends on whether we honour RFC 1337 (/proc: tcp_rfc1337) */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				inet_twsk_deschedule(tw, &tcp_death_row);/* note the kill label is also reached from FIN_WAIT2 on a RST: the timewait sock is removed immediately */
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();/* refresh the timestamp */
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment. Below: handling of segments whose seq is out of range.

	   All the segments are ACKed immediately.

	   The only exception is new SYN. We accept it, if it is
	   not old duplicate and we are not in danger to be killed
	   by delayed old duplicates. RFC check is that it has
	   newer sequence number works at rates <40Mbit/sec.
	   However, if paws works, it is reliable AND even more,
	   we even may relax silly seq space cutoff.

	   RED-PEN: we violate main RFC requirement, if this SYN will appear
	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
	   we must return socket to time-wait state. It is not good,
	   but not fatal yet.
	 */
	/* I used to think it was a BSD-socket peculiarity that a new SYN whose seq is beyond the last
	   recorded rcv_nxt is accepted instead of dropped; in fact this is what RFC 1122 mandates */
	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;/* follow this return value and everything falls into place */
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {/* out-of-window data, SYNs and the like are simply ACKed */
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is ACKless SYN it may be both old duplicate
		 * and new good SYN with random sequence number <rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		if (paws_reject || th->ack)/* a bare SYN should not extend the TIME_WAIT lifetime */
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by the caller; as noted, freeing is left to the caller.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
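A side note: every seq comparison above (before(), after(), !before()) relies on wrap-safe 32-bit serial arithmetic. A minimal sketch of the idiom, mirroring what before()/after() in include/net/tcp.h do (the names here are illustrative):

#include <stdint.h>

/* true if s1 comes before s2, even across a 2^32 wrap */
static inline int seq_before(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) < 0;
}
#define seq_after(s2, s1)	seq_before(s1, s2)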

 

 

 

 
