TCP爲每條連接建立七個定時器,依次爲:連接建立定時器、重傳定時器、延時ACK定時器、持續定時器、保活定時器、FIN_WAIT_2定時器和TIME_WAIT定時器。實際上,爲了提高效率,內核中只使用了四個定時器來完成七個定時器的功能。
TCP定時器的實現涉及以下文件:
net/ipv4/tcp_timer.c TCP的定時器
net/ipv4/inet_connection_sock.c 基於連接的傳輸控制塊實現
net/ipv4/tcp_output.c TCP的輸出
net/ipv4/tcp_input.c TCP的輸入
初始化
傳輸控制塊定時器的初始化函數tcp_init_xmit_timers()在創建套接口、傳輸控制塊時被調用。它只是對inet_csk_init_xmit_timers()的一層封裝,將TCP的三個定時器處理函數註冊到傳輸控制塊上:
/*
 * Install the three timer callbacks of a connection-oriented socket.
 *
 * @sk:                 socket whose timers are being set up
 * @retransmit_handler: callback for icsk_retransmit_timer
 * @delack_handler:     callback for icsk_delack_timer
 * @keepalive_handler:  callback for sk->sk_timer
 *
 * Every callback receives the socket pointer, cast to unsigned long, as its
 * argument.  The timers are only set up here, not armed; the pending markers
 * for both the transmit timer and the ACK timer are reset to zero.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *csk = inet_csk(sk);
	const unsigned long cookie = (unsigned long)sk;

	setup_timer(&csk->icsk_retransmit_timer, retransmit_handler, cookie);
	setup_timer(&csk->icsk_delack_timer, delack_handler, cookie);
	setup_timer(&sk->sk_timer, keepalive_handler, cookie);

	/* No timer event is outstanding on a freshly initialised socket. */
	csk->icsk_ack.pending = 0;
	csk->icsk_pending = 0;
}
/*
 * TCP wrapper around inet_csk_init_xmit_timers(): registers the TCP
 * handlers for the three per-socket timers of @sk.
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk,
				  tcp_write_timer,	/* icsk_retransmit_timer */
				  tcp_delack_timer,	/* icsk_delack_timer */
				  tcp_keepalive_timer);	/* sk->sk_timer */
}
連接建立定時器
連接建立定時器用於被動建立連接的場景:服務端收到客戶端的SYN並發送SYN+ACK段後啓動該定時器,等待客戶端的ACK段以完成新連接的建立;超時幾次後,連接建立將中止。
連接建立定時器處理函數會刪除那些嘗試連接的次數達到上限還未完成連接建立的傳輸控制塊,該上限由inet_connection_sock結構的icsk_syn_retries給出。
連接建立定時器的處理函數爲tcp_keepalive_timer(),該函數同時實現了TCP中的三個定時器:連接建立定時器、保活定時器和FIN_WAIT_2定時器。之所以可以共用,是因爲這三個定時器分別工作在LISTEN、ESTABLISHED和FIN_WAIT2三種互斥的狀態下,同一時刻最多只有一個生效,因此共用一個定時器(sk_timer)即可,處理函數根據套接口當前狀態區分處理。
/*
 * Callback registered for sk->sk_timer by tcp_init_xmit_timers().
 *
 * One handler serves three purposes, selected by the socket state:
 *   - TCP_LISTEN:                  delegate to tcp_synack_timer()
 *   - TCP_FIN_WAIT2 and SOCK_DEAD: move to time-wait or reset the connection
 *   - otherwise, with SOCK_KEEPOPEN set: keepalive probing
 *
 * @data: the socket pointer, passed as an unsigned long.
 *
 * The body runs between bh_lock_sock() and bh_unlock_sock(); every exit path
 * ends with sock_put(sk).
 * NOTE(review): sock_put() presumably drops the reference taken when the
 * timer was armed - confirm against sk_reset_timer().
 */
static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Delay handed to inet_csk_reset_keepalive_timer() at "resched". */
__u32 elapsed;
/* Only process if socket is not in use. */
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
}
/* Listening socket: this timer acts as the connection-establishment timer. */
if (sk->sk_state == TCP_LISTEN) {
tcp_synack_timer(sk);
goto out;
}
/* Orphaned socket in FIN_WAIT2: this timer acts as the FIN_WAIT_2 timer. */
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
/* Time left once TCP_TIMEWAIT_LEN is subtracted from tcp_fin_time(). */
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out;
}
}
/* Negative linger2 or no time left: reset the peer and finish the socket. */
tcp_send_active_reset(sk, GFP_ATOMIC);
goto death;
}
/* From here on: keepalive handling, only if enabled and not closed. */
if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
goto out;
/* Default delay used when jumping straight to "resched" below. */
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
if (tp->packets_out || tcp_send_head(sk))
goto resched;
/* Time since tp->rcv_tstamp. */
elapsed = tcp_time_stamp - tp->rcv_tstamp;
if (elapsed >= keepalive_time_when(tp)) {
/* Probe budget exhausted: reset the peer and report a write error. */
if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
/* NOTE(review): no tcp_done() on this path - presumably
 * tcp_write_err() finishes the socket itself; confirm.
 */
tcp_write_err(sk);
goto out;
}
/* A result <= 0 counts as a probe sent; next run after the probe interval. */
if (tcp_write_wakeup(sk) <= 0) {
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
* try harder.
*/
elapsed = TCP_RESOURCE_PROBE_INTERVAL;
}
} else {
/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
elapsed = keepalive_time_when(tp) - elapsed;
}
TCP_CHECK_TIMER(sk);
sk_mem_reclaim(sk);
/* Re-arm the timer with the delay computed above. */
resched:
inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
/* Reached only from the FIN_WAIT2 branch after the reset was sent. */
death:
tcp_done(sk);
/* Common exit: release the bh lock and drop one socket reference. */
out:
bh_unlock_sock(sk);
sock_put(sk);
}
重傳定時器
重傳定時器在TCP發送數據時設定,如果定時器已超時而對端確認還未到達,則TCP將重傳數據。重傳定時器的超時時間值是動態計算的,取決於TCP爲該連接測量的往返時間以及該段已被重傳的次數
/*
 * Callback for icsk_retransmit_timer.
 *
 * @data: the socket pointer, passed as an unsigned long.
 *
 * The timer is shared by two events; icsk_pending records which one is
 * outstanding (ICSK_TIME_RETRANS or ICSK_TIME_PROBE0) and decides whether
 * tcp_retransmit_timer() or tcp_probe_timer() is run.  The body executes
 * between bh_lock_sock() and bh_unlock_sock(), and every path ends with
 * sock_put(sk).
 */
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int pending_event;

	bh_lock_sock(sk);

	if (sock_owned_by_user(sk)) {
		/* Socket is held by a user context: retry after HZ/20 ticks. */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       jiffies + (HZ / 20));
		goto out_unlock;
	}

	/* Nothing to do for a closed socket or when no event is pending. */
	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;

	/* Fired before the recorded deadline: re-arm for the deadline. */
	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
			       icsk->icsk_timeout);
		goto out;
	}

	/* Consume the pending event before dispatching on it. */
	pending_event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	if (pending_event == ICSK_TIME_RETRANS)
		tcp_retransmit_timer(sk);
	else if (pending_event == ICSK_TIME_PROBE0)
		tcp_probe_timer(sk);

	TCP_CHECK_TIMER(sk);

out:
	sk_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
延時確認定時器
延時ACK定時器在TCP收到必須被確認但無需馬上發出確認的段時設定,TCP在200ms後發送確認響應,如果在200ms內,有數據要在該連接上發送,延時ACK響應就可以隨數據一起發送回對端,稱爲捎帶確認。
/*
 * Callback for icsk_delack_timer: sends an ACK that was scheduled earlier
 * but has not gone out yet.
 *
 * @data: the socket pointer, passed as an unsigned long.
 *
 * The body runs between bh_lock_sock() and bh_unlock_sock(); every exit path
 * ends with sock_put(sk).
 */
static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
/* Record that the timer was blocked, count it, and retry after TCP_DELACK_MIN. */
icsk->icsk_ack.blocked = 1;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
goto out_unlock;
}
sk_mem_reclaim_partial(sk);
/* Nothing to do for a closed socket or when the ACK timer is not pending. */
if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
/* Fired before the recorded deadline: re-arm for the deadline. */
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
/* The timer event is consumed from here on. */
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
/* Drain every queued prequeue skb through sk_backlog_rcv() and count the event. */
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
tp->ucopy.memory = 0;
}
/* An ACK is still scheduled: adjust the ATO, then send it now. */
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
/* Doubled, but capped at icsk_rto. */
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
TCP_CHECK_TIMER(sk);
out:
/* Full reclaim only while tcp_memory_pressure is set. */
if (tcp_memory_pressure)
sk_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
}
持續定時器
持續定時器在對端通告接收窗口爲0,阻止TCP繼續發送數據時而設定。由於連接對端發送的窗口通告不可靠(只有數據纔會確認,ACK不會被確認),允許TCP繼續發送數據的後續窗口更新可能丟失,因此,如果TCP有數據要發送,而對端通告接收窗口爲0,則持續定時器啓動,超時後向對端發送1字節的數據,以判斷對端接收窗口是否已經打開。
/*
 * Probe-timer work for @sk; called from tcp_write_timer() when the pending
 * event is ICSK_TIME_PROBE0.
 *
 * While nothing is in flight and there is still data queued to send, either
 * send another probe via tcp_send_probe0() or, once icsk_probes_out exceeds
 * the allowed number of probes, report a write error on the socket.
 */
static void tcp_probe_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int probe_limit;

	/* Probing is pointless with packets in flight or nothing to send. */
	if (tp->packets_out || !tcp_send_head(sk)) {
		icsk->icsk_probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
	 * even if they advertise zero window. Hence, connection is killed only
	 * if we received no ACKs for normal connection timeout. It is not killed
	 * only because window stays zero for some time, window may be zero
	 * until armageddon and even later. We are in full accordance
	 * with RFCs, only probe timer combines both retransmission timeout
	 * and probe timeout in one bottle. --ANK
	 */
	probe_limit = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		/* True while the backed-off RTO is still below TCP_RTO_MAX. */
		const int rto_not_maxed =
			((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);

		/* Orphaned sockets get their limit from tcp_orphan_retries(). */
		probe_limit = tcp_orphan_retries(sk, rto_not_maxed);

		if (tcp_out_of_resources(sk, rto_not_maxed ||
					     icsk->icsk_probes_out <= probe_limit))
			return;
	}

	if (icsk->icsk_probes_out <= probe_limit) {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	} else {
		tcp_write_err(sk);
	}
}
保活定時器
保活定時器在應用進程選取了套接口SO_KEEPALIVE選項時生效。如果連接的連續空閒時間超過2小時,則保活定時器超時,向對端發送連接探測段,強迫對端響應。相應的處理函數爲tcp_keepalive_timer()。
FIN_WAIT_2定時器
當某個連接從FIN_WAIT_1狀態變遷到FIN_WAIT_2狀態,且不能再接收任何新數據時,則意味着應用進程調用了close()而非shutdown(),沒有利用TCP的半關閉功能,FIN_WAIT_2定時器啓動,超時時間爲10min,在定時器第一次超時後,重新設置超時時間爲75s,第二次超時後關閉連接。加入這個定時器的目的是爲了避免對端一直不發FIN,某個連接會永遠滯留在FIN_WAIT_2狀態。處理函數爲tcp_keepalive_timer()。