syn-proxy logic2
主要的調用關係如下:
ip_vs_in() ->
conn_schedule() ==> tcp_conn_schedule() -> ip_vs_synproxy_ack_rcv()
依賴NF_INET_PRE_ROUTING鏈上的ip_vs_in()hook函數,該hook函數用來確認當前ack報文是否存在對應的syn-cookie來判斷是否爲正常的ack報文,如果爲正常的ack報文則向rs發送SYN報文來發起連接。ip_vs_in()源碼如下:
/*
* Check if it's for virtual services, look it up,
* and send it on its way...
*
* Hook entry point for inbound packets.  The part of the function shown
* here:
*   - accepts (NF_ACCEPT) anything that is not PACKET_HOST;
*   - hands ICMP / ICMPv6 packets to ip_vs_in_icmp*() and returns their
*     verdict when they report the packet as related;
*   - accepts packets whose protocol has no ip_vs_protocol entry;
*   - looks up an existing connection, or asks pp->conn_schedule() to
*     create one (for TCP this is where syn-proxy step 2 runs);
*   - drops the packet when the connection's destination is flagged as
*     unavailable.
*
* NOTE(review): this excerpt stops before the end of the function; the
* remaining body and the closing brace are not shown.
*/
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn) (struct sk_buff *))
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ret, restart, af, pkts;
int v = NF_DROP; /* for FULLNAT */
int res_dir; /* for FULLNAT */
/* Pick the address family from the skb's protocol field. */
af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
/*
* Big tappo: only PACKET_HOST, including loopback for local client
* Don't handle local packets on IPv6 for now
*/
/* pkt_type may take the following values:
* PACKET_HOST: the packet is addressed to this host
* PACKET_BROADCAST: broadcast packet
* PACKET_MULTICAST: multicast packet
* PACKET_OTHERHOST: the packet is addressed to another host; it is dropped if this host is not configured to forward
* ip_vs only cares about packets addressed to this host; everything else is left to the kernel protocol stack
*/
if (unlikely(skb->pkt_type != PACKET_HOST)) {
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
skb->pkt_type,
iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr));
return NF_ACCEPT;
}
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
if (related)
return verdict;
/* Re-read the IP header after the ICMPv6 handler has run. */
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
/* Handle ICMP packets.  According to the original annotation
* (ip_vs_in_icmp() itself is not shown here) this covers:
* 1. reassembly of fragmented IP packets
* 2. handling of ICMP types DEST_UNREACH, SOURCE_QUENCH and TIME_EXCEEDED; other ICMP types are left to the protocol stack
*/
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
/* Re-read the IP header after the ICMP handler has run. */
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
/* Protocol supported? */
pp = ip_vs_proto_get(iph.protocol);
if (unlikely(!pp))
return NF_ACCEPT;
/*
* Check if the packet belongs to an existing connection entry
*/
/* For a flow that is in syn-proxy step 2 no connection has been created yet, so execution goes straight to the "create a new connection" branch below. */
cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0, &res_dir);
if (likely(cp)) {
/* For full-nat/local-client packets, it could be a response */
if (res_dir == IP_VS_CIDX_F_IN2OUT) {
return handle_response(af, skb, pp, cp, iph.len);
}
} else {
/* create a new connection */
/* NOTE(review): this declaration shadows the function-scope v above. */
int v;
/* The core of syn-proxy step 2 lives in tcp_conn_schedule() -> ip_vs_synproxy_ack_rcv(). */
/* A zero return means the scheduler already decided the verdict (v). */
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
/* don't restart its timer, and silently
drop the packet. */
__ip_vs_conn_put(cp);
return NF_DROP;
}
tcp_conn_schedule()函數源碼如下:
/*
 * Decide what to do with a TCP packet that matched no existing connection.
 *
 * @af:      address family of the packet
 * @skb:     the packet
 * @pp:      protocol descriptor, forwarded to the helpers
 * @verdict: out - netfilter verdict, valid only when 0 is returned
 * @cpp:     out - connection entry, set when one is created
 *
 * Returns 0 when the caller must stop and return *verdict, 1 when the
 * caller should carry on (with *cpp set if a connection was scheduled).
 */
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	struct ip_vs_iphdr iph;

	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
	if (!th) {
		/* No TCP header could be obtained from the skb. */
		*verdict = NF_DROP;
		return 0;
	}

	/*
	 * Syn-proxy step 2: the client's handshake ACK.  A zero return
	 * means the helper has already filled in *verdict.
	 */
	if (!ip_vs_synproxy_ack_rcv(af, skb, th, pp, cpp, &iph, verdict))
		return 0;

	/* Only a bare SYN is allowed to look up a service and open a session. */
	svc = NULL;
	if (th->syn && !(th->ack || th->fin || th->rst))
		svc = ip_vs_service_get(af, skb->mark, iph.protocol,
					&iph.daddr, th->dest);

	if (svc) {
		if (ip_vs_todrop()) {
			/* We appear to be heavily loaded: shed this packet. */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/* Have the virtual server pick a real server and create the entry. */
		*cpp = ip_vs_schedule(svc, skb, 0);
		if (*cpp == NULL) {
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}

		ip_vs_service_put(svc);
		return 1;
	}

	/* Optionally drop TCP packets sent to a VIP but not to a virtual port. */
	if (!sysctl_ip_vs_tcp_drop_entry)
		return 1;

	svc = ip_vs_lookup_vip(af, iph.protocol, &iph.daddr);
	if (!svc)
		return 1;

	IP_VS_INC_ESTATS(ip_vs_esmib, DEFENCE_TCP_DROP);
	*verdict = NF_DROP;
	return 0;
}
ip_vs_synproxy_ack_rcv()源碼如下:
/*
* Syn-proxy step 2 logic
* Receive client's 3-handshakes Ack packet, do cookie check
* and then send syn to rs after creating a session.
*
*/
int
ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th,
struct ip_vs_protocol *pp, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph, int *verdict)
{
struct ip_vs_synproxy_opt opt;
struct ip_vs_service *svc;
int res_cookie_check;
/*
* Don't check svc syn-proxy flag, as it may
* be changed after syn-proxy step 1.
*/
/* 判斷是否爲ack包,並且能夠根據請求的dst ip及port拿到對應的svc結構體 */
if (!th->syn && th->ack && !th->rst && !th->fin &&
(svc =
ip_vs_service_get(af, skb->mark, iph->protocol, &iph->daddr,
th->dest))) {
/* 當前load太高,需要丟棄該數據包 */
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/* 在開啓synproxy_defer時,ack包中必須要存在payload */
if (sysctl_ip_vs_synproxy_defer &&
!syn_proxy_ack_has_data(skb, iph, th)) {
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_NULL_ACK);
/*
* When expecting ack packet with payload,
* we get a pure ack, so have to drop it.
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Import: set tcp hdr before cookie check, as it
* will be used in cookie_check funcs.
*/
skb_set_transport_header(skb, iph→len);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb,
ntohl
(th->
ack_seq)
- 1,
&opt);
} else
#endif
{
/* ip_vs_synproxy_v4_cookie_check()
* 1.使用check_tcp_syn_cookie()來校驗該ack包是否合法:
* 以下爲check_tcp_syn_cookie()源碼:
* static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
* __be16 sport, __be16 dport, __u32 sseq,
* __u32 count, __u32 maxdiff)
* {
* __u32 diff;
*
* /* Strip away the layers from the cookie */
* /* 這裏的cookie就是client ack包的ack seq -1(即lvs發送給client的syn-ack報文的seq,該seq存儲了client syn包的各種信息,以此來校驗是否爲合法的三次握手報文) */
* cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; // 從cookie中拿掉根據這個tcp流相關信息拿到的sha1 hash值1和syn包的seq值
* /* 在synproxy logic1中,我們知道syn-cookie中的高8位爲系統的開機分鐘數,低24位由地址、端口、開機分鐘數計算出的sha1 hash值2(32bit)和根據tcp option拼接成
* * 的data計算得出,而我們在判斷本次收到的包是否爲正常的建連請求時,只需對比這時的cookie高8位與當前系統開機分鐘數的差值,即syn包與ack包到達間隔的最大值
* * 是否滿足系統的設定值即可。若滿足,則從cookie中去除sha1 hash值2並返回(此時的返回值中僅包含低22位即client syn包的tcp option),否則返回(__u32)-1
* */
* /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
* diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
* if (diff >= maxdiff)
* return (__u32)-1;
*
* return (cookie -
* cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
* & COOKIEMASK; /* Leaving the data behind */
* }
* 2. 根據ack包的tcp option更改opt的值
*/
res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb,
ntohl
(th->
ack_seq)
- 1,
&opt);
}
if (!res_cookie_check) {
/* cookie不可用,丟棄 */
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_BAD_ACK);
/*
* Cookie check fail, drop it.
*/
IP_VS_DBG(6, "syn_cookie check failed seq=%u\n",
ntohl(th->ack_seq) - 1);
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/* update statistics */
IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_OK_ACK);
/* 此時判斷爲正常的連接請求,開始分配相關資源 */
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, 1);
if (!*cpp) {
IP_VS_DBG(6, "ip_vs_schedule failed\n");
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
/*
* Release service, we don't need it any more.
*/
ip_vs_service_put(svc);
/*
* Do anything but print a error msg when fail.
* Because session will be correctly freed in ip_vs_conn_expire.
*/
/* 向rs發送syn包開始三次握手 */
if (!syn_proxy_send_rs_syn(af, th, *cpp, skb, pp, &opt)) {
IP_VS_ERR_RL("syn_proxy_send_rs_syn failed!\n");
}
/* count in the ack packet (STOLEN by synproxy) */
ip_vs_in_stats(*cpp, skb);
/*
* Active sesion timer, and dec refcnt.
* Also stole the skb, and let caller return immediately.
*/
ip_vs_conn_put(*cpp);
*verdict = NF_STOLEN;
return 0;
}
return 1;
}
syn_proxy_send_rs_syn()源碼如下:
/*
 * Create syn packet and send it to rs.
 * ATTENTION: we also store syn skb in cp if syn retransmission
 * is turned on.
 *
 * @af:  address family of the flow
 * @th:  TCP header of the client's ACK; ports and seq are taken from it
 * @cp:  connection entry; its packet_xmit callback transmits the SYN
 * @skb: the client's ACK packet (source of IP addresses, device and MACs)
 * @pp:  protocol descriptor, passed through to cp->packet_xmit
 * @opt: TCP options to put into the SYN
 *
 * Returns 0 when cp->packet_xmit is NULL or the skb allocation fails,
 * 1 otherwise.  The return value of cp->packet_xmit() is not inspected.
 */
static int
syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
		      struct ip_vs_conn *cp, struct sk_buff *skb,
		      struct ip_vs_protocol *pp, struct ip_vs_synproxy_opt *opt)
{
	struct sk_buff *syn_skb;
	int tcp_hdr_size;
	__u8 tcp_flags = TCPCB_FLAG_SYN;
	unsigned int tcphoff;
	struct tcphdr *new_th;

	if (!cp->packet_xmit) {
		IP_VS_ERR_RL("warning: packet_xmit is null");
		return 0;
	}

	syn_skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
	if (unlikely(syn_skb == NULL)) {
		IP_VS_ERR_RL("alloc skb failed when send rs syn packet\n");
		return 0;
	}

	/* Reserve space for headers */
	skb_reserve(syn_skb, MAX_TCP_HEADER);

	/* Base header + MSS, plus whichever options the client negotiated. */
	tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			(opt->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			(opt->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			/* SACK_PERM is in the place of NOP NOP of TS */
			((opt->sack_ok
			  && !opt->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

	new_th = (struct tcphdr *)skb_push(syn_skb, tcp_hdr_size);

	/* Compose tcp header */
	skb_reset_transport_header(syn_skb);
	syn_skb->csum = 0;

	/* Set tcp hdr */
	new_th->source = th->source;
	new_th->dest = th->dest;
	/* The SYN's seq is one less than the seq carried by the client's ACK. */
	new_th->seq = htonl(ntohl(th->seq) - 1);
	new_th->ack_seq = 0;
	/*
	 * Write the 16-bit word at byte offset 12 of the TCP header in one
	 * go: header length in 32-bit words in the top 4 bits, flags in the
	 * low bits.
	 */
	*(((__u16 *) new_th) + 6) =
	    htons(((tcp_hdr_size >> 2) << 12) | tcp_flags);
	/* FIX_ME: what window should we use */
	new_th->window = htons(5000);
	new_th->check = 0;
	new_th->urg_ptr = 0;
	new_th->urg = 0;
	new_th->ece = 0;
	new_th->cwr = 0;

	/* Options go right after the fixed TCP header. */
	syn_proxy_syn_build_options((__be32 *) (new_th + 1), opt);

	/*
	 * Set ip hdr
	 * Attention: set source and dest addr to ack skb's.
	 * we rely on packet_xmit func to do NATs thing.
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		struct ipv6hdr *ack_iph = ipv6_hdr(skb);
		struct ipv6hdr *iph =
		    (struct ipv6hdr *)skb_push(syn_skb, sizeof(struct ipv6hdr));

		tcphoff = sizeof(struct ipv6hdr);
		skb_reset_network_header(syn_skb);
		memcpy(&iph->saddr, &ack_iph->saddr, sizeof(struct in6_addr));
		memcpy(&iph->daddr, &ack_iph->daddr, sizeof(struct in6_addr));
		/*
		 * NOTE(review): only the fields below are written; the
		 * remaining ipv6hdr fields are left as allocated - confirm
		 * this is intended.
		 */
		iph->version = 6;
		iph->nexthdr = NEXTHDR_TCP;
		iph->payload_len = htons(tcp_hdr_size);
		iph->hop_limit = IPV6_DEFAULT_HOPLIMIT;

		/* TCP checksum over the TCP segment plus the IPv6 pseudo-header. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_ipv6_magic(&iph->saddr, &iph->daddr,
				    syn_skb->len - tcphoff, IPPROTO_TCP,
				    syn_skb->csum);
	} else
#endif
	{
		struct iphdr *ack_iph = ip_hdr(skb);
		u32 rtos = RT_TOS(ack_iph->tos);
		struct iphdr *iph =
		    (struct iphdr *)skb_push(syn_skb, sizeof(struct iphdr));

		tcphoff = sizeof(struct iphdr);
		skb_reset_network_header(syn_skb);
		/* First 16 bits of the IPv4 header: version 4, IHL 5, TOS. */
		*((__u16 *) iph) = htons((4 << 12) | (5 << 8) | (rtos & 0xff));
		iph->tot_len = htons(syn_skb->len);
		iph->frag_off = htons(IP_DF);
		/* FIX_ME: what ttl should we use */
		iph->ttl = IPDEFTTL;
		iph->protocol = IPPROTO_TCP;
		iph->saddr = ack_iph->saddr;
		iph->daddr = ack_iph->daddr;
		/* NOTE(review): iph->id is never assigned in this function - confirm. */
		ip_send_check(iph);

		/* TCP checksum over the TCP segment plus the IPv4 pseudo-header. */
		new_th->check = 0;
		syn_skb->csum =
		    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
		new_th->check =
		    csum_tcpudp_magic(iph->saddr, iph->daddr,
				      syn_skb->len - tcphoff, IPPROTO_TCP,
				      syn_skb->csum);
	}

	/* Save syn_skb if syn retransmission is on */
	if (sysctl_ip_vs_synproxy_syn_retry > 0) {
		/*
		 * NOTE(review): the skb_copy() result is stored unchecked;
		 * confirm that users of cp->syn_skb tolerate NULL.
		 */
		cp->syn_skb = skb_copy(syn_skb, GFP_ATOMIC);
		atomic_set(&cp->syn_retry_max, sysctl_ip_vs_synproxy_syn_retry);
	}

	/* Save info for fast_response_xmit */
	if (sysctl_ip_vs_fast_xmit && skb->dev &&
	    likely(skb->dev->type == ARPHRD_ETHER) &&
	    skb_mac_header_was_set(skb)) {
		struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

		/* First time: remember the ingress device and hold a reference. */
		if (likely(cp->indev == NULL)) {
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		/* Ingress device changed: swap the held reference. */
		if (unlikely(cp->indev != skb->dev)) {
			dev_put(cp->indev);
			cp->indev = skb->dev;
			dev_hold(cp->indev);
		}

		memcpy(cp->src_hwaddr, eth->h_source, ETH_ALEN);
		memcpy(cp->dst_hwaddr, eth->h_dest, ETH_ALEN);
		IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE);
		IP_VS_DBG_RL("syn_proxy_send_rs_syn netdevice:%s\n",
			     netdev_name(skb->dev));
	}

	/* count in the syn packet */
	ip_vs_in_stats(cp, skb);

	/* If xmit failed, syn_skb will be freed correctly. */
	cp->packet_xmit(syn_skb, cp, pp);

	return 1;
}