syn-proxy源碼分析(2)

syn-proxy logic2

主要的調用關係如下:

ip_vs_in() -->

       conn_schedule() ==> tcp_conn_schedule()

                                                --> ip_vs_synproxy_ack_rcv()

syn-proxy logic2 依賴 NF_INET_PRE_ROUTING 鏈上的 ip_vs_in() hook 函數:該函數檢查當前 ack 報文是否攜帶合法的 syn-cookie,以此判斷它是否爲正常的三次握手 ack 報文;如果是,則向 rs 發送 SYN 報文來發起連接。ip_vs_in() 源碼(節選,函數後半部分未列出)如下:

/*
 * Check if it's for virtual services, look it up,
 * and send it on its way...
 *
 * Inbound IPVS netfilter hook. In the part shown here it:
 *   - accepts anything that is not addressed to this host,
 *   - hands ICMP / ICMPv6 packets to the ICMP helpers,
 *   - looks up an existing connection and, if none is found, asks the
 *     protocol's conn_schedule() callback to create one,
 *   - drops the packet if the chosen destination is not available.
 *
 * Returns a netfilter verdict (NF_ACCEPT, NF_DROP, or whatever the
 * helpers / conn_schedule() callback produced).
 *
 * NOTE(review): this listing is an excerpt - the function body continues
 * past the last line shown, which is presumably where ret, restart and
 * pkts are used.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn) (struct sk_buff *))
{
      struct ip_vs_iphdr iph; 
      struct ip_vs_protocol *pp; 
      struct ip_vs_conn *cp; 
      int ret, restart, af, pkts;
      int v = NF_DROP; /* for FULLNAT */
      int res_dir; /* for FULLNAT */

      af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;

      ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

      /*
       * Big tappo: only PACKET_HOST, including loopback for local client
       * Don't handle local packets on IPv6 for now
       */

       /*
        * pkt_type can take the following values:
        *   PACKET_HOST:      the packet is addressed to this host
        *   PACKET_BROADCAST: broadcast packet
        *   PACKET_MULTICAST: multicast packet
        *   PACKET_OTHERHOST: the packet is addressed to another host; it is
        *                     discarded unless this host is configured to
        *                     forward
        * ip_vs only cares about packets addressed to this host; everything
        * else is accepted here and left to the rest of the stack.
        */
       if (unlikely(skb->pkt_type != PACKET_HOST)) {
           IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
               skb->pkt_type,
               iph.protocol,
               IP_VS_DBG_ADDR(af, &iph.daddr));
           return NF_ACCEPT;
}

#ifdef CONFIG_IP_VS_IPV6
     if (af == AF_INET6) {
         if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
             int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

             /* ICMPv6 related to a tracked flow: the helper's verdict is final */
             if (related)
                 return verdict;
             /* refresh iph after the helper has run */
             ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
         } 
    } else

#endif

   /*
    * Handle ICMP packets. According to the original annotation (not
    * verifiable from this listing) ip_vs_in_icmp():
    *   1. reassembles IP fragments, and
    *   2. processes ICMP types DEST_UNREACH, SOURCE_QUENCH and
    *      TIME_EXCEEDED, leaving other ICMP types to the protocol stack.
    * With CONFIG_IP_VS_IPV6 this "if" is the else-branch of the AF_INET6
    * test above.
    */

   if (unlikely(iph.protocol == IPPROTO_ICMP)) {
         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);

       if (related)
           return verdict;
       /* refresh iph after the helper has run */
       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
   }

   /* Protocol supported? */
   pp = ip_vs_proto_get(iph.protocol);
   if (unlikely(!pp))
       return NF_ACCEPT;

   /*
    * Check if the packet belongs to an existing connection entry
    */

 /*
  * For a flow in syn-proxy step 2 (logic2) no connection has been created
  * for it yet, so the lookup misses and the "create a new connection"
  * branch below is taken.
  */
   cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0, &res_dir);

   if (likely(cp)) {
   /* For full-nat/local-client packets, it could be a response */
       if (res_dir == IP_VS_CIDX_F_IN2OUT) {
           return handle_response(af, skb, pp, cp, iph.len);
       }
   } else {
   /* create a new connection */
   /* NOTE: this v shadows the function-scope v declared above */
   int v;
/*
 * The main syn-proxy step 2 logic lives in
 * tcp_conn_schedule() -> ip_vs_synproxy_ack_rcv().
 * A zero return from conn_schedule() means "stop here and return v".
 */
   if (!pp->conn_schedule(af, skb, pp, &v, &cp))
       return v;
   }

   if (unlikely(!cp)) {
       /* sorry, all this trouble for a no-hit :) */
       IP_VS_DBG_PKT(12, pp, skb, 0,
           "packet continues traversal as normal");
       return NF_ACCEPT;
   }

   IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

   /* Check the server status */

   if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
      /* the destination server is not available */

     if (sysctl_ip_vs_expire_nodest_conn) {
         /* try to expire the connection immediately */
         ip_vs_conn_expire_now(cp);
   }
   /* don't restart its timer, and silently
      drop the packet. */
   __ip_vs_conn_put(cp);
   return NF_DROP;

}

tcp_conn_schedule()函數源碼如下:

static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 
                                int *verdict, struct ip_vs_conn **cpp)
{ 
      struct ip_vs_service *svc;
      struct tcphdr _tcph, *th; 
      struct ip_vs_iphdr iph; 

      ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

      th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
      if (th == NULL) {
          *verdict = NF_DROP;
          return 0;
      } 

      /* 
       * Syn-proxy step 2 logic: receive client's
       * 3-handshake Ack packet
       */
       if (ip_vs_synproxy_ack_rcv(af, skb, th, pp, cpp, &iph, verdict) == 0) { 
           return 0;
       } 

       if (th->syn && !th->ack && !th->fin && !th->rst &&
           (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
                      th->dest))) {
            if (ip_vs_todrop()) {
                  /* 
                   * It seems that we are very loaded.
                   * We have to drop this packet :(
                   */ 
                   ip_vs_service_put(svc);
                   *verdict = NF_DROP;
                    return 0;  

            } 

            /* 
             * Let the virtual server select a real server for the
             * incoming connection, and create a connection entry.
             */
             *cpp = ip_vs_schedule(svc, skb, 0);

             if (!*cpp) {
                  *verdict = ip_vs_leave(svc, skb, pp);
                  return 0;
             }
             ip_vs_service_put(svc);
             return 1;
         }

         /* drop tcp packet which send to vip and !vport */
         if (sysctl_ip_vs_tcp_drop_entry &&
            (svc = ip_vs_lookup_vip(af, iph.protocol, &iph.daddr))) {
                 IP_VS_INC_ESTATS(ip_vs_esmib, DEFENCE_TCP_DROP);
                 *verdict = NF_DROP;
                 return 0;
        }

       return 1;
}

ip_vs_synproxy_ack_rcv()源碼如下:

/* 
* Syn-proxy step 2 logic
* Receive client's 3-handshakes Ack packet, do cookie check
* and then send syn to rs after creating a session.
* 
*/ 
int 
ip_vs_synproxy_ack_rcv(int af, struct sk_buff *skb, struct tcphdr *th, 
                           struct ip_vs_protocol *pp, struct ip_vs_conn **cpp,
                           struct ip_vs_iphdr *iph, int *verdict)
{ 
      struct ip_vs_synproxy_opt opt; 
      struct ip_vs_service *svc;
      int res_cookie_check;

     /* 
      * Don't check svc syn-proxy flag, as it may
      * be changed after syn-proxy step 1.
      */

/* 判斷是否爲ack包,並且能夠根據請求的dst ip及port拿到對應的svc結構體 */
      if (!th->syn && th->ack && !th->rst && !th->fin &&
           (svc =
               ip_vs_service_get(af, skb->mark, iph->protocol, &iph->daddr,
                                       th->dest))) {

/* 當前load太高,需要丟棄該數據包 */
          if (ip_vs_todrop()) {
               /* 
                * It seems that we are very loaded.
                * We have to drop this packet :(
                */
                ip_vs_service_put(svc);
                *verdict = NF_DROP;
                return 0;
          } 
 /* 在開啓synproxy_defer時,ack包中必須要存在payload */
         if (sysctl_ip_vs_synproxy_defer &&
               !syn_proxy_ack_has_data(skb, iph, th)) {
               /* update statistics */
              IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_NULL_ACK);
               /*
                * When expecting ack packet with payload,
                * we get a pure ack, so have to drop it.
                */
               ip_vs_service_put(svc);

               *verdict = NF_DROP;
                return 0;
          } 

          /* 
           * Import: set tcp hdr before cookie check, as it
           * will be used in cookie_check funcs.
           */ 
          skb_set_transport_header(skb, iph→len);

#ifdef CONFIG_IP_VS_IPV6
         if (af == AF_INET6) {
              res_cookie_check = ip_vs_synproxy_v6_cookie_check(skb,
                                                                     ntohl
                                                                     (th->
                                                                      ack_seq)
                                                                      - 1,
                                                                     &opt);
         } else
#endif 
        { 

              /* ip_vs_synproxy_v4_cookie_check()

               * 1.使用check_tcp_syn_cookie()來校驗該ack包是否合法:

               *    以下爲check_tcp_syn_cookie()源碼:

               *    static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
               *                                                             __be16 sport, __be16 dport, __u32 sseq,
               *                                                            __u32 count, __u32 maxdiff)
               *  {
               *            __u32 diff;
               *
               *          /* Strip away the layers from the cookie */

               *          /* 這裏的cookie就是client ack包的ack seq -1(即lvs發送給client的syn-ack報文的seq,該seq存儲了client syn包的各種信息,以此來校驗是否爲合法的三次握手報文) */
               *          cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;  // 從cookie中拿掉根據這個tcp流相關信息拿到的sha1 hash值1和syn包的seq值

               *          /* 在synproxy logic1中,我們知道syn-cookie中的高8位爲系統的開機分鐘數,低24位由地址、端口、開機分鐘數計算出的sha1 hash值2(32bit)和根據tcp option拼接成

               *           * 的data計算得出,而我們在判斷本次收到的包是否爲正常的建連請求時,只需對比這時的cookie高8位與當前系統開機分鐘數的差值,即syn包與ack包到達間隔的最大值

               *           * 是否滿足系統的設定值即可。若滿足,則從cookie中去除sha1 hash值2並返回(此時的返回值中僅包含低22位即client syn包的tcp option),否則返回(__u32)-1

               *           */
               *         /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
               *         diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
               *         if (diff >= maxdiff)
               *              return (__u32)-1;
               *
               *         return (cookie -
               *                  cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) 
               *                  & COOKIEMASK; /* Leaving the data behind */
               *   }

               * 2. 根據ack包的tcp option更改opt的值

               */
              res_cookie_check = ip_vs_synproxy_v4_cookie_check(skb,
                                                                     ntohl
                                                                     (th->
                                                                      ack_seq)
                                                                      - 1,
                                                                     &opt);
        } 

        if (!res_cookie_check) {

              /* cookie不可用,丟棄 */
              /* update statistics */
              IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_BAD_ACK);
              /* 
               * Cookie check fail, drop it.
               */
              IP_VS_DBG(6, "syn_cookie check failed seq=%u\n",
                        ntohl(th->ack_seq) - 1);
              ip_vs_service_put(svc);
              *verdict = NF_DROP;
              return 0;
        } 

        /* update statistics */

        IP_VS_INC_ESTATS(ip_vs_esmib, SYNPROXY_OK_ACK);

        /* 此時判斷爲正常的連接請求,開始分配相關資源 */

        /*
         * Let the virtual server select a real server for the
         * incoming connection, and create a connection entry.
         */
         *cpp = ip_vs_schedule(svc, skb, 1);
         if (!*cpp) {
              IP_VS_DBG(6, "ip_vs_schedule failed\n");
              *verdict = ip_vs_leave(svc, skb, pp);
              return 0;
         } 

        /*
         * Release service, we don't need it any more.
         */
         ip_vs_service_put(svc);

        /*
         * Do anything but print a error msg when fail.
         * Because session will be correctly freed in ip_vs_conn_expire.
         */

         /* 向rs發送syn包開始三次握手 */
         if (!syn_proxy_send_rs_syn(af, th, *cpp, skb, pp, &opt)) {
                IP_VS_ERR_RL("syn_proxy_send_rs_syn failed!\n");
        }

        /* count in the ack packet (STOLEN by synproxy) */
        ip_vs_in_stats(*cpp, skb);

        /*
         * Active sesion timer, and dec refcnt. 
         * Also stole the skb, and let caller return immediately.
         */
         ip_vs_conn_put(*cpp);
         *verdict = NF_STOLEN;
          return 0;
     } 

     return 1;
}

syn_proxy_send_rs_syn()源碼如下:

/*
 * Create syn packet and send it to rs.
 * ATTENTION: we also store syn skb in cp if syn retransmission
 * is turned on.
 *
 * Parameters:
 *   af  - address family (AF_INET or AF_INET6)
 *   th  - TCP header of the client's ACK; ports and sequence number are
 *         taken from it
 *   cp  - connection the SYN belongs to; its packet_xmit callback sends it
 *   skb - the client's ACK packet (source of IP addresses, TOS and,
 *         for fast xmit, the device and MAC addresses)
 *   pp  - protocol handler passed to cp->packet_xmit()
 *   opt - TCP options to put into the SYN
 *
 * Returns 0 if cp has no packet_xmit callback or the skb allocation
 * fails, 1 otherwise (the result of packet_xmit itself is not checked).
 */
static int
syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
                      struct ip_vs_conn *cp, struct sk_buff *skb,
                      struct ip_vs_protocol *pp, struct ip_vs_synproxy_opt *opt)
{
        struct sk_buff *syn_skb;
        int tcp_hdr_size;
        __u8 tcp_flags = TCPCB_FLAG_SYN;
        unsigned int tcphoff;
        struct tcphdr *new_th;

        if (!cp->packet_xmit) {
                IP_VS_ERR_RL("warning: packet_xmit is null");
                return 0;
        }

        syn_skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
        if (unlikely(syn_skb == NULL)) {
                IP_VS_ERR_RL("alloc skb failed when send rs syn packet\n");
                return 0;
        }

        /* Reserve space for headers */
        skb_reserve(syn_skb, MAX_TCP_HEADER);
        tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                        (opt->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                        (opt->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                        /* SACK_PERM is in the place of NOP NOP of TS */
                        ((opt->sack_ok
                          && !opt->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

        new_th = (struct tcphdr *)skb_push(syn_skb, tcp_hdr_size);
        /* Compose tcp header */
        skb_reset_transport_header(syn_skb);
        syn_skb->csum = 0;

        /* Set tcp hdr */
        new_th->source = th->source;
        new_th->dest = th->dest;
        /*
         * One less than the ACK's sequence number - presumably the
         * sequence number of the client's original SYN.
         */
        new_th->seq = htonl(ntohl(th->seq) - 1);
        new_th->ack_seq = 0;
        /*
         * Write the 16-bit word at byte offset 12 of the TCP header:
         * header length in 32-bit words in the top 4 bits, flags below.
         */
        *(((__u16 *) new_th) + 6) =
            htons(((tcp_hdr_size >> 2) << 12) | tcp_flags);
        /* FIX_ME: what window should we use */
        new_th->window = htons(5000);
        new_th->check = 0;
        new_th->urg_ptr = 0;
        new_th->urg = 0;
        new_th->ece = 0;
        new_th->cwr = 0;

        syn_proxy_syn_build_options((__be32 *) (new_th + 1), opt);

        /*
         * Set ip hdr
         * Attention: set source and dest addr to ack skb's.
         * we rely on packet_xmit func to do NATs thing.
         */
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                struct ipv6hdr *ack_iph = ipv6_hdr(skb);
                struct ipv6hdr *iph =
                    (struct ipv6hdr *)skb_push(syn_skb, sizeof(struct ipv6hdr));

                tcphoff = sizeof(struct ipv6hdr);
                skb_reset_network_header(syn_skb);
                memcpy(&iph->saddr, &ack_iph->saddr, sizeof(struct in6_addr));
                memcpy(&iph->daddr, &ack_iph->daddr, sizeof(struct in6_addr));

                /*
                 * NOTE(review): priority and flow_lbl are not written in
                 * this function - confirm they are initialised elsewhere.
                 */
                iph->version = 6;
                iph->nexthdr = NEXTHDR_TCP;
                iph->payload_len = htons(tcp_hdr_size);
                iph->hop_limit = IPV6_DEFAULT_HOPLIMIT;

                /* TCP checksum over the segment plus IPv6 pseudo-header */
                new_th->check = 0;
                syn_skb->csum =
                    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
                new_th->check =
                    csum_ipv6_magic(&iph->saddr, &iph->daddr,
                                    syn_skb->len - tcphoff, IPPROTO_TCP,
                                    syn_skb->csum);
        } else
#endif
        {
                struct iphdr *ack_iph = ip_hdr(skb);
                u32 rtos = RT_TOS(ack_iph->tos);
                struct iphdr *iph =
                    (struct iphdr *)skb_push(syn_skb, sizeof(struct iphdr));

                tcphoff = sizeof(struct iphdr);
                skb_reset_network_header(syn_skb);
                /* First 16 bits: version 4, header length 5 words, TOS */
                *((__u16 *) iph) = htons((4 << 12) | (5 << 8) | (rtos & 0xff));
                iph->tot_len = htons(syn_skb->len);
                iph->frag_off = htons(IP_DF);
                /* FIX_ME: what ttl should we use */
                iph->ttl = IPDEFTTL;
                iph->protocol = IPPROTO_TCP;
                iph->saddr = ack_iph->saddr;
                iph->daddr = ack_iph->daddr;
                /*
                 * NOTE(review): iph->id is not written in this function -
                 * confirm it is initialised elsewhere.
                 */

                ip_send_check(iph);

                /* TCP checksum over the segment plus IPv4 pseudo-header */
                new_th->check = 0;
                syn_skb->csum =
                    skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
                new_th->check =
                    csum_tcpudp_magic(iph->saddr, iph->daddr,
                                      syn_skb->len - tcphoff, IPPROTO_TCP,
                                      syn_skb->csum);
        }

        /* Save syn_skb if syn retransmission is on */
        if (sysctl_ip_vs_synproxy_syn_retry > 0) {
                /*
                 * NOTE(review): the skb_copy() result is stored unchecked;
                 * confirm that users of cp->syn_skb tolerate NULL.
                 */
                cp->syn_skb = skb_copy(syn_skb, GFP_ATOMIC);
                atomic_set(&cp->syn_retry_max, sysctl_ip_vs_synproxy_syn_retry);
        }

        /* Save info for fast_response_xmit */
        if (sysctl_ip_vs_fast_xmit && skb->dev &&
            likely(skb->dev->type == ARPHRD_ETHER) &&
            skb_mac_header_was_set(skb)) {
                struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

                /* First time: remember the ingress device and hold it. */
                if (likely(cp->indev == NULL)) {
                        cp->indev = skb->dev;
                        dev_hold(cp->indev);
                }

                /* Ingress device changed: swap the held reference. */
                if (unlikely(cp->indev != skb->dev)) {
                        dev_put(cp->indev);
                        cp->indev = skb->dev;
                        dev_hold(cp->indev);
                }

                memcpy(cp->src_hwaddr, eth->h_source, ETH_ALEN);
                memcpy(cp->dst_hwaddr, eth->h_dest, ETH_ALEN);
                IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE);
                IP_VS_DBG_RL("syn_proxy_send_rs_syn netdevice:%s\n",
                             netdev_name(skb->dev));
        }

        /* count in the syn packet */
        ip_vs_in_stats(cp, skb);

        /* If xmit failed, syn_skb will be freed correctly. */
        cp->packet_xmit(syn_skb, cp, pp);

        return 1;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章