ip_queue的實現分析

本文檔的Copyleft歸yfydz所有，使用GPL發佈，可以自由拷貝，轉載，轉載時請保持文檔的完整性，嚴禁用於任何商業用途。
msn: [email protected]
來源：http://yfydz.cublog.cn

1. 前言

ip_queue是netfilter提供的將網絡數據包從內核傳遞到用戶空間的方法，內核中要提供ip_queue支持，在用戶層空間打開一個netlink的socket後就可以接收內核通過ip_queue所傳遞來的網絡數據包，具體數據包類型可由iptables命令來確定，只要將規則動作設置爲“-j QUEUE”即可。

之所以要命名爲ip_queue，是因爲這是一個隊列處理過程，iptables規則把指定的包發給QUEUE是一個數據進入隊列的過程，而用戶空間程序通過netlink socket獲取數據包進行裁定，結果返回內核，進行出隊列的操作。

在iptables代碼中，提供了libipq庫，封裝了對ipq的一些操作，用戶層程序可以直接使用libipq庫函數處理數據。

2. 用戶層接口：libipq

libipq主要是在iptables-<version>/libipq/libipq.c中實現，提供了以下函數：

// Create an ipq handle
struct ipq_handle *ipq_create_handle(u_int32_t flags, u_int32_t protocol);

// Release an ipq handle
int ipq_destroy_handle(struct ipq_handle *h);

// Read data into buf
ssize_t ipq_read(const struct ipq_handle *h,
unsigned char *buf, size_t len, int timeout);

// Set the ipq copy mode
int ipq_set_mode(const struct ipq_handle *h, u_int8_t mode, size_t len);

// Parse the packet structure out of buf
ipq_packet_msg_t *ipq_get_packet(const unsigned char *buf);

// Return the type of the message held in buf
int ipq_message_type(const unsigned char *buf);

// Set the verdict for a packet
int ipq_set_verdict(const struct ipq_handle *h,
                    ipq_id_t id,
                    unsigned int verdict,
                    size_t data_len,
                    unsigned char *buf);

有了libipq，用戶層程序就很簡單了，libipq.3中提供了一個實例，比較簡單，只列出，不再贅述。

/*
* This code is GPL.
*/
#include <linux/netfilter.h>
#include <libipq.h>
#include <stdio.h>
#include <stdlib.h>

#define BUFSIZE 2048

/*
 * Fatal-error exit path for the example: calls ipq_perror() with the
 * "passer" prefix, destroys the handle it was given, and ends the
 * process with exit status 1.  Does not return.
 */
static void die(struct ipq_handle *h)
{
    ipq_perror("passer");
    ipq_destroy_handle(h);
    exit(1);
}

/*
 * Minimal libipq client: read every message delivered through ip_queue
 * and answer each queued packet with NF_ACCEPT.
 *
 * Creates a PF_INET ipq handle, selects IPQ_COPY_PACKET mode with a range
 * of BUFSIZE bytes, then loops forever reading messages into buf.  Any
 * libipq call that reports failure ends the process through die().
 * argc and argv are not used.
 */
int main(int argc, char **argv)
{
    int status;
    unsigned char buf[BUFSIZE];
    struct ipq_handle *h;

    h = ipq_create_handle(0, PF_INET);
    if (!h)
        die(h);

    status = ipq_set_mode(h, IPQ_COPY_PACKET, BUFSIZE);
    if (status < 0)
        die(h);

    do {
        status = ipq_read(h, buf, BUFSIZE, 0);
        if (status < 0)
            die(h);

        switch (ipq_message_type(buf)) {
        case NLMSG_ERROR:
            /* Terminate the diagnostic with a real newline. */
            fprintf(stderr, "Received error message %d\n",
                    ipq_get_msgerr(buf));
            break;

        case IPQM_PACKET: {
            ipq_packet_msg_t *m = ipq_get_packet(buf);

            /* Accept the packet unchanged: no replacement payload. */
            status = ipq_set_verdict(h, m->packet_id,
                                     NF_ACCEPT, 0, NULL);
            if (status < 0)
                die(h);
            break;
        }

        default:
            fprintf(stderr, "Unknown message type!\n");
            break;
        }
    } while (1);

    /* Not reached: the loop above is only left through die(). */
    ipq_destroy_handle(h);
    return 0;
}

3. 內核：數據包進入隊列

以下內核代碼版本爲2.4.26。

在net/core/netfilter.c中的對於要進行動作NF_QUEUE的數據處理流程爲：

nf_hook_slow() -> nf_queue() -> queue_handler[pf].outfn()

如果ip_queue模塊有效，這個queue_handler[pf].outfn函數實際上是對應ipq_enqueue_packet()函數(net/ipv4/netfilter/ip_queue.c)，這是通過下面的函數進行登記的：

/* net/ipv4/netfilter/ip_queue.c */
...
status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
...

ipq_enqueue_packet()函數：

/*
 * Queue handler registered for PF_INET: pass a packet to the userspace
 * peer over the ip_queue netlink socket and keep an entry for it until
 * userspace sends back its verdict.
 *
 * skb  - the packet being queued
 * info - netfilter hook information, stored in the entry next to skb
 * data - not used in this function
 *
 * Returns -EAGAIN when copy_mode is still IPQ_COPY_NONE, -ENOMEM when the
 * queue entry cannot be allocated, and otherwise whatever `status` holds
 * when the function leaves (the result of __ipq_enqueue_entry() on the
 * success path).  Every failure after the allocation frees the entry.
 */
static int
ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
{
    int status = -EINVAL;
    struct sk_buff *nskb;
    struct ipq_queue_entry *entry;

    /*
     * copy_mode is a global; IPQ_COPY_NONE means it has not been
     * initialised yet and the packet is dropped.  It is normally set to
     * IPQ_COPY_META (copy only the META information to userspace) or
     * IPQ_COPY_PACKET (copy everything to userspace).
     */
    if (copy_mode == IPQ_COPY_NONE)
        return -EAGAIN;

    /* Record the packet's details, including its routing information. */
    entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
    if (entry == NULL) {
        printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
        return -ENOMEM;
    }

    entry->info = info;
    entry->skb = skb;

    if (entry->info->hook == NF_IP_LOCAL_OUT) {
        /*
         * Queued at the OUTPUT hook: remember the routing-related
         * fields (TOS, source and destination IP).
         */
        struct iphdr *iph = skb->nh.iph;

        entry->rt_info.tos = iph->tos;
        entry->rt_info.daddr = iph->daddr;
        entry->rt_info.saddr = iph->saddr;
    }

    /*
     * Build a new skb carrying the information about this entry.  Its
     * payload is the data structure handed to userspace, i.e. what libipq
     * reads.  In IPQ_COPY_META mode it holds only the ipq header; in
     * IPQ_COPY_PACKET mode the IP data of the whole skb is copied after
     * the ipq header.
     */
    nskb = ipq_build_packet_message(entry, &status);
    if (nskb == NULL)
        goto err_out_free;

    write_lock_bh(&queue_lock);

    if (!peer_pid)
        goto err_out_free_nskb;

    /*
     * netlink_unicast will either free the nskb or attach it to a socket.
     * ipqnl is the netlink sock belonging to ip_queue and peer_pid is the
     * pid of the userspace program; on failure the packet is dropped.
     */
    status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
    if (status < 0)
        goto err_out_unlock;

    /*
     * Put the entry on the QUEUE list to wait for the userspace verdict;
     * the packet is dropped if the queue is full.
     */
    status = __ipq_enqueue_entry(entry);
    if (status < 0)
        goto err_out_unlock;

    write_unlock_bh(&queue_lock);
    return status;

err_out_free_nskb:
    kfree_skb(nskb);

err_out_unlock:
    write_unlock_bh(&queue_lock);

err_out_free:
    kfree(entry);
    return status;
}

傳遞給用戶層的數據以netlink消息頭(struct nlmsghdr)開頭，ipq的META信息緊隨其後，消息頭的結構如下:

/* include/linux/netlink.h */
/* Header placed at the front of each netlink message. */
struct nlmsghdr
{
__u32  nlmsg_len; /* Length of message including header */
__u16  nlmsg_type; /* Message content */
__u16  nlmsg_flags; /* Additional flags */
__u32  nlmsg_seq; /* Sequence number */
__u32  nlmsg_pid; /* Sending process PID */
};

一旦數據進入了netlink sock的輸入隊列中，用戶層對數據的讀取就由netlink sock來處理了，ip_queue就不再管了，ip_queue只需要處理從用戶層發來的數據，從用戶層看是對netlink socket的寫，從內核的ip_queue看則是對用戶層數據的讀取過程。

4. 內核：讀取用戶層數據

ip_queue要讀取netlink socket中返回的處理數據結果，函數流程爲：

ipq_rcv_sk()
   |
   V
ipq_rcv_skb()
   |
   V
ipq_receive_peer()
   |
   |------------------------------+
   V                              V
ipq_set_verdict()             ipq_set_mode()
   |                              |
   V                              V
ipq_find_dequeue_entry()      __ipq_set_mode()
ipq_issue_verdict()
   |
   V
nf_reinject()

在模塊初始化時建立netlink sock：

ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);

其接收數據函數爲ipq_rcv_sk()：

/*
 * Receive callback passed to netlink_kernel_create() for the ip_queue
 * netlink sock: drains sk's receive queue, handing every skb to
 * ipq_rcv_skb() and then freeing it.
 *
 * sk  - the sock whose receive_queue is drained
 * len - not used in this function
 */
static void
ipq_rcv_sk(struct sock *sk, int len)
{
do {
struct sk_buff *skb;

  /* Give up at once if down_trylock() reports it could not take ipqnl_sem. */
  if (down_trylock(&ipqnl_sem))
   return;
  /* Take skbs off the sock's receive queue one at a time. */
  while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
   /*
    * Process the skb contents.  The data layout is the same as for skbs
    * sent to ipq: the ipq control header (the META part) comes first and
    * the actual skb data follows it.
    */
   ipq_rcv_skb(skb);
   /*
    * Discard the skb; it is not a normal network skb but one used only
    * for ipq communication.
    */
   kfree_skb(skb);
  }

  up(&ipqnl_sem);

/* Go round again while ipqnl is set and its receive queue is not empty. */
} while (ipqnl && ipqnl->receive_queue.qlen);
}

ipq_rcv_skb()函數本身都是在爲ipq_receive_peer()函數作準備，忽略；

ipq_receive_peer函數：

/*
 * Dispatch one message received from the userspace peer.
 *
 * pmsg - the peer message
 * type - message type; IPQM_MODE and IPQM_VERDICT are handled
 * len  - length associated with pmsg; must be at least sizeof(*pmsg)
 *
 * Returns -EINVAL for a short message, an unknown type, or a verdict
 * value above NF_MAX_VERDICT; otherwise the result of ipq_set_mode() or
 * ipq_set_verdict().
 */
static int
ipq_receive_peer(struct ipq_peer_msg *pmsg,
                 unsigned char type, unsigned int len)
{
    if (len < sizeof(*pmsg))
        return -EINVAL;

    /* Set the IPQ copy mode: IPQ_COPY_META or IPQ_COPY_PACKET. */
    if (type == IPQM_MODE)
        return ipq_set_mode(pmsg->msg.mode.value,
                            pmsg->msg.mode.range);

    /* Handle the verdict for a packet. */
    if (type == IPQM_VERDICT) {
        if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
            return -EINVAL;

        /* Whatever follows the fixed-size message is passed on as length. */
        return ipq_set_verdict(&pmsg->msg.verdict,
                               len - sizeof(*pmsg));
    }

    return -EINVAL;
}

ipq_set_verdict()函數：

/*
 * Apply a verdict sent by userspace to a previously queued packet.
 *
 * vmsg - verdict message: packet id, verdict value and optional data
 * len  - length of the data accompanying the verdict message
 *
 * Returns -EINVAL when the verdict value exceeds NF_MAX_VERDICT, -ENOENT
 * when no queued entry matches vmsg->id, and 0 once the verdict has been
 * issued.
 */
static int
ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
{
    struct ipq_queue_entry *entry;
    int verdict;

    if (vmsg->value > NF_MAX_VERDICT)
        return -EINVAL;

    /*
     * Use the packet id to find and dequeue the ipq_queue_entry that was
     * put on the QUEUE list earlier; that entry holds the address of the
     * original skb.
     */
    entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
    if (entry == NULL)
        return -ENOENT;

    verdict = vmsg->value;

    /*
     * If userspace modified the data, replace the original contents of
     * the skb with the modified version; a failed replacement turns the
     * verdict into NF_DROP.
     */
    if (vmsg->data_len && vmsg->data_len == len) {
        if (ipq_mangle_ipv4(vmsg, entry) < 0)
            verdict = NF_DROP;
    }

    /* Issue the final verdict. */
    ipq_issue_verdict(entry, verdict);
    return 0;
}

ipq_issue_verdict()函數：

/*
 * Return a queued packet to netfilter with the given verdict and free its
 * queue entry.
 *
 * entry   - queue entry holding the skb and its hook information
 * verdict - verdict passed on to nf_reinject()
 */
static void
ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
{
    /*
     * Every QUEUEd packet goes back to netfilter through nf_reinject(),
     * which is defined in net/core/netfilter.c.
     */
    nf_reinject(entry->skb, entry->info, verdict);
    kfree(entry);
}

5. 結論

ip_queue工具的提供使得很多在內核裏不太容易實現的功能可以放到用戶層空間內實現，處理安全性高，畢竟內核中的錯誤會導致系統崩潰，而用戶層程序的出錯不影響系統的整體運行，當然這是以性能降低爲代價的。

ip_queue隊列實現是使用queue_handler的，queue_handler對於每個協議族只支持一個隊列，所以如果有兩個需要使用queue功能的應用就會發生衝突，如實現QoS的IMQ也使用這個隊列，因此兩者在內核中是不能共存的。

ip_queue的實現分析

985 碩士程序員，空窗 4 個月沒有 Offer！

一文搞懂 Spring 循環依賴

賽博鬥地主——使用大語言模型扮演Agent智能體玩牌類遊戲。

VScode右鍵打開(添加到右鍵)

記一次 .NET某工控視覺自動化系統卡死分析

紅帽子企業版(RedHat Enterprise)各版本內核版本

typedef和#define的定義與區別

linux 內核進程與用戶進程的通信方法一使用sockopt與內核交換數據

架設 DNS 所需要的套件

Linux SNMP OID’s for CPU,Memory and Disk Statistics

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結