select/poll/epoll分析：區別與聯繫

IO模型中一個重要的多路處理模型。

背景：

程序需要處理多路IO時，靠阻塞的同步IO或者非阻塞的輪詢都不是太好的選擇。

因爲阻塞IO只能處理單路IO比較有效，而非阻塞的輪詢無論是否有IO到來都會形成開銷。

因此需要一種事件驅動的模型，能對多路IO的就緒狀態進行監聽。類似於硬件中斷驅動機制。

select/poll/epoll便是用於這個目的。

比較：

	特點	問題點
select	用位圖(fd_set)的方式指定監聽的多路IO	有最大監聽數量的限制，默認最大1024(FD_SETSIZE)
poll	用pollfd數組來指定監聽的多路IO（內核中以poll_list鏈表保存）	1.解決了select監聽數量限制 2.用戶需要遍歷所有的IO，才能找到就緒的IO，開銷是O(n). 3.每次拷貝要監聽的IO數據開銷。 4.內核也要遍歷所有的IO，才能找到就緒的IO，開銷是O(n)
epoll	監聽IO集合單獨指定，返回就緒的IO集合	1.去掉了不必要的多次拷貝要監聽的IO數據開銷。 2.解決了遍歷所有的IO的O(n)開銷。有就緒的IO就單獨通過回調函數把自己加入就緒IO集合中。

從上表看起來，一個比一個要好。epoll似乎是最優美的，沒有任何冗餘的操作與不必要的限制。

具體內核代碼分析：

fs/select.c
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
...

for (;;) {

...
    for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {  /* 遍歷所有的IO */
    if (f_op && f_op->poll) {
                        wait_key_set(wait, in, out,
                                 bit, busy_flag);
                        mask = (*f_op->poll)(f.file, wait);
                    }
    }
    if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }
    if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;

}

static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{

...

for (;;) {
... for (walk = list; walk != NULL; walk = walk->next) { /* 遍歷所有的IO */
struct pollfd * pfd, * pfd_end;

           pfd = walk->entries;
           pfd_end = pfd + walk->len;
           for (; pfd != pfd_end; pfd++) {
               if (do_pollfd(pfd, pt, &can_busy_loop,
                   busy_flag)) {
                   count++;
                   pt->_qproc = NULL;
                   /* found something, stop busy polling */
                   busy_flag = 0;
                   can_busy_loop = false;
               }
           }
       }

pt->_qproc = NULL;
       if (!count) {
           count = wait->error;
           if (signal_pending(current))
               count = -EINTR;
       }
       if (count || timed_out)
           break;

       /* only if found POLL_BUSY_LOOP sockets && not out of time */
       if (can_busy_loop && !need_resched()) {
           if (!busy_end) {
               busy_end = busy_loop_end_time();
               continue;
           }
           if (!busy_loop_timeout(busy_end))
               continue;
       }
       busy_flag = 0;
       if (end_time && !to) {
           expire = timespec_to_ktime(*end_time);
           to = &expire;
       }

       if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
           timed_out = 1;
   }

}

fs/eventpoll.c

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)

{

fetch_events:
spin_lock_irqsave(&ep->lock, flags);

if (!ep_events_available(ep)) { //判斷是否有就緒IO

for (;;) {

           set_current_state(TASK_INTERRUPTIBLE);
           if (ep_events_available(ep) || timed_out) //判斷是否有就緒IO
               break;
           if (signal_pending(current)) {
               res = -EINTR;
               break;
           }

           spin_unlock_irqrestore(&ep->lock, flags);
           if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
               timed_out = 1;

spin_lock_irqsave(&ep->lock, flags);
}

}

/*
 * Tell the caller whether this eventpoll instance currently has events
 * pending.
 *
 * Returns non-zero when the ready list (ep->rdllist) is non-empty or when
 * ep->ovflist is not the EP_UNACTIVE_PTR sentinel; returns zero otherwise.
 *
 * NOTE(review): the exact meaning of a non-sentinel ovflist is not visible
 * in this excerpt -- confirm against fs/eventpoll.c before relying on it.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; // ready-IO check: rdllist non-empty, or ovflist is not EP_UNACTIVE_PTR
}

在向epoll添加新的監聽fd（epoll_ctl的ADD操作）時，會將等待隊列項的喚醒回調由默認處理函數替換爲自定義的ep_poll_callback：

/*
 * Hook an epoll item onto a wait queue head, using ep_poll_callback as the
 * wake-up function for the new wait-queue entry.
 *
 * @file:  file being polled (not used in the body shown here)
 * @whead: wait queue head the new entry is added to
 * @pt:    poll table from which the owning epitem is recovered
 *
 * On success the new entry is linked into epi->pwqlist and epi->nwait is
 * incremented. If epi->nwait is already negative, or the allocation from
 * pwq_cache fails, epi->nwait is set to -1 so the failure can be detected.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    /* Only proceed if no earlier error was recorded and the entry can be allocated. */
    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        /* Wake-ups on this entry are routed to ep_poll_callback. */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        /* EPOLLEXCLUSIVE items are queued as exclusive waiters. */
        if (epi->event.events & EPOLLEXCLUSIVE)
            add_wait_queue_exclusive(whead, &pwq->wait);
        else
            add_wait_queue(whead, &pwq->wait);
        /* Track the entry on the item's own list and count it. */
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}

接下來，肯定不會猜錯，會在回調中把就緒IO加入rdllist.

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
...

   /* If this file is already in the ready list we exit soon */
   if (!ep_is_linked(&epi->rdllink)) {
       list_add_tail(&epi->rdllink, &ep->rdllist);
       ep_pm_stay_awake_rcu(epi);
   }

...

}

select/poll幾乎是一樣的，只是接口方式有些不一樣，運行原理上相同，而epoll在運行原理上是不同的。

關於驅動：

驅動實現就比較簡單了，因爲它只是框架的一部分：在poll回調中調用 poll_wait() 把自己的等待隊列登記進去，並返回當前的事件信息，然後在事件發生時調用 wake_up()。

樣例：

/*
 * poll handler for the BT BMC device.
 *
 * Passes the device wait queue (bt_bmc->queue) to poll_wait(), then reads
 * the BT_CTRL register and builds the event mask from it.
 *
 * Returns a mask with:
 *   POLLIN  set when BT_CTRL_H2B_ATN is set in BT_CTRL;
 *   POLLOUT set when neither BT_CTRL_H_BUSY nor BT_CTRL_B2H_ATN is set.
 */
static unsigned int bt_bmc_poll(struct file *file, poll_table *wait)
{
    struct bt_bmc *bt_bmc = file_bt_bmc(file);
    unsigned int mask = 0;
    u8 ctrl;

    /* Hand the wait queue to the poll core before sampling the status. */
    poll_wait(file, &bt_bmc->queue, wait);

    ctrl = bt_inb(bt_bmc, BT_CTRL);

    /* H2B attention flag set -> report readable. */
    if (ctrl & BT_CTRL_H2B_ATN)
        mask |= POLLIN;

    /* Neither H_BUSY nor B2H_ATN set -> report writable. */
    if (!(ctrl & (BT_CTRL_H_BUSY | BT_CTRL_B2H_ATN)))
        mask |= POLLOUT;

    return mask;
}

/*
 * Interrupt handler for the BT BMC device.
 *
 * @irq: interrupt number (not used in the body)
 * @arg: the struct bt_bmc this handler was registered with
 *
 * Reads BT_CR2 and keeps only the BT_CR2_IRQ_H2B and BT_CR2_IRQ_HBUSY bits.
 * Returns IRQ_NONE when neither is set. Otherwise writes the set bits back
 * to BT_CR2, wakes waiters on bt_bmc->queue, and returns IRQ_HANDLED.
 */
static irqreturn_t bt_bmc_irq(int irq, void *arg)
{
    struct bt_bmc *bt_bmc = arg;
    u32 reg;

    reg = ioread32(bt_bmc->base + BT_CR2);
    /* Only the H2B and HBUSY interrupt bits are of interest here. */
    reg &= BT_CR2_IRQ_H2B | BT_CR2_IRQ_HBUSY;
    if (!reg)
        return IRQ_NONE;

    /* ack pending IRQs */
    iowrite32(reg, bt_bmc->base + BT_CR2);

    /* Wake anything sleeping on the queue that bt_bmc_poll passes to poll_wait(). */
    wake_up(&bt_bmc->queue);
    return IRQ_HANDLED;
}

總結：

1.所以說epoll真的是event事件驅動，O(1)的效率。select/poll是遍歷找到事件在哪裏。

2. 但是此處在性能上又有類似中斷與polling的特點。在大量事件產生時，每個事件的回調（類似interrupt）處理流程開銷不會減少，同時對就緒鏈表的操作要加lock，機制上又有開銷，造成單個事件的處理開銷反而比select/poll要高。在就緒事件數量上來後，epoll在性能上不一定比select/poll好。
因此：看情況來選擇，而不是epoll萬能適用。對於少量的多路IO其實都還是可以的，不用太糾結哪個一定好。對於大數量的多路，而事件不太多的情況倒是最適用的。

select/poll/epoll分析：區別與聯繫

一個簡單的MD5加鹽

C# 代碼學習

藍橋15屆stema編程題密碼鎖-動態規劃 C++和Python最後一道題

2021看雪SDC議題回顧 | SaTC：一種全新的物聯網設備漏洞自動化挖掘方法

Kafka存儲機制

aws語音呼叫調用，告警電話

【轉】[C#] WebAPI 防止併發調用二（冥等性）

C#/.NET/.NET Core優秀項目和框架2024年4月簡報

HTTP URL 詳解

得物 ZooKeeper SLA 也可以 99.99%

設計模式：結構小總結

uio(universal input/output)協議設想（3）紫外線（UVA,UVB）強度測量的手機外設

select/poll/epoll分析：區別與聯繫

新視角，改變自己也是一種改變世界

面試的一些小結

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結