epoll源碼探祕(epoll_create)
epoll系列的系統函數,很簡單,但是很強大。epoll_create(),epoll_ctl() , epoll_wait(),三個就夠了。
一些重要的結構:
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
* Avoid increasing the size of this struct, there can be many thousands
* of these on a server and we do not want this to take another cache line.
* (Each epitem is a single node of the red-black tree.)
*/
struct epitem {
/* rbn and rcu share storage: only one of the two is in use at a time */
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/*
* Works together "struct eventpoll"->ovflist in keeping the
* single linked chain of items.
*/
struct epitem *next;
/* The file descriptor information this item refers to (the associated fd) */
struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */
int nwait;
/* List containing poll wait queues */
struct list_head pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/*
* The structure that describe the interested events and the source fd
* (i.e. which states of the monitored file descriptor the caller cares about)
*/
struct epoll_event event;
};
/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
/* Protect the access to this structure */
spinlock_t lock;
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;
/*
* Wait queue used by sys_epoll_wait().
* This is a doubly linked list: the wait queue of the epoll file.
* A process calling epoll_wait may sleep on this queue until it is
* woken by ep_poll_callback() or its timeout expires.
*/
wait_queue_head_t wq;
/*
* Wait queue used by file->poll().
* This is a doubly linked list: the wakeup queue of the eventpoll file
* itself. Processes sleeping on it are waiting for events to occur on
* the eventpoll file itself.
*/
wait_queue_head_t poll_wait;
/* List of ready file descriptors (the ready list) */
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct rb_root rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
* While events are being transferred to user space, items whose file
* descriptors become ready are parked on this list temporarily;
* otherwise they are added directly to the ready list, rdllist.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
/* The file this eventpoll is attached to; assigned in epoll_create1 */
struct file *file;
/* used to optimize loop detection check */
int visited;
/* NOTE(review): presumably links this eventpoll into a list of visited instances during loop detection; confirm */
struct list_head visited_list_link;
};
epoll_create函數:
創建一個epoll的句柄。需要注意的是,當創建好epoll句柄後,它就會佔用一個fd值,在linux下如果查看/proc/進程id/fd/,是能夠看到這個fd的,所以在使用完epoll後,必須調用close()關閉,否則可能導致fd被耗盡。
int epoll_create(int size);
int epoll_create1(int flags);
第一級:epoll_create()(注意:自Linux 2.6.8起,size參數的數值不再被使用,但仍必須大於0,否則返回-EINVAL)
第二級: epoll_create1()
第三級:ep_alloc()創建內部數據(eventpoll)
在ep_alloc()中
1.初始化epoll文件等待隊列(雙向鏈表)
2.初始化eventpoll文件喚醒隊列(雙向鏈表)
3.初始化就緒隊列(雙向鏈表)
/*
 * Initialise @list as an empty list head: both of its links are made to
 * point back at the head itself, so the list contains no other entries.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	/* The two stores are independent; an empty head is its own neighbour. */
	list->prev = list;
	list->next = list;
}
4.初始化紅黑樹根節點
/*
* Compound literal for an empty tree: the first member of struct rb_root
* is set to NULL and any remaining members are zero-initialised.
*/
#define RB_ROOT (struct rb_root) { NULL, }
/* The eventpoll starts out with an empty red-black tree. */
ep->rbr = RB_ROOT;
5.初始化ovflist(單鏈表,用於在向用戶空間傳遞事件期間暫存新就緒的epitem)
/* Sentinel pointer value: the integer -1 cast to a pointer. */
#define EP_UNACTIVE_PTR ((void *) -1L)
/* NOTE(review): the name suggests this marks the overflow list as inactive (not currently collecting items); confirm */
ep->ovflist = EP_UNACTIVE_PTR;
第三級:get_unused_fd_flags()獲取一個空閒的文件描述符
第三級:anon_inode_getfile()創建一個匿名文件
第三級:fd_install()將文件與fd建立聯繫
/*
 * Open an eventpoll file descriptor.
 *
 * @flags: may contain EPOLL_CLOEXEC only; any other bit yields -EINVAL.
 *
 * Returns the new file descriptor on success, or a negative error code
 * taken from whichever step failed (ep_alloc(), get_unused_fd_flags() or
 * anon_inode_getfile()).
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	struct eventpoll *ep = NULL;
	struct file *epfile;
	int open_flags;
	int epfd;
	int err;

	/*
	 * Check the EPOLL_* constant for consistency: the build breaks if
	 * EPOLL_CLOEXEC and O_CLOEXEC differ, because the user-supplied
	 * flags are masked with O_CLOEXEC further down.
	 */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if ((flags & ~EPOLL_CLOEXEC) != 0)
		return -EINVAL;

	/* Create the internal data structure ("struct eventpoll"). */
	err = ep_alloc(&ep);
	if (err < 0)
		return err;

	/* The descriptor and the file are both opened with the same flags. */
	open_flags = O_RDWR | (flags & O_CLOEXEC);

	/*
	 * Create all the items needed to set up an eventpoll file: a free
	 * file descriptor and a file structure. The eventpoll instance is
	 * handed to anon_inode_getfile() so that it ends up in the file's
	 * private_data member, which holds the object a descriptor really
	 * refers to (for a socket descriptor, for example, that member
	 * holds the socket instance).
	 */
	epfd = get_unused_fd_flags(open_flags);
	if (epfd < 0) {
		err = epfd;
		goto out_free_ep;
	}

	epfile = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				    open_flags);
	if (IS_ERR(epfile)) {
		err = PTR_ERR(epfile);
		goto out_free_fd;
	}

	/* Remember the file in the eventpoll, then bind it to the descriptor. */
	ep->file = epfile;
	fd_install(epfd, epfile);
	return epfd;

	/* Error unwinding: release resources in reverse order of acquisition. */
out_free_fd:
	put_unused_fd(epfd);
out_free_ep:
	ep_free(ep);
	return err;
}
/*
 * Legacy entry point: behaves like epoll_create1() with no flags.
 *
 * @size: its value is not used beyond the check below, but it must be
 *        strictly positive; otherwise -EINVAL is returned.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size > 0)
		return sys_epoll_create1(0);

	return -EINVAL;
}