1 poll的使用
相對於select來說,poll 也是在指定時間內輪詢一定數量的文件描述符,來測試其中是否有就緒的;不過,poll 提供了一個更易用的方法來實現 I/O 複用。
聲明如下:
#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
其中,struct pollfd 定義爲:
/* One entry of the descriptor array passed to poll(). */
struct pollfd {
int fd; /* file descriptor to watch */
short events; /* requested events: bitwise OR of the events to listen for on fd */
short revents; /* returned events: filled in by the kernel with what actually happened on fd */
};
fd 爲文件描述符,events 告訴poll 監聽fd 上哪些事件,它是一系列事件按位或。revents 由內核修改,來通知應用程序fd 上實際上發生了哪些事件。
nfds 爲監聽事件集合fds的大小
timeout 爲poll的超時時間,單位毫秒。timeout 爲-1時,poll永遠阻塞,直到有事件發生。timeout爲0時,poll立即返回。
下面舉一個例子來看一下具體如何使用poll來監聽文件的狀態
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <map>
#include <string>
using namespace std;
#define BUFSIZE 10
#define CLIENTSIZE 100
/*
 * Create a TCP socket, enable SO_REUSEADDR, bind it to 0.0.0.0:8008 and
 * put it into listening state (backlog 5).
 *
 * Returns the listening descriptor. Any failure is reported on stderr and
 * terminates the process with exit(1), so the function never returns -1.
 */
int createSocket()
{
    int listenfd = socket(AF_INET, SOCK_STREAM, 0);
    if (-1 == listenfd)
    {
        fprintf(stderr, "socket: %d, %s\n", errno, strerror(errno));
        exit(1);
    }

    int reuseaddr = 1;
    if (-1 == setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(reuseaddr)))
    {
        fprintf(stderr, "setsockopt: %d, %s\n", errno, strerror(errno));
        exit(1);
    }

    struct sockaddr_in servaddr;
    memset(&servaddr, 0, sizeof(servaddr));
    /* sin_family and inet_pton() take an address family (AF_*), not a protocol family (PF_*). */
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(8008);
    /* inet_pton() returns 1 only when the text was a valid address for the family. */
    if (1 != inet_pton(AF_INET, "0.0.0.0", &servaddr.sin_addr))
    {
        fprintf(stderr, "inet_pton: invalid listen address\n");
        exit(1);
    }

    if (-1 == bind(listenfd, (struct sockaddr*)&servaddr, sizeof(servaddr)))
    {
        fprintf(stderr, "bind: %d, %s\n", errno, strerror(errno));
        exit(1);
    }
    if (-1 == listen(listenfd, 5))
    {
        fprintf(stderr, "listen: %d, %s\n", errno, strerror(errno));
        exit(1);
    }
    return listenfd;
}
/*
 * Switch fd to non-blocking mode.
 *
 * Returns the file status flags that were in effect before the change, or
 * -1 if either fcntl() call fails (errno is left as set by fcntl()).
 */
int setnoblock(int fd)
{
    int oldopt = fcntl(fd, F_GETFL);
    if (-1 == oldopt)
    {
        /* Do not feed "-1 | O_NONBLOCK" into F_SETFL. */
        return -1;
    }
    if (-1 == fcntl(fd, F_SETFL, oldopt | O_NONBLOCK))
    {
        return -1;
    }
    return oldopt;
}
int main()
{
struct pollfd fds[CLIENTSIZE];
int listenfd = createSocket();
map<int, string> mapdata;
fds[0].fd = listenfd;
fds[0].events = POLLIN | POLLERR;//監聽 in和err事件
fds[0].revents = 0;
int conncount = 0;
while (1)
{
int ret = poll(fds, conncount + 1, -1);
if (ret < 0)
{
fprintf(stderr, "poll: %d, %s\n", errno, strerror(errno));
exit(1);
}
for (int i = 0; i < conncount + 1; i++)
{
// 客戶端關閉,或者錯誤。錯誤的原因是由於客戶端關閉導致的,這裏一併處理
if ((fds[i].revents & POLLRDHUP) || (fds[i].revents & POLLERR))
{
int fd = fds[i].fd;
fds[i] = fds[conncount];
i--;
conncount--;
mapdata.erase(fd);
close(fd);
printf("delete connection: %d\n", fd);
}
// 新的連接
else if ((fds[i].fd == listenfd) && (fds[i].revents & POLLIN))
{
struct sockaddr_in client;
socklen_t lenaddr = sizeof(client);
int conn = -1;
if (-1 == (conn = accept(listenfd, (struct sockaddr*)&client, &lenaddr)))
{
fprintf(stderr, "accept: %d, %s\n", errno, strerror(errno));
exit(1);
}
printf("get connection %d from %s:%d\n", conn, inet_ntoa(client.sin_addr), client.sin_port);
conncount++;
setnoblock(conn);
fds[conncount].fd = conn;
fds[conncount].events = POLLIN | POLLRDHUP | POLLERR;
fds[conncount].revents = 0;
}
// 有可讀數據
else if (fds[i].revents & POLLIN)
{
char buf[BUFSIZE] = {0};
int lenrecv = recv(fds[i].fd, buf, BUFSIZE-1, 0);
if (lenrecv > 0)
{
mapdata[fds[i].fd] = buf;
fds[i].events &= (~POLLIN);
fds[i].events |= POLLOUT;
}
else if (lenrecv == 0)
{
printf("------- client %d exit (not print) --------\n", fds[i].fd);
}
else
{
fprintf(stderr, "recv: %d, %s\n", errno, strerror(errno));
exit(1);
}
}
// 可寫數據
else if (fds[i].revents & POLLOUT)
{
if (send(fds[i].fd, mapdata[fds[i].fd].c_str(), mapdata[fds[i].fd].size(), 0) < 0)
{
if (ECONNRESET == errno)
{
continue;
}
fprintf(stderr, "send: %d, %s\n", errno, strerror(errno));
exit(1);
}
fds[i].events &= (~POLLOUT);
fds[i].events |= POLLIN;
}
}
}
}
可以看到設置了監聽事件以後,當進程返回的時候,可以從返回的revents中讀取就緒的事件。
上面內容參考自:
https://www.cnblogs.com/zuofaqi/p/9631601.html
2 poll的原理實現
poll的實現依賴於內核的等待隊列。等待隊列的原理可以參考這篇文章:
https://blog.csdn.net/oqqYuJi12345678/article/details/106304644
應用層調用poll函數的時候,內核中對應的實現如下:
// Structure used by poll
struct pollfd {
int fd; // file descriptor
short events; // mask of the events the caller is interested in
short revents; // mask of the events returned to the caller
};
/*
 * poll system call entry point. Equivalent prototype:
 *   long sys_poll(struct pollfd *ufds, unsigned int nfds, long timeout_msecs)
 *
 * A negative timeout_msecs leaves `to` NULL (no end time is passed to
 * do_sys_poll); otherwise the millisecond timeout is split into seconds
 * and nanoseconds and handed to poll_select_set_timeout() to fill end_time.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        long, timeout_msecs)
{
    struct timespec end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) {
        to = &end_time;
        /* Per the article: converts the relative msec timeout into an absolute time. */
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    /* do sys poll */
    ret = do_sys_poll(ufds, nfds, to);

    /* do_sys_poll was interrupted by a signal: arm the restart block. */
    if (ret == -EINTR) {
        struct restart_block *restart_block;

        restart_block = &current_thread_info()->restart_block;
        restart_block->fn = do_restart_poll; /* function used for the restart */
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;

        if (timeout_msecs >= 0) {
            /* Remember the end time already computed for this call. */
            restart_block->poll.tv_sec = end_time.tv_sec;
            restart_block->poll.tv_nsec = end_time.tv_nsec;
            restart_block->poll.has_timeout = 1;
        } else {
            restart_block->poll.has_timeout = 0;
        }

        /*
         * Per the article: ERESTART_RESTARTBLOCK is not handed back to the
         * user process; the kernel intercepts it and calls do_restart_poll.
         */
        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}
/*
 * Copy the caller's pollfd array into a chain of poll_list chunks, run
 * do_poll() over it and copy every revents field back to user space.
 *
 * Returns the value produced by do_poll() on success, -EINVAL when nfds
 * exceeds RLIMIT_NOFILE, -ENOMEM when a chunk cannot be allocated and
 * -EFAULT when copying from or to user space fails.
 *
 * The "(1)".."(4)" tags are referenced by the explanatory text that
 * follows this listing.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
        struct timespec *end_time)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
    /* Use on-stack storage first: saves memory and is faster to access. */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;

    if (nfds > rlimit(RLIMIT_NOFILE)) {
        /* More descriptors than the current process is allowed to have. */
        return -EINVAL;
    }

    /* Copy the user-space array into the kernel, chunk by chunk. */
    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len) {
            break;
        }

        /* (1) copy this chunk of pollfd entries into walk->entries */
        if (copy_from_user(walk->entries, ufds + nfds-todo,
                sizeof(struct pollfd) * walk->len)) {
            goto out_fds;
        }

        todo -= walk->len;
        if (!todo) {
            break;
        }

        /* Stack storage exhausted: allocate the remainder on the heap. */
        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

    /* (2) initialise the poll_wqueues; its callback is set to __pollwait */
    poll_initwait(&table);

    /* (3) the actual polling */
    fdcount = do_poll(nfds, head, &table, end_time);

    /* Per the article: removes our nodes from the files' wait queues and frees the entries. */
    poll_freewait(&table);

    /* Copy the results back to user space. */
    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        /*
         * (4) Iterate over this chunk's own length. `len` only holds the
         * size of the last chunk set up above, so using it here would copy
         * the wrong number of entries when the chunks differ in size.
         */
        for (j = 0; j < walk->len; j++, ufds++) {
            if (__put_user(fds[j].revents, &ufds->revents)) {
                goto out_fds;
            }
        }
    }
    err = fdcount;

out_fds:
    /* Free the heap chunks; the head chunk lives on the stack. */
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;

        walk = walk->next;
        kfree(pos);
    }
    return err;
}
(1)把用戶空間的pollfd 複製到內核空間,用戶空間的pollfd包含有設置的需要監聽的fd和events
(2)初始化 poll的等待隊列
/*
 * Prepare a poll_wqueues for a new poll pass: clear its bookkeeping
 * fields, record the current task as the poller and install __pollwait
 * as the callback of the embedded poll_table.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
    /* Start from a clean state. */
    pwq->table = NULL;
    pwq->inline_index = 0;
    pwq->error = 0;
    pwq->triggered = 0;

    /* The task that is polling is the one running this code. */
    pwq->polling_task = current;

    /* Install __pollwait as the poll_table callback. */
    init_poll_funcptr(&pwq->pt, __pollwait);
}
__pollwait函數在調用被監控文件的poll函數的時候會被回調,用來把當前進程的等待隊列項插入到驅動的等待隊列頭中。
(3)do_poll的詳細實現,遍歷每個文件描述符實現的poll函數,檢查該文件是否有數據可讀,如果有數據可讀,就返回,否則就把自己插入到驅動的等待隊列頭中睡眠,直到有數據到來時被喚醒
(4)把各個文件描述符poll調用得到的返回事件複製到用戶空間,用戶空間據此返回值可以來判斷文件的狀態。
下面接着看do_poll的詳細實現:
// 真正的處理函數
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
// 已經超時,直接遍歷所有文件描述符, 然後返回
pt = NULL;
timed_out = 1;
}
if (end_time && !timed_out) {
// 估計進程等待時間,納秒
slack = select_estimate_accuracy(end_time);
}
// 遍歷文件,爲每個文件的等待隊列添加喚醒函數(pollwake)
for (;;) {
struct poll_list *walk;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
// do_pollfd 會向文件對應的wait queue 中添加節點
// 和回調函數(如果 pt 不爲空)
// 並檢查當前文件狀態並設置返回的掩碼
-----------------------------------------------------------------(1)
if (do_pollfd(pfd, pt)) {
// 該文件已經準備好了.
// 不需要向後面文件的wait queue 中添加喚醒函數了.
count++;
pt = NULL;
}
}
}
// 下次循環的時候不需要向文件的wait queue 中添加節點,
// 因爲前面的循環已經把該添加的都添加了
pt = NULL;
// 第一次遍歷沒有發現ready的文件
if (!count) {
count = wait->error;
// 有信號產生
if (signal_pending(current)) {
count = -EINTR;
}
}
-----------------------------------------------------------------------(2)
// 有ready的文件或已經超時
if (count || timed_out) {
break;
}
// 轉換爲內核時間
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}
// 等待事件就緒, 如果有事件發生或超時,就再循
// 環一遍,取得事件狀態掩碼並計數,
// 注意此次循環中, 文件 wait queue 中的節點依然存在
---------------------------------------------------------------------------(3)
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) {
timed_out = 1;
}
}
return count;
}
(1)調用文件描述符提供的poll方法,可以看到,如果返回的mask不爲0,則表示有數據變化,該mask 值作爲do_pollfd的返回值不爲0,則count++
/*
 * Poll a single descriptor and store the outcome in pollfd->revents.
 *
 * - negative fd: the result is 0;
 * - fd without an open file: POLLNVAL;
 * - file without a poll method: DEFAULT_POLLMASK, filtered;
 * - otherwise: whatever the file's poll method reports, filtered.
 * "Filtered" means restricted to the requested events plus POLLERR and
 * POLLHUP.
 *
 * pwait is dereferenced whenever the file has a poll method, so callers
 * must pass a valid table.
 *
 * Returns the same mask that was stored in revents.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    const int fd = pollfd->fd;
    unsigned int revents = 0;

    if (fd >= 0) {
        struct fd f = fdget(fd);

        if (!f.file) {
            revents = POLLNVAL;
        } else {
            revents = DEFAULT_POLLMASK;
            if (f.file->f_op && f.file->f_op->poll) {
                /* Tell the poll method which events matter to this caller. */
                pwait->_key = pollfd->events | POLLERR | POLLHUP;
                revents = f.file->f_op->poll(f.file, pwait);
            }
            /* Mask out unneeded events. */
            revents &= pollfd->events | POLLERR | POLLHUP;
            fdput(f);
        }
    }

    pollfd->revents = revents;
    return revents;
}
(2)如果count不爲0,或者超時,則poll不會睡眠,直接返回
(3)如果沒有超時,並且沒有數據變化,則睡眠;由於外圍是個for的死循環,所以醒來以後又會遍歷所有的描述符事件,來獲取數據以及決定是否跳出poll方法
從上面的實現可以看出,如果應用層要實現poll這樣的機制,是需要驅動層來提供自己的poll函數的。下面看一下,驅動層如何實現一個簡單的poll方法
3 驅動的poll實現
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/cdev.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/poll.h>
#include "memdev.h"
/* Major device number; declared int to match module_param() below (0 requests dynamic allocation in memdev_init). */
static int mem_major = MEMDEV_MAJOR;
bool have_data = false; /* true while the device holds data that can be read */
module_param(mem_major, int, S_IRUGO);
struct mem_dev *mem_devp; /* pointer to the array of device structures */
struct cdev cdev;
/*文件打開函數*/
int mem_open(struct inode *inode, struct file *filp)
{
struct mem_dev *dev;
/*獲取次設備號*/
int num = MINOR(inode->i_rdev);
if (num >= MEMDEV_NR_DEVS)
return -ENODEV;
dev = &mem_devp[num];
/*將設備描述結構指針賦值給文件私有數據指針*/
filp->private_data = dev;
return 0;
}
/* release() handler: there is nothing to tear down, so it always returns 0. */
int mem_release(struct inode *inode, struct file *filp)
{
return 0;
}
/*
 * read() handler: copy up to `size` bytes starting at *ppos from the
 * device buffer to user space, waiting for data first if none is flagged.
 *
 * Returns the number of bytes read, 0 when *ppos is at or beyond the end
 * of the device, -EAGAIN when no data is available and the file is
 * non-blocking, -EFAULT when the copy to user space fails, or the
 * non-zero result of wait_event_interruptible() when the sleep is
 * interrupted.
 */
static ssize_t mem_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
{
    unsigned long p = *ppos;
    unsigned int count = size;
    int ret = 0;
    struct mem_dev *dev = filp->private_data; /* device attached by mem_open */

    /* Reject positions outside the device and clamp the length to what is left. */
    if (p >= MEMDEV_SIZE)
        return 0;
    if (count > MEMDEV_SIZE - p)
        count = MEMDEV_SIZE - p;

    /*
     * A loop rather than a single test: the flag is checked again after
     * every wakeup, so we only proceed once data is really flagged.
     */
    while (!have_data) {
        if (filp->f_flags & O_NONBLOCK)
            return -EAGAIN;

        /*
         * A non-zero result means the sleep was interrupted; hand it back
         * instead of spinning in this loop with a signal pending.
         */
        ret = wait_event_interruptible(dev->inq, have_data);
        if (ret)
            return ret;
    }

    /* Copy the data to user space. */
    if (copy_to_user(buf, (void *)(dev->data + p), count)) {
        ret = -EFAULT;
    } else {
        *ppos += count;
        ret = count;
        /* count is unsigned int and p is unsigned long: use matching specifiers. */
        printk(KERN_INFO "read %u bytes(s) from %lu\n", count, p);
    }

    have_data = false; /* no more data to read until the next write */
    return ret;
}
/*
 * write() handler: copy up to `size` bytes from user space into the
 * device buffer at *ppos, flag that data is available and wake up
 * sleeping readers.
 *
 * Returns the number of bytes written, 0 when *ppos is at or beyond the
 * end of the device, or -EFAULT when the copy from user space fails (in
 * which case readers are not woken and have_data is left unchanged).
 */
static ssize_t mem_write(struct file *filp, const char __user *buf, size_t size, loff_t *ppos)
{
    unsigned long p = *ppos;
    unsigned int count = size;
    struct mem_dev *dev = filp->private_data; /* device attached by mem_open */

    /* Work out the valid length for this write. */
    if (p >= MEMDEV_SIZE)
        return 0;
    if (count > MEMDEV_SIZE - p)
        count = MEMDEV_SIZE - p;

    /* Copy the data in from user space. */
    if (copy_from_user(dev->data + p, buf, count))
        return -EFAULT;

    *ppos += count;
    /* count is unsigned int and p is unsigned long: use matching specifiers. */
    printk(KERN_INFO "written %u bytes(s) from %lu\n", count, p);

    /* Only announce data once it has actually been stored. */
    have_data = true;
    /* Wake up readers sleeping on the device's wait queue. */
    wake_up(&(dev->inq));

    return count;
}
/*
 * llseek() handler: compute the new file position from `whence` and
 * `offset`, store it in filp->f_pos and return it.
 *
 * Positions from 0 to MEMDEV_SIZE inclusive are accepted. Returns
 * -EINVAL for an unknown `whence` or an out-of-range position.
 */
static loff_t mem_llseek(struct file *filp, loff_t offset, int whence)
{
    loff_t newpos;

    switch (whence) {
    case 0: /* SEEK_SET */
        newpos = offset;
        break;
    case 1: /* SEEK_CUR */
        newpos = filp->f_pos + offset;
        break;
    case 2: /* SEEK_END */
        /*
         * The end of the device is MEMDEV_SIZE (the same bound the range
         * check below accepts), so a zero offset must land exactly there,
         * not one byte before it.
         */
        newpos = MEMDEV_SIZE + offset;
        break;
    default: /* can't happen */
        return -EINVAL;
    }

    if ((newpos < 0) || (newpos > MEMDEV_SIZE))
        return -EINVAL;

    filp->f_pos = newpos;
    return newpos;
}
/*
 * poll() handler: register the device's read wait queue with the poll
 * table, then report the current readiness.
 * Returns POLLIN | POLLRDNORM while have_data is set, 0 otherwise.
 */
unsigned int mem_poll(struct file *filp, poll_table *wait)
{
    struct mem_dev *dev = filp->private_data;

    /* Hand the device's wait queue to the poll table. */
    poll_wait(filp, &dev->inq, wait);

    /* Readable only while data is flagged as available. */
    return have_data ? (POLLIN | POLLRDNORM) : 0;
}
/* File operations table: wires the handlers defined in this file to the character device. */
static const struct file_operations mem_fops =
{
.owner = THIS_MODULE,
.llseek = mem_llseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
.release = mem_release,
.poll = mem_poll,
};
/*設備驅動模塊加載函數*/
static int memdev_init(void)
{
int result;
int i;
dev_t devno = MKDEV(mem_major, 0);
/* 靜態申請設備號*/
if (mem_major)
result = register_chrdev_region(devno, 2, "memdev");
else /* 動態分配設備號 */
{
result = alloc_chrdev_region(&devno, 0, 2, "memdev");
mem_major = MAJOR(devno);
}
if (result < 0)
return result;
/*初始化cdev結構*/
cdev_init(&cdev, &mem_fops);
cdev.owner = THIS_MODULE;
cdev.ops = &mem_fops;
/* 註冊字符設備 */
cdev_add(&cdev, MKDEV(mem_major, 0), MEMDEV_NR_DEVS);
/* 爲設備描述結構分配內存*/
mem_devp = kmalloc(MEMDEV_NR_DEVS * sizeof(struct mem_dev), GFP_KERNEL);
if (!mem_devp) /*申請失敗*/
{
result = - ENOMEM;
goto fail_malloc;
}
memset(mem_devp, 0, sizeof(struct mem_dev));
/*爲設備分配內存*/
for (i=0; i < MEMDEV_NR_DEVS; i++)
{
mem_devp[i].size = MEMDEV_SIZE;
mem_devp[i].data = kmalloc(MEMDEV_SIZE, GFP_KERNEL);
memset(mem_devp[i].data, 0, MEMDEV_SIZE);
/*初始化等待隊列*/
init_waitqueue_head(&(mem_devp[i].inq));
//init_waitqueue_head(&(mem_devp[i].outq));
}
return 0;
fail_malloc:
unregister_chrdev_region(devno, 1);
return result;
}
/*模塊卸載函數*/
static void memdev_exit(void)
{
cdev_del(&cdev); /*註銷設備*/
kfree(mem_devp); /*釋放設備結構體內存*/
unregister_chrdev_region(MKDEV(mem_major, 0), 2); /*釋放設備號*/
}
/* Module metadata, and registration of memdev_init / memdev_exit as the module's init and exit functions. */
MODULE_AUTHOR("David Xie");
MODULE_LICENSE("GPL");
module_init(memdev_init);
module_exit(memdev_exit);
其中poll_wait調用的是poll_table初始化時賦值的回調函數,即__pollwait;它把當前進程插入到該驅動提供的等待隊列頭中,最後數據到達時,一般會由該驅動來喚醒等待隊列頭中的進程
/*
 * Invoke the poll table's queueing callback for wait_address.
 * Does nothing when the table, its callback or the wait queue head is
 * missing.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (!p || !p->_qproc || !wait_address)
        return;

    p->_qproc(filp, wait_address, p);
}