1. The poll implementation
The poll mechanism of the kernel's VFS layer is the foundation of asynchronous waiting for the whole system: whether it is the classic poll and select system calls or the more recent epoll, their in-kernel implementations are all built on each file type providing its own poll method. For the kernel side, however, this poll is not a blocking interface. It is the interface closest to the original English meaning of the word: a poll is just an inquiry, and it does not block. And when some event really does occur on the file, such as it becoming readable or writable, or an error arising, the file dutifully wakes up the waiters that registered themselves through poll.
2. User-space poll
Let's first look at the man page for the user-space poll interface:
NAME
poll, ppoll - wait for some event on a file descriptor
SYNOPSIS
#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
#define _GNU_SOURCE
#include <poll.h>
int ppoll(struct pollfd *fds, nfds_t nfds,
const struct timespec *timeout, const sigset_t *sigmask);
DESCRIPTION
poll() performs a similar task to select(2): it waits for one of a set
of file descriptors to become ready to perform I/O.
Note that this user-space wait is a blocking wait: the system call only returns once some file in the set becomes operable (or the timeout expires).
The set of file descriptors to be monitored is specified in the fds
argument, which is an array of nfds structures of the following form:
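struct pollfd {
    int   fd;         /* file descriptor */
    short events;     /* requested events */
    short revents;    /* returned events */
};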
So what if we want to check whether a file is readable without blocking at all? That is actually simple: pass poll a timeout of 0, which turns the call into a genuine instantaneous check. Alternatively, you can put the file into non-blocking mode and simply attempt the read; a small sketch of the zero-timeout approach follows.
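As a concrete illustration (a minimal user-space sketch; readable_now is a hypothetical helper and fd is assumed to be an already-open descriptor):

#include <poll.h>

/* Return 1 if fd is readable right now, 0 if not, -1 on error.
 * A timeout of 0 makes poll() return immediately: a pure inquiry, no sleeping. */
static int readable_now(int fd)
{
    struct pollfd pfd = { .fd = fd, .events = POLLIN };
    int n = poll(&pfd, 1, 0);               /* timeout == 0: never blocks */

    if (n < 0)
        return -1;
    return (n > 0 && (pfd.revents & POLLIN)) ? 1 : 0;
}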
How the user-space poll enters the kernel:
static int do_poll(unsigned int nfds, struct poll_list *list,
                   struct poll_wqueues *wait, s64 *timeout)
    for (;;) {
        struct poll_list *walk;
        long __timeout;

        set_current_state(TASK_INTERRUPTIBLE);
        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd *pfd, *pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill the poll_table, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt)) {
        ...
        __timeout = schedule_timeout(__timeout);
        /*
         * Here, following poll semantics, the task gives up the CPU: as far as
         * the user-space program is concerned it is now blocked.  But the
         * do_pollfd() calls above have already hooked this task onto the
         * target files' wait queues, so a file that becomes ready can wake it
         * back to the runnable state before the timeout expires, and execution
         * then continues.
         */
        if (*timeout >= 0)
            *timeout += __timeout;
    }
3. Kernel-space poll
The kernel-space poll is really one of a file's basic interface functions, as fundamental an entry point as open, close, read and write; any file type with somewhat special behaviour may define it. Simpler files, however, do without it entirely: an ordinary file on the most basic ext2 filesystem, for instance, has no need to implement a poll method. As you can see below, the plain ext2 file operations contain no poll entry at all.
const struct file_operations ext2_file_operations = {
    .llseek         = generic_file_llseek,
    .read           = do_sync_read,
    .write          = do_sync_write,
    .aio_read       = generic_file_aio_read,
    .aio_write      = generic_file_aio_write,
    .ioctl          = ext2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext2_compat_ioctl,
#endif
    .mmap           = generic_file_mmap,
    .open           = generic_file_open,
    .release        = ext2_release_file,
    .fsync          = ext2_sync_file,
    .sendfile       = generic_file_sendfile,
    .splice_read    = generic_file_splice_read,
    .splice_write   = generic_file_splice_write,
};
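Incidentally, this is also why such an fd can still be handed to poll() without error: in do_pollfd() (fs/select.c), a missing ->poll simply leaves the default mask in place, so the file is reported as always ready. Roughly, as a simplified sketch of the relevant branch in kernels of this era:

file = fget_light(fd, &fput_needed);
mask = POLLNVAL;
if (file != NULL) {
    mask = DEFAULT_POLLMASK;                /* no ->poll method: always "ready" */
    if (file->f_op && file->f_op->poll)
        mask = file->f_op->poll(file, pwait);
    /* Mask out unneeded events. */
    mask &= pollfd->events | POLLERR | POLLHUP;
    fput_light(file, fput_needed);
}
pollfd->revents = mask;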
Take the relatively simple poll method of a pipe as an example:
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
    unsigned int mask;
    struct inode *inode = filp->f_path.dentry->d_inode;
    struct pipe_inode_info *pipe = inode->i_pipe;
    int nrbufs;

    poll_wait(filp, &pipe->wait, wait);

    /* Reading only -- no need for acquiring the semaphore. */
    nrbufs = pipe->nrbufs;
    mask = 0;
    if (filp->f_mode & FMODE_READ) {
        mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
        if (!pipe->writers && filp->f_version != pipe->w_counter)
            mask |= POLLHUP;
    }

    if (filp->f_mode & FMODE_WRITE) {
        mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
        /*
         * Most Unices do not set POLLERR for FIFOs but on Linux they
         * behave exactly like pipes for poll().
         */
        if (!pipe->readers)
            mask |= POLLERR;
    }

    return mask;
}
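For completeness, the wake-up half of the contract described in section 1 lives in the pipe's read and write paths rather than in pipe_poll() itself. Roughly, once pipe_write() has copied data into the buffer it does something like the following (simplified from fs/pipe.c of the same era):

if (do_wakeup) {
    /* Wake everything hanging on pipe->wait: this is where the waiters
     * registered via poll_wait()/qproc actually get woken. */
    wake_up_interruptible_sync(&pipe->wait);
    kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

Back on the registration side, the poll_wait() used by pipe_poll() above is itself only a thin wrapper: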
static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    if (p && wait_address)
        p->qproc(filp, wait_address, p);
}
What happens here is in fact a call back into the qproc hook supplied by the waiter, because the third argument of poll_wait is the poll_table that the caller of poll passed in. So why does the caller hand over its own routine for joining the wait queue instead of doing the actual waiting itself? What is it missing?
Well, obviously, the crucial missing piece is the address of the wait queue head that belongs to this particular file. Each file has its own wait queue head at its own memory address, so each file has to expose that address itself. Once the waiter knows the address, it can hang its own, possibly specialized, wait structure on that queue in whatever way it sees fit, and the wait is in place.
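For reference, in kernels of this vintage the poll_table being passed around is little more than a container for that qproc callback (simplified from include/linux/poll.h):

typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
                                struct poll_table_struct *);

typedef struct poll_table_struct {
    poll_queue_proc qproc;      /* the waiter's own "enqueue me" routine */
} poll_table;

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->qproc = qproc;
}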
Now let's look at epoll's qproc function. It is installed in ep_insert(), which is reached from sys_epoll_ctl handling the EPOLL_CTL_ADD command:
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                     struct file *tfile, int fd)
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function.
     */
    revents = tfile->f_op->poll(tfile, &epq.pt);
And the operation that actually joins the queue is:
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        /* Once again the wake-up function used when the file becomes ready
         * is customized: ep_poll_callback instead of the default one. */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
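For orientation, the user-space call that drives this whole path is epoll_ctl with EPOLL_CTL_ADD (a minimal sketch; watch_readable is a hypothetical helper, and epfd and sockfd are assumed to be an epoll instance and a monitored descriptor created elsewhere):

#include <sys/epoll.h>

/* Registering sockfd goes through sys_epoll_ctl -> ep_insert ->
 * tfile->f_op->poll -> ep_ptable_queue_proc as shown above. */
static int watch_readable(int epfd, int sockfd)
{
    struct epoll_event ev;

    ev.events  = EPOLLIN;
    ev.data.fd = sockfd;
    return epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
}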
For comparison, let's look at how select does it:
void poll_initwait(struct poll_wqueues *pwq)
{
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}

static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p)
{
    struct poll_table_entry *entry = poll_get_entry(p);

    if (!entry)
        return;
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    init_waitqueue_entry(&entry->wait, current);
    add_wait_queue(wait_address, &entry->wait);
}
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
    q->flags = 0;
    q->private = p;
    q->func = default_wake_function;
    /*
     * The generic select/poll path only wakes the task; it records nothing
     * about which file fired.  That is the root cause of its inefficiency:
     * after being woken, select must poll every descriptor again to find out
     * which ones are actually ready.
     */
}
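To make the contrast concrete, default_wake_function (kernel/sched.c in kernels of this era) does nothing but wake the sleeping task, roughly:

int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
    /* Wake whoever is stored in curr->private; nothing about which file or
     * event triggered the wake-up is recorded anywhere. */
    return try_to_wake_up(curr->private, mode, sync);
}

ep_poll_callback, installed earlier by ep_ptable_queue_proc(), does the extra work: it links the ready epitem onto the eventpoll ready list before waking the waiter, which is why epoll_wait does not have to rescan every monitored file afterwards.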