• poll(2) 源码分析


    poll(2)

    poll(2) 系统调用的功能和 select(2) 类似:等待一个文件集合中的文件描述符就绪进行I/O操作。

    select(2) 的局限性:

    • 关注的文件描述符集合大小最大只有 1024
    • 文件描述符集合为顺序的,不能任意指定 fd,浪费占用的fd

    poll(2) 对 select(2) 的改进:关注的文件描述符集合为动态大小,文件描述符可以任意指定。

    /*
     * One entry of the array handed to poll(2): the descriptor to watch,
     * the event bitmask requested by the caller (input) and the event
     * bitmask reported back (output).
     */
    struct pollfd {
           int   fd;         /* file descriptor */
           short events;     /* requested events */
           short revents;    /* returned events */
    };
    
    - fd 为关注的文件描述符
    - events 为关注的事件(输入),使用位掩码来表示事件
    - revents 为就绪的事件(输出),同样使用位掩码表示
    
    #include <poll.h>
    
    int poll(struct pollfd *fds, nfds_t nfds, int timeout);
    
    - fds 为文件描述符集合的地址
    - nfds 为文件描述符集合的长度
    - timeout 为超时的时间,单位为 毫秒
    
    返回值为 revents 不为 0 的个数,出错返回 -1
    

    一个简单的例子:等待标准输入就绪,超时时间为3s。

    #include <poll.h>
    #include <unistd.h>
    #include <stdio.h>
    
    /*
     * Minimal poll(2) example: wait up to 3 seconds for standard input to
     * become readable and print which of the three outcomes occurred
     * (error, data ready, timeout).
     */
    int main(void)
    {
            int timeout = 3000;  /* milliseconds */
    
            /* Zero-initialised, so fds.fd == 0, i.e. standard input. */
            struct pollfd fds = {0};
            fds.events |= POLLIN;
    
            int ret = poll(&fds, 1, timeout);
            if (ret == -1)
                    printf("error poll\n");
            else if (ret)
                    printf("data is avaliable now.\n");
            else
                    printf("no data within 3000 ms.\n");
    
            return 0;
    }
    

    实现

    代码位于 fs/select.c 中,参考中的链接有一些关于文件回调和 poll 结构的说明

    poll()

    /*
     * poll(2) system-call entry point.
     *
     * ufds:          user-space array of struct pollfd
     * nfds:          number of entries in ufds
     * timeout_msecs: timeout in milliseconds; when negative, `to` stays NULL
     *                and no end time is passed to do_sys_poll()
     *
     * Returns the result of do_sys_poll(), except that -EINTR is replaced by
     * -ERESTART_RESTARTBLOCK after the task's restart block has been filled in.
     */
    SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                    int, timeout_msecs)
    {
            struct timespec64 end_time, *to = NULL;
            int ret;
    
            /*
             * Split the millisecond timeout into seconds + nanoseconds and let
             * poll_select_set_timeout() fill in end_time.
             */
            if (timeout_msecs >= 0) {
                    to = &end_time;
                    poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
            }
    
            ret = do_sys_poll(ufds, nfds, to);
    
            /*
             * On -EINTR, record the arguments (and the end time, if any) in
             * current->restart_block with do_restart_poll as the restart
             * function, then report -ERESTART_RESTARTBLOCK instead.
             */
            if (ret == -EINTR) {
                    struct restart_block *restart_block;
    
                    restart_block = &current->restart_block;
                    restart_block->fn = do_restart_poll;
                    restart_block->poll.ufds = ufds;
                    restart_block->poll.nfds = nfds;
    
                    if (timeout_msecs >= 0) {
                            restart_block->poll.tv_sec = end_time.tv_sec;
                            restart_block->poll.tv_nsec = end_time.tv_nsec;
                            restart_block->poll.has_timeout = 1;
                    } else
                            restart_block->poll.has_timeout = 0;
    
                    ret = -ERESTART_RESTARTBLOCK;
            }
            return ret;
    }
    

    poll() 代码很简单:

    1. 处理超时时间
    2. 实现 poll(2)
    3. 处理后事:若被信号中断(do_sys_poll() 返回 -EINTR),则把参数和超时时间保存到 restart_block 中,以便重新调用。

    do_sys_poll()

    
    /*
     * Copy the user's pollfd array into a kernel-side chain of poll_list
     * chunks, run do_poll() on it, then write each entry's revents back to
     * user space.
     *
     * ufds:     user-space array of struct pollfd
     * nfds:     number of entries in ufds
     * end_time: passed through unchanged to do_poll()
     *
     * Returns the value of do_poll() on success, -EINVAL when nfds exceeds
     * RLIMIT_NOFILE, -ENOMEM when kmalloc() fails, and -EFAULT when a copy
     * from or to user space fails.
     */
    static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                    struct timespec64 *end_time)
    {
            struct poll_wqueues table;
             int err = -EFAULT, fdcount, len, size;
            /* Allocate small arguments on the stack to save memory and be
               faster - use long to make sure the buffer is aligned properly
               on 64 bit archs to avoid unaligned access */
            long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  // 256 bytes
            struct poll_list *const head = (struct poll_list *)stack_pps;
             struct poll_list *walk = head;
             unsigned long todo = nfds;
    
            if (nfds > rlimit(RLIMIT_NOFILE))  // limit on the number of open files
                    return -EINVAL;
    
            // N_STACK_PPS = (256 - 16) / 8 = 30: the on-stack buffer holds 30 pollfd structs
            // copy the first part of the user-space struct pollfd array into the on-stack chunk
            len = min_t(unsigned int, nfds, N_STACK_PPS);
            for (;;) {
                    walk->next = NULL;
                    walk->len = len;
                    if (!len)
                            break;
    
                    /* nfds - todo is the number of entries copied so far */
                    if (copy_from_user(walk->entries, ufds + nfds-todo,
                                            sizeof(struct pollfd) * walk->len))
                            goto out_fds;
    
                    todo -= walk->len;
                    if (!todo)
                            break;
    
                    // POLLFD_PER_PAGE = (4096 - 16) / 8 = 510
                    // allocate another chunk; one page can hold 510 pollfd structs
                    len = min(todo, POLLFD_PER_PAGE);
                    size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
                    walk = walk->next = kmalloc(size, GFP_KERNEL);
                    if (!walk) {
                            err = -ENOMEM;
                            goto out_fds;
                    }
            }
            // all pollfd structs have now been copied into the kernel-side list starting at head
    
            poll_initwait(&table);  // initialise table (see the select analysis in the references)
            fdcount = do_poll(head, &table, end_time);
            poll_freewait(&table);  // release table
    
            // copy revents back to user space
            for (walk = head; walk; walk = walk->next) {
                    struct pollfd *fds = walk->entries;
                    int j;
    
                    for (j = 0; j < walk->len; j++, ufds++)
                            if (__put_user(fds[j].revents, &ufds->revents))
                                    goto out_fds;
              }
    
            err = fdcount;
    out_fds:
            /* head itself lives in stack_pps; only the chunks after it are freed */
            walk = head->next;
            while (walk) {
                    struct poll_list *pos = walk;
                    walk = walk->next;
                    kfree(pos);
            }
    
            return err;
    }
    

    do_sys_poll() 函数也是分为三步实现

    1. 将用户空间的数据复制到内核空间
    2. 调用核心实现 do_poll()
    3. 将就绪的事件数据从内核空间复制到用户空间

    do_poll()

    /*
     * Core poll loop: call do_pollfd() on every entry of every chunk in
     * `list`, and repeat (sleeping in between) until at least one entry
     * reports events, an error or pending signal is seen, or the wait
     * times out.
     *
     * list:     chain of poll_list chunks holding the pollfd entries
     * wait:     poll wait-queue state; wait->pt is handed to do_pollfd()
     * end_time: end time of the wait, or NULL for none; an all-zero value
     *           takes the no-wait path
     *
     * Returns the number of entries for which do_pollfd() returned a non-zero
     * mask; when there are none, returns wait->error, or -EINTR if a signal
     * is pending.
     */
    static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                       struct timespec64 *end_time)
    {
            poll_table* pt = &wait->pt;
            ktime_t expire, *to = NULL;
            int timed_out = 0, count = 0;
            u64 slack = 0;
            __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
            unsigned long busy_start = 0;
    
            /* Optimise the no-wait case */
            if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                    pt->_qproc = NULL;
                    timed_out = 1;
            }
    
            if (end_time && !timed_out)
                    slack = select_estimate_accuracy(end_time);  // estimate of the wait accuracy (slack), returned in nanoseconds
    
            for (;;) {
                    struct poll_list *walk;
                    bool can_busy_loop = false;
    
                    for (walk = list; walk != NULL; walk = walk->next) {
                            struct pollfd * pfd, * pfd_end;
    
                            pfd = walk->entries;
                            pfd_end = pfd + walk->len;
                            for (; pfd != pfd_end; pfd++) {  // iterate over every struct pollfd; do_pollfd() handles a single fd
                                    /*
                                     * Fish for events. If we found one, record it
                                     * and kill poll_table->_qproc, so we don't
                                     * needlessly register any other waiters after
                                     * this. They'll get immediately deregistered
                                     * when we break out and return.
                                     */
                                    if (do_pollfd(pfd, pt, &can_busy_loop,
                                                  busy_flag)) {
                                            count++;
                                            pt->_qproc = NULL;
                                            /* found something, stop busy polling */
                                            busy_flag = 0;
                                            can_busy_loop = false;
                                    }
                            }
                    }
                    /*
                     * All waiters have already been registered, so don't provide
                     * a poll_table->_qproc to them on the next loop iteration.
                     */
                    pt->_qproc = NULL;
                    /* nothing ready: report a wait-queue error or a pending signal */
                    if (!count) {
                            count = wait->error;
                            if (signal_pending(current))
                                    count = -EINTR;
                    }
                    if (count || timed_out)
                            break;
    
                    /* only if found POLL_BUSY_LOOP sockets && not out of time */
                    if (can_busy_loop && !need_resched()) {
                            if (!busy_start) {
                                    busy_start = busy_loop_current_time();
                                    continue;
                            }
                            if (!busy_loop_timeout(busy_start))
                                    continue;
                    }
                    busy_flag = 0;
    
                    /*
                     * If this is the first loop and we have a timeout
                     * given, then we convert to ktime_t and set the to
                     * pointer to the expiry value.
                     */
                    if (end_time && !to) {
                            expire = timespec64_to_ktime(*end_time);
                            to = &expire;
                    }
    
                    if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))  // schedule away until woken or timed out
                            timed_out = 1;
            }
            return count;
    }
    

    这个函数写的很清楚了,也有很多注释

    1. can_busy_loop 是和 CONFIG_NET_RX_BUSY_POLL 配置相关的,不算通用处理情况,先忽略不考虑
    2. count 为函数的返回值,在 do_pollfd 有返回匹配的掩码时递增,为就绪的文件描述符数量,无就绪文件的时候为等待队列中的错误码
    3. pt->_qproc 为文件poll操作调用的函数,= NULL 的操作在注释中已经说明,函数已经注册到队列中,不必再次注册. 这个函数相关的内容可以在另外一篇 select(2) 找到具体的说明
    /*
     * Fish for events. If we found one, record it and kill poll_table->_qproc, so we don't
     * needlessly register any other waiters after this. They'll get immediately deregistered
     * when we break out and return.
     */
    
    /*
     * All waiters have already been registered, so don't provide a poll_table->_qproc to them on the next loop iteration.
     */
    

    do_pollfd()

    /*
     * Fish for pollable events on the pollfd->fd file descriptor. We're only
     * interested in events matching the pollfd->events mask, and the result
     * matching that mask is both recorded in pollfd->revents and returned. The
     * pwait poll_table will be used by the fd-provided poll handler for waiting,
     * if pwait->_qproc is non-NULL.
     *
     * can_busy_poll is set to true when the file's poll handler returns a mask
     * containing busy_flag. A negative pollfd->fd yields a zero mask; an fd
     * for which fdget() finds no file yields EPOLLNVAL.
     */
    static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                         bool *can_busy_poll,
                                         __poll_t busy_flag)
    {
            __poll_t mask;
            int fd;
    
            mask = 0;
            fd = pollfd->fd;
            if (fd >= 0) {
                    struct fd f = fdget(fd);
                    mask = EPOLLNVAL;  // 0x20
                    if (f.file) {
                            /* userland u16 ->events contains POLL... bitmap */
                            // build the set of events of interest (errors and hang-ups are always included)
                            __poll_t filter = demangle_poll(pollfd->events) |
                                                    EPOLLERR | EPOLLHUP;
                            mask = DEFAULT_POLLMASK;  // (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
                            if (f.file->f_op->poll) {
                                    pwait->_key = filter;
                                    pwait->_key |= busy_flag;  // the key is used later by the wake-up function
                                    mask = f.file->f_op->poll(f.file, pwait);  // get the file's ready-event mask
                                    if (mask & busy_flag)
                                            *can_busy_poll = true;
                            }
                            /* Mask out unneeded events. */
                            mask &= filter;  // AND the file's event mask with the events of interest to get the ready events of interest
                            fdput(f);
                    }
            }
            /* ... and so does ->revents */
            pollfd->revents = mangle_poll(mask);  // store the ready mask
    
            return mask;
    }
    

    讨论在不考虑错误的情况下,
    poll(2) 返回的是revents 非 0 的个数,在 do_pollfd() 中返回一个非 0 的 mask,poll(2) 返回的 count 就 +1。
    mask = 0 有两种可能:

    1. 和 filter 做与运算,但是这样做有一个前提就是可以取到 fd
    2. fd < 0,这种属于无意义的fd了,属于用户的问题

    在已了解的fd中: eventfd 和普通的文件poll函数返回情况

    • EPOLLIN 或者 EPOLLOUT 或两个都存在
    • (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

    当关注的事件不在以上事件中,是可能返回 0,而count不增加的

    struct pollfd fds[n];
    rn = poll(fds, n, 0);
    for (int i = 0; i < rn; ++i)
            if (fds[i].revents ...)
    

    像上面这种操作是有风险的:poll(2) 的返回值 rn 只是 revents 非 0 的元素个数,就绪的 fd 不一定排在数组的前 rn 项,只遍历前 rn 项会漏掉下标在 rn 之后的就绪 fd。应当遍历全部 n 项并逐个检查 revents。

    mangle_poll() 设置就绪掩码

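    展开一下就绪掩码的设置函数(下面贴出的是方向相反、结构对称的 demangle_poll(),mangle_poll() 只是把 EPOLLxxx 映射回 POLLxxx)。__MAP 宏有点绕:它先取出 v 中 from 对应的位(v & from),再通过乘以 to/from 或除以 from/to 把该位挪到 to 所在的位置。这样做是因为用户态的 POLLxxx 取值由各体系结构定义,在部分体系结构上可能与内核内部的 EPOLLxxx 取值不同。在 4.17 内核(常见的 x86 平台)中 POLLIN 和 EPOLLIN 这类宏定义大小是一样的,此时映射后的值不变。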

    /*
     * Move the bit selected by `from` in `v` to the position of `to`:
     * scale (v & from) up by to/from when from < to, otherwise scale it down
     * by from/to.  Assumes `from` and `to` are single-bit (power-of-two)
     * constants, as in the POLLxxx / EPOLLxxx uses below.
     */
    #define __MAP(v, from, to) \
            (from < to ? (v & from) * (to/from) : (v & from) / (from/to))
    
    /*
     * Convert a user-space POLLxxx bitmap (u16) into the corresponding
     * __poll_t EPOLLxxx bitmap by mapping each known bit individually with
     * __MAP() and OR-ing the results together.
     */
    static inline __poll_t demangle_poll(u16 val) {
        return (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN) |
               (__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT) |
               (__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI) |
               (__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR) |
               (__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL) |
               (__force __poll_t)__MAP(val, POLLRDNORM,
                                       (__force __u16)EPOLLRDNORM) |
               (__force __poll_t)__MAP(val, POLLRDBAND,
                                       (__force __u16)EPOLLRDBAND) |
               (__force __poll_t)__MAP(val, POLLWRNORM,
                                       (__force __u16)EPOLLWRNORM) |
               (__force __poll_t)__MAP(val, POLLWRBAND,
                                       (__force __u16)EPOLLWRBAND) |
               (__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP) |
               (__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP) |
               (__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG);
    }
    

    参考

    select 源码分析,上一篇写的关于 select 的分析,有一些关于 poll 结构和文件回调的分析。

  • 相关阅读:
    Idea快捷键---根据自己使用情况持续更新
    JVM 性能监控 工具
    redis ---RDB 和 AOF 持久策略对比
    数组、链表等常用数据结构和集合浅解(java)
    关于界面刷新嵌套展示(form标签 target 属性)问题
    对象是否存在的判定方法
    数据库大量插入数据的sql 优化
    Java集合之LinkedList
    Java集合类之ArrayList
    Java并发程序基础
  • 原文地址:https://www.cnblogs.com/shuqin/p/11662645.html
Copyright © 2020-2023  润新知