• epoll oneshot


    /* Epoll private bits inside the event mask */
    #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

    The main things to look at here are the sources of the thundering herd:

    1. the wake_up on the socket's own wait queue;

    2. the wake_up inside epoll_wait.

    When data becomes ready, sk_data_ready is called to wake up the waiting processes, and the wakeup path chooses to wake only one of them: the nr_exclusive passed to __wake_up_common (shown below) is 1.
      

    (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive

    Since the nr_exclusive passed in is 1, the first woken waiter whose flags contain WQ_FLAG_EXCLUSIVE decrements it to zero and the loop breaks: exactly one exclusive waiter gets woken.

    epoll_create() before fork()ing the child processes

    All processes share one epfd, so even though the data-ready wakeup uses nr_exclusive = 1 and wakes only a single process, which one gets woken?

    In other words, when a connection arrives we need to pick one process to accept() it, and at that point any process will do. Once the connection is established, though, the subsequent read/write events become tied to a particular process: if a request set up its connection in process A, the later reads and writes should also be handled by process A.

    So when a read/write event fires, which process should be notified? epoll has no way to know, so the event may well be delivered to the wrong process. A minimal sketch of this setup follows.
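    The sketch below is an illustrative assumption (not code from the post) of the shared-epfd layout: epoll_create1() is called once, then every fork()ed child waits on the same eventpoll object, so the kernel rather than the application decides which child a given wakeup reaches.

        #include <sys/epoll.h>
        #include <unistd.h>

        int main(void)
        {
            int epfd = epoll_create1(0);        /* one epfd, inherited by every child */
            /* ... EPOLL_CTL_ADD the listening fd to epfd here ... */

            for (int i = 0; i < 4; i++) {
                if (fork() == 0) {              /* child: shares epfd with its siblings */
                    struct epoll_event ev;
                    for (;;) {
                        /* any child may be woken for any event on the shared epfd */
                        epoll_wait(epfd, &ev, 1, -1);
                        /* ... accept()/read()/write() here ... */
                    }
                }
            }
            for (;;)
                pause();                        /* parent just waits */
        }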

    epoll_create() after fork()ing the child processes

    Each process registers its read/write events only in its own epoll instance, so there is no contention for those.

    But what about accept()?

    Reportedly the accept thundering herd still shows up on some kernel versions and not on others.

    That depends on the kernel implementation; essentially it comes down to which flags the wakeup path sets. Of course, SO_REUSEPORT settles it once and for all.

    And if you wake only one process without SO_REUSEPORT, wake_up simply wakes whoever sits at the head of the wait queue, so how would you get any load balancing across processes?

    So SO_REUSEPORT is still the better choice; see the sketch below.
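    A minimal sketch of the SO_REUSEPORT alternative (an illustrative assumption, not code from the post; the port number and helper names are made up): each worker opens its own listening socket bound to the same port with SO_REUSEPORT (Linux 3.9+) and polls it through its own epoll instance, so the kernel load-balances new connections across the workers and no wait queue is shared.

        #include <netinet/in.h>
        #include <string.h>
        #include <sys/epoll.h>
        #include <sys/socket.h>
        #include <unistd.h>

        /* Hypothetical helper: per-worker listening socket on a shared port. */
        static int make_reuseport_listener(unsigned short port)
        {
            int one = 1;
            struct sockaddr_in addr;
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0)
                return -1;
            /* every worker binds the same ip:port; the kernel spreads connections */
            setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

            memset(&addr, 0, sizeof(addr));
            addr.sin_family = AF_INET;
            addr.sin_addr.s_addr = htonl(INADDR_ANY);
            addr.sin_port = htons(port);
            if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 || listen(fd, 128) < 0) {
                close(fd);
                return -1;
            }
            return fd;
        }

        static void worker(unsigned short port)
        {
            int lfd = make_reuseport_listener(port);
            int epfd = epoll_create1(0);        /* per-worker epoll instance */
            struct epoll_event ev;

            ev.events = EPOLLIN;
            ev.data.fd = lfd;
            epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

            for (;;) {
                struct epoll_event out;
                if (epoll_wait(epfd, &out, 1, -1) == 1 && out.data.fd == lfd) {
                    int cfd = accept(lfd, NULL, NULL);
                    if (cfd >= 0)
                        close(cfd);             /* handle the connection here */
                }
            }
        }

        int main(void)
        {
            for (int i = 0; i < 4; i++)
                if (fork() == 0)
                    worker(8080);               /* each child is an independent worker */
            for (;;)
                pause();
        }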

     static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                 int nr_exclusive, int wake_flags, void *key)
     {
         wait_queue_t *curr, *next;
     
         list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
             unsigned flags = curr->flags;
     
             if (curr->func(curr, mode, wake_flags, key) &&
                     (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                 break;
         }
     }
    
    void __wake_up(wait_queue_head_t *q, unsigned int mode,
                int nr_exclusive, void *key)
    {
        unsigned long flags;
    
        spin_lock_irqsave(&q->lock, flags);
        __wake_up_common(q, mode, nr_exclusive, 0, key);
        spin_unlock_irqrestore(&q->lock, flags);
    }
    
    
    /*
     * This is the callback that is passed to the wait queue wakeup
     * mechanism. It is called by the stored file descriptors when they
     * have events to report.
     */
    static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
    {
        int pwake = 0;
        unsigned long flags;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
    
        spin_lock_irqsave(&ep->lock, flags);
    
        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
            goto out_unlock;
    
        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for "key" != NULL before the event match test.
         */
        if (key && !((unsigned long) key & epi->event.events))
            goto out_unlock;
    
        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
            if (epi->next == EP_UNACTIVE_PTR) {
                epi->next = ep->ovflist;
                ep->ovflist = epi;
            }
            goto out_unlock;
        }
    
        /* If this file is already in the ready list we exit soon */
        if (!ep_is_linked(&epi->rdllink))
            list_add_tail(&epi->rdllink, &ep->rdllist);
    
        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    
    out_unlock:
        spin_unlock_irqrestore(&ep->lock, flags);
    
        /* We have to call this outside the lock */
        if (pwake)
            ep_poll_safewake(&ep->poll_wait);
    
        return 1;
    }
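    /*
     * Below is a newer kernel version of ep_poll_callback: it adds POLLFREE
     * handling and EPOLLEXCLUSIVE support, where for items registered with
     * EPOLLEXCLUSIVE the return value (ewake) tells __wake_up_common whether
     * this wakeup should count against nr_exclusive.
     */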
    
    
    
    
    static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
    {
        int pwake = 0;
        unsigned long flags;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
        int ewake = 0;
    
        if ((unsigned long)key & POLLFREE) {
            ep_pwq_from_wait(wait)->whead = NULL;
            /*
             * whead = NULL above can race with ep_remove_wait_queue()
             * which can do another remove_wait_queue() after us, so we
             * can't use __remove_wait_queue(). whead->lock is held by
             * the caller.
             */
            list_del_init(&wait->task_list);
        }
    
        spin_lock_irqsave(&ep->lock, flags);
    
        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
            goto out_unlock;
    
        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for "key" != NULL before the event match test.
         */
        if (key && !((unsigned long) key & epi->event.events))
            goto out_unlock;
    
        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
            if (epi->next == EP_UNACTIVE_PTR) {
                epi->next = ep->ovflist;
                ep->ovflist = epi;
                if (epi->ws) {
                    /*
                     * Activate ep->ws since epi->ws may get
                     * deactivated at any time.
                     */
                    __pm_stay_awake(ep->ws);
                }
    
            }
            goto out_unlock;
        }
    
        /* If this file is already in the ready list we exit soon */
        if (!ep_is_linked(&epi->rdllink)) {
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake_rcu(epi);
        }
    
        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
            if ((epi->event.events & EPOLLEXCLUSIVE) &&
                        !((unsigned long)key & POLLFREE)) {
                switch ((unsigned long)key & EPOLLINOUT_BITS) {
                case POLLIN:
                    if (epi->event.events & POLLIN)
                        ewake = 1;
                    break;
                case POLLOUT:
                    if (epi->event.events & POLLOUT)
                        ewake = 1;
                    break;
                case 0:
                    ewake = 1;
                    break;
                }
            }
            wake_up_locked(&ep->wq);
        }
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    
    out_unlock:
        spin_unlock_irqrestore(&ep->lock, flags);
    
        /* We have to call this outside the lock */
        if (pwake)
            ep_poll_safewake(&ep->poll_wait);
    
        if (epi->event.events & EPOLLEXCLUSIVE)
            return ewake;
    
        return 1;
    }
    /*
     * This is the callback that is used to add our wait queue to the
     * target file wakeup lists.
     */
    static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                     poll_table *pt)
    {
        struct epitem *epi = ep_item_from_epqueue(pt);
        struct eppoll_entry *pwq;
    
        if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
            init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
            pwq->whead = whead;
            pwq->base = epi;
            if (epi->event.events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(whead, &pwq->wait);
            else
                add_wait_queue(whead, &pwq->wait);
            list_add_tail(&pwq->llink, &epi->pwqlist);
            epi->nwait++;
        } else {
            /* We have to signal that an error occurred */
            epi->nwait = -1;
        }
    }

    Depending on whether EPOLLEXCLUSIVE is set, the item is put on the target wait queue with add_wait_queue_exclusive() or with plain add_wait_queue().

    At wake_up time, nr_exclusive limits how many exclusive waiters get woken, but for __wake_up_common to break out of its loop, ep_poll_callback also has to return true; that is exactly what the ewake logic above is for.
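    From user space, EPOLLEXCLUSIVE (available since Linux 4.5) is just another bit in the event mask passed with EPOLL_CTL_ADD. A minimal, hypothetical sketch (the helper name is made up, not from the post):

        #include <sys/epoll.h>

        /* Hypothetical helper: register fd on epfd for exclusive EPOLLIN wakeups,
         * so ep_ptable_queue_proc() will use add_wait_queue_exclusive() for it. */
        static int add_exclusive(int epfd, int fd)
        {
            struct epoll_event ev;

            ev.events = EPOLLIN | EPOLLEXCLUSIVE;   /* EPOLLEXCLUSIVE only works with EPOLL_CTL_ADD */
            ev.data.fd = fd;
            return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
        }

    If several worker processes each register the same listening fd this way in their own epoll instances, a new connection wakes only one of them instead of all.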

    As for EPOLLONESHOT:

    After epoll_wait, when the ready fds are handed back to user space (in ep_send_events_proc), a one-shot item has its poll bits stripped so that it is not re-queued and cannot cause further chained wakeups:

        if (epi->event.events & EPOLLONESHOT)
            /* only EP_PRIVATE_BITS remain: the fd is disabled until EPOLL_CTL_MOD */
            epi->event.events &= EP_PRIVATE_BITS;
        else if (!(epi->event.events & EPOLLET)) {
            /* level-triggered: put the item back on the ready list */
            list_add_tail(&epi->rdllink, &ep->rdllist);
            ep_pm_stay_awake(epi);
        }

    ep_poll_callback also keeps checking EPOLLONESHOT, to prevent the following: after epoll_wait returns and while the data is still being processed, the fd becomes ready again and, in a multithreaded program, another thread is woken to handle the same fd, so processing ends up out of order! This is the check at the top of the callback:


        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
            goto out_unlock;
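    For completeness, a hypothetical user-space sketch (an assumption, not code from the post) of how a multithreaded worker typically uses EPOLLONESHOT: the thread that received the event drains the fd and then explicitly re-arms it with EPOLL_CTL_MOD, which restores the poll bits the kernel cleared above.

        #include <sys/epoll.h>
        #include <unistd.h>

        /* Hypothetical handler run by whichever thread epoll_wait() woke up. */
        static void handle_ready_fd(int epfd, int fd)
        {
            char buf[4096];

            /* drain what is available; no other thread will be woken for this
             * fd while its poll bits are cleared by the EPOLLONESHOT path */
            while (read(fd, buf, sizeof(buf)) > 0)
                ;

            /* re-arm: EPOLL_CTL_MOD puts EPOLLIN back into epi->event.events */
            struct epoll_event ev;
            ev.events = EPOLLIN | EPOLLONESHOT;
            ev.data.fd = fd;
            epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
        }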

  • Original article: https://www.cnblogs.com/codestack/p/13040907.html