本文主要分析accept()的阻塞等待和唤醒。
内核版本:3.6
Author:zhangskd @ csdn blog
等待队列
(1)socket的等待队列
/*
 * Excerpt of struct sock; only the wait-queue member is shown.
 *
 * @sk_wq: sock wait queue head and async head
 */
struct sock {
	/* other members omitted in this excerpt */
	...
	struct socket_wq __rcu *sk_wq; /* wait queue of the socket */
	/* other members omitted in this excerpt */
	...
};
/* Wait-queue container of a socket: wait queue head, fasync list and RCU head. */
struct socket_wq {
	/* Note: wait MUST be first field of socket_wq */
	wait_queue_head_t wait;			/* wait queue */
	struct fasync_struct *fasync_list;	/* asynchronous file operations */
	struct rcu_head rcu;			/* RCU head (callback used on update) */
} __cacheline_aligned_in_smp;

/* Head of a wait queue: a spinlock and the list of queued wait entries. */
struct __wait_queue_head {
	spinlock_t lock;
	struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t; /* wait queue head */
(2)进程的等待任务
/*
 * A single wait entry (one waiting task) that can be linked into a wait queue.
 *
 * NOTE(review): wait_queue_func_t is used here before the typedef below
 * appears; in a real header the typedefs would have to precede this struct.
 * TODO confirm against include/linux/wait.h.
 */
struct __wait_queue {
	unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
	void *private;			/* points to the current task's control block */
	wait_queue_func_t func;		/* wake-up function */
	struct list_head task_list;	/* links this entry into a wait queue */
};
typedef struct __wait_queue wait_queue_t;

/* Type of the wake-up callback stored in wait_queue_t.func. */
typedef int (*wait_queue_func_t) (wait_queue_t *wait, unsigned mode, int flags, void *key);

/* Default wake-up callback; declared here, defined elsewhere. */
int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
初始化等待任务。
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define DEFINE_WAIT_FUNC(name, function) wait_queue_t name = { .private = current, .func = function, .task_list = LIST_HEAD_INIT((name).task_list), } int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); /* 默认的唤醒函数 */ if (ret) list_del_init(&wait->task_list); /* 从等待队列中删除,初始化此等待任务 */ return ret; }
获取sock的等待队列。
/*
 * Return the wait queue head of socket @sk, i.e. the 'wait' member of the
 * socket_wq that sk->sk_wq points to.
 */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	struct socket_wq *wq;

	/* Build-time check that 'wait' sits at offset 0 of struct socket_wq. */
	BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);

	wq = rcu_dereference_raw(sk->sk_wq);
	return &wq->wait;
}
把等待任务加入到等待队列中,设置当前进程的状态。
/*
 * Queue @wait at the tail of wait queue @q as an exclusive waiter and set the
 * current task's state to @state. Both the enqueue and the state change happen
 * while q->lock is held with interrupts saved/disabled.
 */
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;

	/* This flag means only one waiter is woken at a time, avoiding the thundering herd */
	wait->flags |= WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	if (list_empty(&wait->task_list))
		__add_wait_queue_tail(q, wait); /* add this wait entry to the wait queue (only if not already queued) */
	set_current_state(state); /* set the state of the current task */
	spin_unlock_irqrestore(&q->lock, flags);
}

/* Append wait entry @new to the tail of the list headed by @head; takes no lock itself. */
static inline void __add_wait_queue_tail(wait_queue_head_t *head, wait_queue_t *new)
{
	list_add_tail(&new->task_list, &head->task_list);
}

/* Store @state_value into current->state via set_mb(). */
#define set_current_state(state_value) set_mb(current->state, (state_value))
(3)accept()的阻塞等待
accept()超时时间为sk->sk_rcvtimeo,在sock_init_data()中初始化为MAX_SCHEDULE_TIMEOUT,表示无限等待。
/*
 * Wait for an incoming connection, avoid race conditions.
 * This must be called with the socket locked.
 *
 * @sk:    listening socket
 * @timeo: remaining timeout, passed to schedule_timeout()
 *
 * Returns 0 once the accept queue is non-empty, -EINVAL if the socket is no
 * longer in TCP_LISTEN state, the value of sock_intr_errno(timeo) if a signal
 * is pending, or -EAGAIN when the timeout has expired.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait); /* initialize the wait entry */
	int err;

	for (; ;) {
		/* Add the wait entry to the socket's wait queue and set the task state to TASK_INTERRUPTIBLE */
		prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		release_sock(sk); /* we may be about to sleep, so release the socket lock first */
		if (reqsk_queue_empty(&icsk->icsk_accept_queue)) /* the accept (fully established) queue is empty */
			timeo = schedule_timeout(timeo); /* sleep until woken up, timed out, or a signal arrives */
		lock_sock(sk); /* re-take the socket lock after waking up */
		err = 0;
		if (! reqsk_queue_empty(&icsk->icsk_accept_queue)) /* accept queue is no longer empty: done */
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN) /* the sock is no longer listening: bail out */
			break;
		err = sock_intr_errno(timeo);
		/*
		 * Bail out if the task has a pending signal.
		 * Because timeo defaults to MAX_SCHEDULE_TIMEOUT, err is -ERESTARTSYS
		 * by default; this function is then called again, so accept() keeps
		 * blocking.
		 */
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (! timeo) /* timed out, i.e. the user-configured sk->sk_rcvtimeo elapsed */
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}
/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed. The routine
 * will return immediately unless the current task state has been set (see
 * set_current_state()).
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to pass
 * before the routine returns. The routine will return 0.
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is delivered
 * to the current task. In this case the remaining time in jiffies will be
 * returned, or 0 if the timer expired in time.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this routine
 * returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule the CPU
 * away without a bound on the timeout. In this case the return value will be
 * %MAX_SCHEDULE_TIMEOUT.
 *
 * In all cases the return value is guaranteed to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout) { /* body omitted in this excerpt */ }
因为sk->sk_rcvtimeo默认值为MAX_SCHEDULE_TIMEOUT,所以当等待被信号中断时,sock_intr_errno()返回-ERESTARTSYS,即告诉系统
在信号处理完成后重新执行accept()的系统调用;如果用户设置了超时时间(timeo不等于MAX_SCHEDULE_TIMEOUT),则返回-EINTR。
/*
 * Pick the error code to report when a wait is interrupted by a signal:
 * -ERESTARTSYS for an unbounded wait (timeo == MAX_SCHEDULE_TIMEOUT),
 * -EINTR otherwise.
 */
static inline int sock_intr_errno(long timeo)
{
	if (timeo == MAX_SCHEDULE_TIMEOUT)
		return -ERESTARTSYS;

	return -EINTR; /* Interrupted system call */
}
从等待队列中删除等待任务,把当前进程的状态置为可运行。
/**
 * finish_wait - clean up after waiting in a queue
 * @q: waitqueue waited on (the wait queue head)
 * @wait: wait descriptor (the wait entry)
 *
 * Sets current thread back to running state and removes the wait
 * descriptor from the given waitqueue if still queued.
 */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;

	__set_current_state(TASK_RUNNING);
	/* Only take q->lock when the entry still appears to be queued. */
	if (! list_empty_careful(&wait->task_list)) {
		spin_lock_irqsave(&q->lock, flags);
		list_del_init(&wait->task_list); /* remove from the wait queue and re-init the entry */
		spin_unlock_irqrestore(&q->lock, flags);
	}
}
(4)accept()的唤醒
当收到客户端的ACK后,经过如下调用:
tcp_v4_rcv
tcp_v4_do_rcv
tcp_child_process
sock_def_readable
wake_up_interruptible_sync_poll
__wake_up_sync_key
__wake_up_common
最终调用我们给等待任务注册的唤醒函数。
我们来看下accept()是如何避免惊群现象的。
/*
 * Walk wait queue @q and invoke each entry's wake-up callback.
 *
 * @q:            wait queue head to scan
 * @mode:         passed through to each entry's callback
 * @nr_exclusive: how many exclusive waiters may be woken before stopping
 * @wake_flags:   passed through to each entry's callback
 * @key:          passed through to each entry's callback
 *
 * The scan stops once the callback has succeeded for an entry that carries
 * WQ_FLAG_EXCLUSIVE and the exclusive budget drops to zero. Entries without
 * the flag, and entries whose callback returns 0, do not consume the budget.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			     int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *curr, *next;

	/* _safe variant: the callback may unlink curr from the list. */
	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		/* Snapshot the flags before invoking the callback. */
		unsigned flags = curr->flags;

		if (curr->func(curr, mode, wake_flags, key) &&
		    (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}
等待任务在prepare_to_wait_exclusive()中被加入等待队列时,flags |= WQ_FLAG_EXCLUSIVE。传入的nr_exclusive为1,表示只允许唤醒一个独占的等待任务。
所以这里只会唤醒一个等待的进程,不会导致惊群现象。