等待队列在内核中很有用途,尤其用在中断处理、进程同步及定时。
等待队列实现了在事件上的条件等待:希望等待特定事件的进程把自己放进合适的等待队列,并放弃控制权。因为,等待队列表示一组睡眠的进程,当某一条件变为真时,由内核唤醒他们。
1.每个等待队列都有一个等待队列头(wait queue head),等待队列头是一个类型为wait_queue_head_t的数据结构:
struct __wait_queue_head { spinlock_t lock; struct list_head task_list; }; typedef struct __wait_queue_head wait_queue_head_t;
因为等待队列是由中断处理程序和主要内核函数修改的,因此必须对其双向链表进行保护以免对其进行同时访问,因为同时访问会导致不可预测的后果。同步是通过等待队列头中的lock自旋锁达到的。task_list字段是等待进程链表的头。
2.等待队列链表中的元素类型为wait_queue_t:
struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; }; typedef struct __wait_queue wait_queue_t;
flags为1表示睡眠进程是互斥进程,由内核有选择的唤醒。
flags为0表示睡眠进程是非互斥进程,由内核在等待条件成立时唤醒。
WQ_FLAG_EXCLUSIVE用于赋给flgas,表示该睡眠进程是一个互斥进程。
private是指向进程描述符的指针,用于指向struct task_struct类型的进程描述符p/current。
func表示等待队列中睡眠进程应该用什么方式唤醒。
task_list表示要插入等待队列头的链表指针。
3.DECLARE_WAIT_QUEUE_HEAD
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { .lock = __SPIN_LOCK_UNLOCKED(name.lock), .task_list = { &(name).task_list, &(name).task_list } } #define DECLARE_WAIT_QUEUE_HEAD(name) wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
DECLARE_WAIT_QUEUE_HEAD(name)用来声明并初始化一个等待队列头,如:
DECLARE_WAIT_QUEUE_HEAD(wq_head);
4.DECLARE_WAITQUEUE
#define __WAITQUEUE_INITIALIZER(name, tsk) { .private = tsk, .func = default_wake_function, .task_list = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
DECLARE_WAITQUEUE(name, tsk)用来声明并初始化一个等待队列项,如:
DECLARE_WAITQUEUE(wq, current);
5.init_waitqueue_head
void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } #define init_waitqueue_head(q) do { static struct lock_class_key __key; __init_waitqueue_head((q), &__key); } while (0)
init_waitqueue_head(q)可以用来初始化动态分配的等待队列头变量,如:
wait_queue_head_t wq_head;
init_waitqueue_head(wq_head);
6.init_waitqueue_entry
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; q->private = p; q->func = default_wake_function; }
init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)用于初始化一个等待队列项,即把一个进程传递给一个wait_queue_t类型的变量,作用同
DECLARE_WAITQUEUE(name, tsk)一样,只不过前者是初始化动态分配的变量,后者只需传递一个等待队列项的名字和进程即可,使用如下:
wait_queue_t wq;
init_waitqueue_entry(&wq, current);
7.init_waitqueue_func_entry
static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; q->private = NULL; q->func = func; }
init_waitqueue_func_entry用于初始化等待队列项的自定义唤醒函数。
8.add_wait_queue
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) { list_add(&new->task_list, &head->task_list); } void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; wait->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, wait); spin_unlock_irqrestore(&q->lock, flags); }
add_wait_queue的作用是将一个等待队列项加入到等待队列头中。如将上面3,4或5,6的wq加入到wq_head:
add_wait_queue(&wq_head, &wq);
9.add_wait_queue_exclusive
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); __add_wait_queue_tail(q, wait); spin_unlock_irqrestore(&q->lock, flags); }
add_wait_queue_exclusive的作用是将一个互斥进程的等待队列项加入等待队列头,可以看出其与add_wait_queue函数体的不同之处在于
wait->flags |= WQ_FLAG_EXCLUSIVE;add_wait_queue中是将flags的最低位清0.
10.remove_wait_queue
static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { list_del(&old->task_list); } void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); __remove_wait_queue(q, wait); spin_unlock_irqrestore(&q->lock, flags); }
remove_wait_queue的作用是从等待队列中删除一个链表节点。
11.__wake_up_common与__wake_up
/* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } /** * __wake_up - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); __wake_up_common(q, mode, nr_exclusive, 0, key); spin_unlock_irqrestore(&q->lock, flags); }
通过核心函数__wake_up_common我们可以看出,当指定唤醒的互斥进程数为0(nr_exclusive == 0)时,此函数会唤醒处在同一等待队列上的所有节点。当指定唤醒的互斥进程数为非0时,此函数会唤醒同一等待队列上的所有的非互斥进程和一个互斥进程。我们通常将等待相同条件的等待进程加入到同一个等待队列上。如果等待队列上有互斥进程,那么依据互斥对临界区的访问规则,我们只需唤醒其中一个互斥进程即可,唤醒多个互斥进程的结果还是会让他们因抢占临界区而进入竞态,如果等待队列上没有互斥进程,那么在执行wake_up系列函数时,我们全唤醒即可。
12.wake_up系列函数
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
13.wait_event
/** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; __set_current_state(TASK_RUNNING); /* * We can check for list emptiness outside the lock * IFF: * - we use the "careful" check that verifies both * the next and prev pointers, so that there cannot * be any half-pending updates in progress on other * CPU's that we haven't seen yet (and that might * still change the stack area. * and * - all other users take the lock (ie we can only * have _one_ other CPU that looks at or modifies * the list). */ if (!list_empty_careful(&wait->task_list)) { spin_lock_irqsave(&q->lock, flags); list_del_init(&wait->task_list); spin_unlock_irqrestore(&q->lock, flags); } }
#define DEFINE_WAIT_FUNC(name, function)
wait_queue_t name = {
.private = current,
.func = function,
.task_list = LIST_HEAD_INIT((name).task_list),
}
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
/* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any * wake-function that tests for the wait-queue being active * will be guaranteed to see waitqueue addition _or_ subsequent * tests in this thread will see the wakeup having taken place. * * The spin_unlock() itself is semi-permeable and only protects * one way (it only protects stuff inside the critical region and * stops them from bleeding out - it would still allow subsequent * loads to move into the critical region). */ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; wait->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } #define __wait_event(wq, condition) do { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); if (condition) break; schedule(); } finish_wait(&wq, &__wait); } while (0) /** * wait_event - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq, condition) do { if (condition) break; __wait_event(wq, condition); } while (0)
在使用wait_event时,我们只需传入一个wait_queue_head_t类型的等待队列头wq和一个等待条件condition.
__wait_event中,我们先通过DEFINE_WAIT声明定义一个wait_queue_t类型的变量__wait,然后将当前进程current赋给等待队列项__wait变量成员的.
private.
prepare_to_wait先判断等待队列头wq是否为空,为空则将__wait加入到wq,然后将当前进程设为不可中断的等待状态TASK_UNINTERRUPTIBLE。
回到__wait_event中判断condition是否成立,不成立则让CPU调度执行其他的进程,否则退出for(;;)循环执行finish_wait,finish_wait的作用是将当前进程设为TASK_RUNNING状态,并将等待队列项从等待队列中删除。
14.wait_event系列函数
#define __wait_event_timeout(wq, condition, ret) do { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); if (condition) break; ret = schedule_timeout(ret); if (!ret) break; } finish_wait(&wq, &__wait); } while (0) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if the @timeout elapsed, and the remaining * jiffies if the condition evaluated to true before the timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) ({ long __ret = timeout; if (!(condition)) __wait_event_timeout(wq, condition, __ret); __ret; }) #define __wait_event_interruptible(wq, condition, ret) do { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); if (condition) break; if (!signal_pending(current)) { schedule(); continue; } ret = -ERESTARTSYS; break; } finish_wait(&wq, &__wait); } while (0) /** * wait_event_interruptible - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq, condition) ({ int __ret = 0; if (!(condition)) __wait_event_interruptible(wq, condition, __ret); __ret; }) #define __wait_event_interruptible_timeout(wq, condition, ret) do { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); if (condition) break; if (!signal_pending(current)) { ret = schedule_timeout(ret); if (!ret) break; continue; } ret = -ERESTARTSYS; break; } finish_wait(&wq, &__wait); } while (0) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_timeout(wq, condition, timeout) ({ long __ret = timeout; if (!(condition)) __wait_event_interruptible_timeout(wq, condition, __ret); __ret; })
wait_event_timeout把wait_event的schedule()改成了schedule_timeout()。
wait_event_interruptible把wait_event的TASK_UNINTERRUPTIBLE改成了TASK_INTERRUPTIBLE。
wait_event_interruptible_timeout把wait_event的schedule()与TASK_UNINTERRUPTIBLE改成了schedule_timeout()和TASK_INTERRUPTIBLE。
以上代码均来自kernel/include/linux/wait.h, kernel/kernel/sched.c, kernel/kernel/wait.c.