一、概述 linux 2.6.23中采用了一个全新的调度策略CFS(Completely Fair Scheduler)来处理非实时进程。 二、主要数据结构 1.为了和原先的实时策略更好的融合,linux在实现CFS之余,还将内核的调度策略模块化,添加了新的结构体sched_class用于管理不同的调度器。 2.CFS没有用传统的调度器中时间片的概念,而是使用了新的结构体sched_entity来跟踪一个进程的运行时间。 se也可表示组调度,在此不做分析,所以这里se代表一个进程。 struct sched_entity { struct load_weight load; /* for load-balancing */ //se的权重 struct rb_node run_node; //在红黑树上的节点 unsigned int on_rq; //该se是否在rq上 u64 exec_start; //当前cfs_rq的时间,用于计算时间差 u64 sum_exec_runtime; //进程总共运行的时间,real-run time u64 vruntime; //进程的virtual-run time u64 prev_sum_exec_runtime; //进程在醒来的时间 u64 nr_migrations; }; 3.rq 三、tick中断处理 每一次进入tick中断后,会进入scheduler_tick函数来进行scheduler相关的处理: void scheduler_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; raw_spin_lock(&rq->lock); /* 更新运行队列的时间,rq->clock和rq->clock_task,可以看出在真正计算 时间的时候用的是clock_task */ update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); perf_event_task_tick(); } 由于添加了模块化的架构,如果采用的是CFS,将会跳转到函数task_tick_fair进而到函数entity_tick。 这个函数的作用是更新进程的时间数据,在判断是否被其他进程抢占。 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); } 正如函数注释所说,update_curr用来更新run-time的统计数据,系统会根据这些数据最终决定调度哪个进程。 static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) return; /* * Get the amount of time the current task was running * since the last time we changed load (this cannot * overflow on 32 bits): */ /* delta_exec为自从上次变动以来的时间。 */ delta_exec = (unsigned long)(now - curr->exec_start); if (!delta_exec) return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; } /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; //更新进程的真实运行时间 curr->sum_exec_runtime += delta_exec; /* calc_delta_fair用来将真实时间转化为虚拟时间。 进程的优先级不同,它在系统中的地位(也就是权重)也不同 进程的优先级越高,它的虚拟时间走的越慢。 */ delta_exec_weighted = calc_delta_fair(delta_exec, curr); //更新进程的虚拟运行时间 curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } 每个进程在其产生(fork)的时候,都会根据其父亲的优先级产生它的优先级和权重(sched_fork函数)。 /* * delta /= w */ static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { //如果该进程拥有nice为0的权重,这是他的虚拟时钟和真实时钟是一样速度的。 if (unlikely(se->load.weight != NICE_0_LOAD)) delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; } 从注释来看calc_delta_mine的计算公式为delta *= weight / lw,也就是说进程的权重越大,时钟走的越慢,而且是线性的。 min_vruntime是cfs的rq中的一个成员,是cfs时间的基准,在cfs中起这至关重要的作用。 自cfs产生以来,这部分的代码改动也是很频繁的。 static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; /* 由于当前运行的进程是不在红黑树上的,所以关于cfs_rq->min_vruntime的更新 必须要考虑当前的进程,以免产生不公平,这是以前的调度器所疏忽的。 如果有当前进程,就以当前进程作为基准计算 */ if (cfs_rq->curr) vruntime = cfs_rq->curr->vruntime; if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); if (!cfs_rq->curr) /* 如果没有当前进程,这个在什么时候出现? 其他策略的进程在运行时? 就不用考虑它了,就是最小的运行时间 */ vruntime = se->vruntime; else /* 如果有当前进程,还要考虑这个最小的运行时间 */ vruntime = min_vruntime(vruntime, se->vruntime); } //最后,更新cfs_rq->min_vruntime,这个值是单调增加的。 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); } 分析完update_curr函数,我们回到entity_tick函数,接着往下看。下面两行的作用是判断当前的进程是否需要被抢占。能够发生抢占首要条件是nr_running大于1。 /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; struct sched_entity *se; s64 delta; //计算curr进程的理想运行时间 ideal_runtime = sched_slice(cfs_rq, curr); //计算该进程实际运行时间 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; //如果实际运行的时间超出了它应该运行的时间,则set该进程的TIF_NEED_RESCHED标志位 if (delta_exec > ideal_runtime) { resched_task(rq_of(cfs_rq)->curr); //如注释,如果当前进程运行了足够时间,就取消它在buddy中的优先权 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); return; } /* 这里是第二个抢占条件:最小虚拟运行时间的进程和当前进程的虚拟运行时间进行比较, 如果后者比前者大了 ideal_runtime,就需要进行调度。 为什么要用虚拟时间个sched_slice产生的真实时间进行比较呢? 大了ideal_runtime又能代表什么呢? */ /* * Ensure that a task that missed wakeup preemption by a * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ if (delta_exec < sysctl_sched_min_granularity) return; se = __pick_first_entity(cfs_rq); delta = curr->vruntime - se->vruntime; if (delta < 0) return; if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } sched_slice函数用来计算一个进程时间基准(wall time),一个进程的理论运行时间是和整个cfs_rq中的进程数量和权重有关系的。 不考虑调度组的话,函数等价于: /* * We calculate the wall-time slice from the period by taking a part * proportional to the weight. * * s = p*P[w/rw] */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* __sched_period这个函数得到的是每一个进程运行一次的时间总和,公式为: p = (nr <= nl) ? l : mg * nr l :系统常数,为调度延时,就是系统所有进程运行一次的时间总和 nl :系统常数,系统活动进程的上限 nr :当前的进程数 mg :系统常数,最小的调度时间间隔 */ u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); struct load_weight *load; struct load_weight lw; load = &cfs_rq->load; //这里和上面都考虑了情况,当进程刚刚创建时,当前进程不在运行队列中,为了计算合理,就临时加上去。 if (unlikely(!se->on_rq)) { lw = cfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; } //根据权重得到属于自己的那份时间 slice = calc_delta_mine(slice, se->load.weight, load); return slice; } 四、新进程中的调度 1.sched_fork出现在copy_process中。 /* * fork()/clone()-time setup: */ void sched_fork(struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); //初始化task_struct中调度器相关的成员。 __sched_fork(p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; //确保临时的优先级的提升不会继承到新的进程中 /* * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; //如果设置了sched_reset_on_fork,会恢复默认调度策略 /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { if (task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); //根据优先级和调度策略设置权重 set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: */ p->sched_reset_on_fork = 0; } //如果不是实时进程,就用CFS调度 if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; if (p->sched_class->task_fork) p->sched_class->task_fork(p); ....... } 进入CFS中的task_fork_fair函数。 /* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); //更新rq的时钟 update_rq_clock(rq); //更新cfs_rq的统计数据 update_curr(cfs_rq); //子进程虚拟时间以 父进程的虚拟时间为基准 if (curr) se->vruntime = curr->vruntime; //调整子进程虚拟时间 place_entity(cfs_rq, se, 1); /* 参数sysctl_sched_child_runs_first强制子进程在父进程之前运行。 entity_before(curr, se)判断是否需要进行虚拟运行时间的对调, 只有父进程的虚拟运行时间小于子进程的虚拟运行时间时才需要此操作。 */ if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); resched_task(rq->curr); } /* 将vruntime减去一个cfs_rq->min_vruntime,使得vruntime变成一个相对的时间, 这样做好像是为了在SMP的情况下有一个标准的接口,这个我还不太清楚, 有机会再研究 */ se->vruntime -= cfs_rq->min_vruntime; raw_spin_unlock_irqrestore(&rq->lock, flags); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { //基准为min_vruntime u64 vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a * little, place the new task so that it fits in the slot that * stays open at the end. */ /* 新进程开始的时候,基准值要比min_vruntime稍微慢一些。 */ if (initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ if (!initial) { /* 这里是睡眠进程唤醒时的时间处理。 因为进程休眠了,se->vruntime肯定很小,但是如果太小 的话,又会抢其他进程运行的机会。所以这里采用了 cfs_rq->min_vruntime - thresh。即保证了它调度的优先权, 又不至于太小,影响其他进程。 */ unsigned long thresh = sysctl_sched_latency; /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ if (sched_feat(GENTLE_FAIR_SLEEPERS)) thresh >>= 1; vruntime -= thresh; } //这里保证vruntime只能往大了调整 /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); se->vruntime = vruntime; } 2.创建新的进程的另一个调度相关的函数是wake_up_new_task,作用见下面注释。 /* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); //添加进程到运行队列中 activate_task(rq, p, 0); p->on_rq = 1; //判断新的进程是否能抢占当前进程 check_preempt_curr(rq, p, WF_FORK); task_rq_unlock(rq, p, &flags); } activate_task最终会调到CFS中的函数enqueue_task_fair /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; //如果已经在运行队列中,什么也不做就返回 if (se->on_rq) return; enqueue_entity(cfs_rq, se, flags); cfs_rq->h_nr_running++; } static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ /* 如果是新创建的进程,可以看到前面减去了一个cfs_rq->min_vruntime, 这里再加回来。 */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); if (flags & ENQUEUE_WAKEUP) { //如果是进程唤醒时的操作(try_to_wake_up),就会进这里 place_entity(cfs_rq, se, 0); } account_entity_enqueue(cfs_rq, se); //添加进程到红黑树 if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; } static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { //添加进程的权重到cfs的权重中 update_load_add(&cfs_rq->load, se->load.weight); //添加进程的权重到rq的权重中 inc_cpu_load(rq_of(cfs_rq), se->load.weight); //cfs_rq的进程数量加一 cfs_rq->nr_running++; } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { const struct sched_class *class; if (p->sched_class == rq->curr->sched_class) { rq->curr->sched_class->check_preempt_curr(rq, p, flags); } else { for_each_class(class) { if (class == rq->curr->sched_class) break; if (class == p->sched_class) { resched_task(rq->curr); break; } } } /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } 判断当前进程是否能被进程p抢占 /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; / /要唤醒的进程和当前运行的进程是同一个,就返回 if (unlikely(se == pse)) return; /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. * * Note: this also catches the edge-case of curr being in a throttled * group (e.g. via set_curr_task), since update_curr() (in the * enqueue of curr) will have resulted in resched being set. This * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ //如果调度位已经设置,就不再往下在走了 if (test_tsk_need_resched(curr)) return; /* Idle tasks are by definition preempted by non-idle tasks. */ //下面这种情况是一定需要抢占的。 if (unlikely(curr->policy == SCHED_IDLE) && likely(p->policy != SCHED_IDLE)) goto preempt; /* * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ //如果不是使用CFS,就返回 if (unlikely(p->policy != SCHED_NORMAL)) return; update_curr(cfs_rq_of(se)); //判断是否需要抢占的核心函数 if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is * triggering this preemption. */ //last指向最后一个别调度出去的进程。 if (!next_buddy_marked) set_next_buddy(pse); goto preempt; } return; preempt: resched_task(curr); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved * with schedule on the ->pre_schedule() or idle_balance() * point, either of which can * drop the rq lock. * * Also, during early boot the idle thread is in the fair class, * for obvious reasons its a bad idea to schedule back to it. */ if (unlikely(!se->on_rq || curr == rq->idle)) return; if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) set_last_buddy(se); } /* * Should 'se' preempt 'curr'. * * |s1 * |s2 * |s3 * g * |<--->|c * * w(c, s1) = -1 * w(c, s2) = 0 * w(c, s3) = 1 * */ static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { //计算两个虚拟时间之差 s64 gran, vdiff = curr->vruntime - se->vruntime; //如果se的虚拟时间比curr还大,说明本该curr执行,无需抢占 if (vdiff <= 0) return -1; /* gran为需要抢占的时间差,只有两个时间差大于需要抢占的时间差, 才需要抢占,这里避免了太频繁的抢占。 */ gran = wakeup_gran(curr, se); if (vdiff > gran) return 1; return 0; } 五、schedule函数 1.如果需要调度的话就有将当前进程从运行队列中清出去,这里调用函数deactivate_task,flags为DEQUEUE_SLEEP。 /* * deactivate_task - remove a task from the runqueue. */ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; dequeue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { //更新rq时间 update_rq_clock(rq); p->sched_class->dequeue_task(rq, p, flags); } CFS中,调用函数dequeue_task_fair /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); cfs_rq->h_nr_running--; } static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { //更新一些统计数据 } //如果进程退出,清掉buddy里的指针 clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; //account_entity_enqueue的相反操作 account_entity_dequeue(cfs_rq, se); /* 还是为了SMP */ /* * Normalize the entity after updating the min_vruntime because the * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; update_min_vruntime(cfs_rq); } 2.忽略SMP,第二个操作就是put_prev_task static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } /* * Account for a descheduled task: */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; cfs_rq = cfs_rq_of(se); put_prev_entity(cfs_rq, se); } static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { if (prev->on_rq) { /* 如下面解释,被调度的进程有可能还是在运行队列上的,例如在 调度的时候收到了一个信号。 */ /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ update_curr(cfs_rq); /* Put 'current' back into the tree. */ /* 正在运行进程,是不在红黑树中的,set_next_entity有这个处理, 运行的进程并不在红黑树中移动。 所以既然不运行了,还是要添加到运行队列中去。 */ __enqueue_entity(cfs_rq, prev); } cfs_rq->curr = NULL; } 3.pick_next_task选择下一个要运行的进程 /* * Pick up the highest-prio task: */ static inline struct task_struct * pick_next_task(struct rq *rq) { const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ //如果所有的进程都是CFS的进程,就用这个CFS的方法处理 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; } } CFS中的选择下一个进程的函数: static struct task_struct *pick_next_task_fair(struct rq *rq) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; //如果没有进程在运行队列中,就返回NULL if (!cfs_rq->nr_running) return NULL; se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); p = task_of(se); return p; } /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups * 2) pick the "next" process, since someone really wants that to run * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { //取下红黑树中最坐边的节点 struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; /* 如果这个进程是被skip的(例如利用sched_yield函数), 且如果下一个进程和第一个进程虚拟时间没有差得太远, 就选择下一个进程运行 */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { struct sched_entity *second = __pick_next_entity(se); if (second && wakeup_preempt_entity(second, left) < 1) se = second; } /* * Prefer last buddy, try to return the CPU to a preempted task. */ //最有被调度出去的进程有优先执行权 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; clear_buddies(cfs_rq, se); return se; } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* 'current' is not kept within the tree. */ if (se->on_rq) { //从这里看出cfs_rq->curr并不在红黑树中 __dequeue_entity(cfs_rq, se); } //设置exec_start为cfs_rq的clock_task,重新开始计时 update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; //记录下这个时间到prev_sum_exec_runtime se->prev_sum_exec_runtime = se->sum_exec_runtime; } 六、唤醒进程 /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual * re-schedule is in progress), and as such you're allowed to do * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int success = 0; raw_spin_lock_irqsave(&p->pi_lock, flags); //如果需要唤醒的进程的状态不是唤醒程序的状态,就什么都不做 if (!(p->state & state)) goto out; success = 1; /* we're going to change ->state */ //如果进程一直都在rq上,就做一次远程唤醒 if (p->on_rq && ttwu_remote(p, wake_flags)) goto out; ttwu_queue(p, 0); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } 如下面注释所说,这是一次轻量级的唤醒,比如在进入schedule的时候,检测到了一个信号,这个进程就不会从rq上拿出。 如果是这种情况,处理起来就相对简单了,仅仅是检查一下进程是否能够抢占当前进程,再将进程的状态设为TASK_RUNNING。 /* * Called in case the task @p isn't fully descheduled from its runqueue, * in this case we must do a remote wakeup. Its a 'light' wakeup though, * since all we need to do is flip p->state to TASK_RUNNING, since * the task is still ->on_rq. */ static int ttwu_remote(struct task_struct *p, int wake_flags) { struct rq *rq; int ret = 0; rq = __task_rq_lock(p); if (p->on_rq) { ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } __task_rq_unlock(rq); return ret; } /* * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; } static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); raw_spin_unlock(&rq->lock); } static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) { ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); //检查一下进程是否能够抢占当前进程,再将进程的状态设为TASK_RUNNING。 ttwu_do_wakeup(rq, p, wake_flags); } static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = 1; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) wq_worker_waking_up(p, cpu_of(rq)); } 附: [PATCH 11/12] sched: Remove the cfs_rq dependency from set_task_cpu() From: Peter Zijlstra Date: Wed Dec 16 2009 - 12:08:23 EST Next message: Uwe Kleine-König: "Re: [PATCH 4/7] can/at91: don't check platform_get_irq's returnvalue against zero" Previous message: Peter Zijlstra: "[PATCH 06/12] sched: Ensure set_task_cpu() is never called on blocked tasks" In reply to: tip-bot for Ingo Molnar: "[tip:sched/urgent] sched: Make warning less noisy" Next in thread: tip-bot for Peter Zijlstra: "[tip:sched/urgent] sched: Remove the cfs_rq dependency from set_task_cpu()" Messages sorted by: [ date ] [ thread ] [ subject ] [ author ] In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to substract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we loose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock. We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge. This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime. Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> --- include/linux/sched.h | 2 +- kernel/sched.c | 6 +----- kernel/sched_fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 46 insertions(+), 12 deletions(-) Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10109,7 +10105,7 @@ void sched_move_task(struct task_struct #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) Index: linux-2.6/kernel/sched_fair.c =================================================================== --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, str curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, stru se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, st __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *r #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_s resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ static void set_curr_task_fair(struct rq } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sch .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -1116,7 +1116,7 @@ struct sched_class { struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*moved_group) (struct task_struct *p); + void (*moved_group) (struct task_struct *p, int on_rq); #endif };