• Linux内核源码—CFS调度(4.20.17)


     cfs_rq

    每个 cpu 都有一个对应的运行队列 rq,在 rq 中维护着不同调度策略的调度队列。

    struct rq {
            ...
        struct cfs_rq       cfs;
        struct rt_rq        rt;
        struct dl_rq        dl;
            ...   
    };
    

    cfs的调度队列通过红黑树维护,在 cfs_rq 的数据结构中,struct rb_root_cached   tasks_timeline 包含了红黑树 struct rb_root rb_root 和 最左节点缓存 struct rb_node *rb_leftmost(最左节点没有左孩子,但不一定是叶子节点)。

    /*
     * Run queue of the CFS (fair) scheduling class. Runnable scheduling
     * entities are kept in the 'tasks_timeline' red-black tree.
     */
    struct cfs_rq {
    	struct load_weight	load;  // load weight of this CFS run queue
    	unsigned long		runnable_weight;
    	unsigned int		nr_running;
    	/*
    	 * NOTE(review): presumably the runnable-task count across the whole
    	 * group hierarchy; pick_next_task() compares it with rq->nr_running
    	 * to detect "all tasks are in the fair class" -- confirm.
    	 */
    	unsigned int		h_nr_running;
    
    	u64			exec_clock;
    	u64			min_vruntime;
    #ifndef CONFIG_64BIT
    	u64			min_vruntime_copy;
    #endif
    
    	struct rb_root_cached	tasks_timeline;  // red-black tree holding the scheduling entities (with a cached leftmost node)
    
    	/*
    	 * 'curr' points to currently running entity on this cfs_rq.
    	 * It is set to NULL otherwise (i.e when none are currently running).
    	 */
    	struct sched_entity	*curr;  // scheduling entity that is currently running
    	struct sched_entity	*next;  // scheduling entity that should run next
    	/*
    	 * NOTE(review): not the tail of the queue. In upstream fair.c this
    	 * appears to be the "last buddy", i.e. the entity that was running
    	 * before it got preempted -- confirm against set_last_buddy().
    	 */
    	struct sched_entity	*last;
    	struct sched_entity	*skip;  // scheduling entity to be skipped
    
    #ifdef	CONFIG_SCHED_DEBUG
    	unsigned int		nr_spread_over;
    #endif
    
    #ifdef CONFIG_SMP
    	/*
    	 * CFS load tracking
    	 */
    	struct sched_avg	avg;
    #ifndef CONFIG_64BIT
    	u64			load_last_update_time_copy;
    #endif
    	struct {
    		raw_spinlock_t	lock ____cacheline_aligned;
    		int		nr;
    		unsigned long	load_avg;
    		unsigned long	util_avg;
    		unsigned long	runnable_sum;
    	} removed;
    
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	unsigned long		tg_load_avg_contrib;
    	long			propagate;
    	long			prop_runnable_sum;
    
    	/*
    	 *   h_load = weight * f(tg)
    	 *
    	 * Where f(tg) is the recursive weight fraction assigned to
    	 * this group.
    	 */
    	unsigned long		h_load;
    	u64			last_h_load_update;
    	struct sched_entity	*h_load_next;
    #endif /* CONFIG_FAIR_GROUP_SCHED */
    #endif /* CONFIG_SMP */
    
    #ifdef CONFIG_FAIR_GROUP_SCHED
    	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
    
    	/*
    	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
    	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
    	 * (like users, containers etc.)
    	 *
    	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
    	 * This list is used during load balance.
    	 */
    	int			on_list;
    	struct list_head	leaf_cfs_rq_list;
    	struct task_group	*tg;	/* group that "owns" this runqueue */
    
    #ifdef CONFIG_CFS_BANDWIDTH
    	int			runtime_enabled;
    	int			expires_seq;
    	u64			runtime_expires;
    	s64			runtime_remaining;
    
    	u64			throttled_clock;
    	u64			throttled_clock_task;
    	u64			throttled_clock_task_time;
    	int			throttled;
    	int			throttle_count;
    	struct list_head	throttled_list;
    #endif /* CONFIG_CFS_BANDWIDTH */
    #endif /* CONFIG_FAIR_GROUP_SCHED */
    };
    

    vruntime

    那么CFS是根据什么来对任务进行排序呢?----------》虚拟运行时间 vruntime

    update_curr 函数(/kernel/sched/fair.c)实现了 vruntime 的更新,其步骤是计算出当前进程本次的运行时间 delta_exec,再根据当前进程的权重对 delta_exec 进行加权运算,并把结果累加到 vruntime 上。

    /*
     * Account the time the currently running entity of @cfs_rq has executed
     * since its exec_start timestamp: add it to the entity's real runtime,
     * advance its vruntime by the weighted amount, and refresh the queue's
     * min_vruntime. Does nothing when no entity is running or no time has
     * elapsed.
     */
    static void update_curr(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *curr = cfs_rq->curr;  // currently running scheduling entity
    	u64 now = rq_clock_task(rq_of(cfs_rq));  // current time from the run queue's task clock
    	u64 delta_exec;
    
    	if (unlikely(!curr))
    		return;
    
    	delta_exec = now - curr->exec_start;  // time executed since exec_start, the entity's execution start timestamp
    	/* The signed cast also catches now < exec_start, not just a zero delta. */
    	if (unlikely((s64)delta_exec <= 0))
    		return;
    
    	/* Restart the measurement window so the next call only sees new time. */
    	curr->exec_start = now;
    
    	schedstat_set(curr->statistics.exec_max,
    		      max(delta_exec, curr->statistics.exec_max));
    
    	curr->sum_exec_runtime += delta_exec;  // accumulate the entity's total (real) execution time
    	schedstat_add(cfs_rq->exec_clock, delta_exec);
    
    	curr->vruntime += calc_delta_fair(delta_exec, curr);  // advance virtual runtime by the weight-scaled delta
    	update_min_vruntime(cfs_rq);
    
    	/*
    	 * Only when the entity is a task: emit the trace event and charge the
    	 * runtime to the task's groups (NOTE(review): going by the callee
    	 * names, its cgroup and its thread group -- confirm).
    	 */
    	if (entity_is_task(curr)) {
    		struct task_struct *curtask = task_of(curr);
    
    		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
    		cgroup_account_cputime(curtask, delta_exec);
    		account_group_exec_runtime(curtask, delta_exec);
    	}
    
    	account_cfs_rq_runtime(cfs_rq, delta_exec);
    }
    

    calc_delta_fair(delta_exec, curr) 实现了虚拟运行时间的计算:

    虚拟运行时间 = delta_exec * NICE_0_LOAD / 当前进程的权重

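    而具体在 __calc_delta 中,是通过 (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 实现的:inv_weight 是预先计算好的权重倒数(约为 2^32 / lw->weight),因此先乘以 inv_weight 再右移 WMULT_SHIFT 位就等价于除以权重,从而用乘法和移位代替了开销较大的 64 位除法(内核中本来也不使用浮点运算)。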

    从公式可以得出,如果一个进程的虚拟运行时间越小,说明实际运行的时间越少或者是进程的权重越大,那么就应该具有更高的优先度。而红黑树就是以进程的 vruntime 值为键来维护的,每次选择 vruntime 最小的进程执行,该节点是红黑树的最左节点,缓存在 struct rb_node *rb_leftmost 中。

    /*
     * Convert a real execution time @delta into the virtual-time delta for
     * @se. An entity whose weight equals NICE_0_LOAD gets @delta back
     * unchanged; any other weight is scaled through __calc_delta() with
     * NICE_0_LOAD as the reference weight.
     */
    static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
    {
    	/* Weight == NICE_0_LOAD is treated as the common case: skip the scaling. */
    	if (unlikely(se->load.weight != NICE_0_LOAD))
    		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
    
    	return delta;
    }

      

    进程选择

    在进程变为可运行状态(被唤醒)或者是通过 fork() 调用第一次创建进程时,需要将进程插入红黑树,调用 __enqueue_entity 实现这一过程。删除节点也是同样的道理。

    /*
     * Insert scheduling entity @se into the red-black tree of @cfs_rq
     * (tasks_timeline), ordered by entity_before(). Tracks whether the new
     * node ends up as the leftmost one so the cached leftmost pointer can be
     * kept up to date.
     */
    static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
    	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;  // start at the root of the red-black tree
    	struct rb_node *parent = NULL;
    	struct sched_entity *entry;
    	bool leftmost = true;  // stays true only if the descent never goes right
    
    	/*
    	 * Find the right place in the rbtree:
    	 */
    	while (*link) {
    		parent = *link;
    		entry = rb_entry(parent, struct sched_entity, run_node);  // rb_entry wraps container_of: recover the sched_entity that embeds this node
    		/*
    		 * We dont care about collisions. Nodes with
    		 * the same key stay together.
    		 */
    		if (entity_before(se, entry)) {
    			link = &parent->rb_left;
    		} else {
    			link = &parent->rb_right;
    			leftmost = false;
    		}
    	}
    
    	rb_link_node(&se->run_node, parent, link);  // attach the new node at the slot that was found
    	/*
    	 * NOTE(review): more than "setting the colour" -- per the kernel
    	 * rbtree API this rebalances the tree after insertion and, when
    	 * 'leftmost' is true, records the node as the cached leftmost; confirm.
    	 */
    	rb_insert_color_cached(&se->run_node,
    			       &cfs_rq->tasks_timeline, leftmost);
    }
    

      

    进程调度

    进程调度的主要入口点是函数 schedule()(/kernel/sched/core.c),其核心逻辑在 __schedule() 中:它通过 pick_next_task() 函数选择下一个进程,如果选出来的进程与当前运行进程不一致,则调用 context_switch() 函数进行上下文切换。

    static void __sched notrace __schedule(bool preempt)
    {
    	cpu = smp_processor_id();
    	rq = cpu_rq(cpu);
    	prev = rq->curr;  //获取当前运行进程
    
            ...
    
    	next = pick_next_task(rq, prev, &rf);
    	clear_tsk_need_resched(prev);
    	clear_preempt_need_resched();
    
    	if (likely(prev != next)) {
    		...
    		rq = context_switch(rq, prev, next, &rf);
    	} else {
    		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
    		rq_unlock_irq(rq, &rf);
    	}
    	...
    }

    pick_next_task() 函数的实现并不复杂,这里用到了一点优化:如果上一个进程 prev 属于 cfs 或 idle 调度类,并且所有的可运行进程都在 cfs 中(rq->nr_running == rq->cfs.h_nr_running),那么就可以直接调用 cfs 的 pick_next_task();否则就需要按照调度类的优先级从高到低依次尝试选择。

    /*
     * Pick the task that should run next on @rq.
     *
     * Fast path: when @prev belongs to the fair or idle class and every
     * runnable task on @rq is accounted in the CFS queue, ask the fair class
     * directly and fall back to the idle class if it has nothing.
     * Slow path: walk the scheduling classes via for_each_class() and return
     * the first task offered. A class returning RETRY_TASK restarts the walk.
     */
    static inline struct task_struct *
    pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
    {
    	const struct sched_class *class;
    	struct task_struct *p;
    
    	/*
    	 * Optimization: we know that if all tasks are in the fair class we can
    	 * call that function directly, but only if the @prev task wasn't of a
    	 * higher scheduling class, because otherwise those lose the
    	 * opportunity to pull in more work from other CPUs.
    	 */
    	if (likely((prev->sched_class == &idle_sched_class ||
    		    prev->sched_class == &fair_sched_class) &&
    		   rq->nr_running == rq->cfs.h_nr_running)) {
    
    		p = fair_sched_class.pick_next_task(rq, prev, rf);
    		/* The fair class asked for a retry: redo the pick via the full class walk. */
    		if (unlikely(p == RETRY_TASK))
    			goto again;
    
    		/* Assumes fair_sched_class->next == idle_sched_class */
    		if (unlikely(!p))
    			p = idle_sched_class.pick_next_task(rq, prev, rf);
    
    		return p;
    	}
    
    again:
    	for_each_class(class) {
    		p = class->pick_next_task(rq, prev, rf);
    		if (p) {
    			/* RETRY_TASK is a non-NULL sentinel, so it must be tested before returning p. */
    			if (unlikely(p == RETRY_TASK))
    				goto again;
    			return p;
    		}
    	}
    
    	/* The idle class should always have a runnable task: */
    	BUG();
    }
    

      

    References

    1. 【原创】(五)Linux进程调度-CFS调度器
    2. CFS调度主要代码分析一
  • 相关阅读:
    C# 自定义配置文件
    Mysql JSON字段提取某一个属性值的函数
    linux查看Java线程
    Idea书签管理器的使用
    springboot寻找property的顺序
    SpringBoot的spring-boot-starter有哪些(官方)
    SpringBoot打成jar包的配置方式
    netstat 常用参数总结
    Sentinel统计线程,QPS,RT的方式
    16. kubernetes RBAC
  • 原文地址:https://www.cnblogs.com/zyb993963526/p/15979450.html
Copyright © 2020-2023  润新知