• 调度器32—RT选核 Hello


    基于Linux-5.10

    一、RT选核流程

    1. 主要调用路径

    rt_sched_class.select_task_rq //RT调度类回调
        select_task_rq_rt //rt.c 前面trace_android_rvh_select_task_rq_rt()若是选到cpu就直接退出了; 若test或cpu算力不满足时调用
            find_lowest_rq //rt.c
                trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu);

    二、select_task_rq_rt 函数

    1. 三种选核路径传参

    try_to_wake_up //core.c
        select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); //唤醒选核路径
    
    wake_up_new_task //core.c
        select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0); //fork选核路径
    
    sched_exec //core.c
        select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); //exec选核路径

    注:唤醒路径传入的 cpu 参数是 p->wake_cpu,fork/exec 路径传入的是 task_cpu(p),二者都可以理解为任务 p 上次运行的 cpu。

    /*
     * select_task_rq_rt - choose the CPU on which RT task @p should be queued.
     *
     * @p:       the task being placed
     * @cpu:     starting candidate; per the call paths listed earlier in this
     *           article it is p->wake_cpu or task_cpu(p), i.e. the CPU @p ran on last
     * @sd_flag: SD_BALANCE_WAKE / SD_BALANCE_FORK / SD_BALANCE_EXEC
     * @flags:   wake flags; only WF_SYNC is examined here
     *
     * Returns the CPU picked by the vendor hook if it set one, otherwise @cpu
     * unless the logic below replaces it with the current CPU (sync wakeup)
     * or with the result of find_lowest_rq().
     */
    static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) //rt.c
    {
        struct task_struct *curr;
        struct rq *rq;
        struct rq *this_cpu_rq;
        bool test;
        int target_cpu = -1;
        bool may_not_preempt;
        bool sync = !!(flags & WF_SYNC);
        int this_cpu;
    
        // Vendor hook: receives all arguments and may write a CPU into target_cpu.
        // The original note names mtk_select_task_rq_rt as the handler on MTK platforms.
        trace_android_rvh_select_task_rq_rt(p, cpu, sd_flag, flags, &target_cpu); //mtk_select_task_rq_rt
        if (target_cpu >= 0)
            return target_cpu;
    
        /* For anything but wake ups, just return the task_cpu */
        // Only the wakeup and fork paths continue; SD_BALANCE_EXEC returns @cpu unchanged.
        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
            goto out;
    
        rq = cpu_rq(cpu); // runqueue of the candidate (previous) CPU
    
        rcu_read_lock();
        curr = READ_ONCE(rq->curr); /* unlocked access */ // task currently running on the previous CPU
        this_cpu = smp_processor_id(); // CPU executing this code
        this_cpu_rq = cpu_rq(this_cpu); // runqueue of the CPU executing this code
    
        /*
         * If the current task on @p's runqueue is a softirq task,
         * it may run without preemption for a time that is
         * ill-suited for a waiting RT task. Therefore, try to
         * wake this RT task on another runqueue.
         *
         * Also, if the current task on @p's runqueue is an RT task, then
         * try to see if we can wake this RT task up on another
         * runqueue. Otherwise simply start this RT task
         * on its current runqueue.
         *
         * We want to avoid overloading runqueues. If the woken
         * task is a higher priority, then it will stay on this CPU
         * and the lower prio task should be moved to another CPU.
         * Even though this will probably make the lower prio task
         * lose its cache, we do not want to bounce a higher task
         * around just because it gave up its CPU, perhaps for a
         * lock?
         *
         * For equal prio tasks, we just let the scheduler sort it out.
         *
         * Otherwise, just let it ride on the affined RQ and the
         * post-schedule router will push the preempted task away
         *
         * This test is optimistic, if we get it wrong the load-balancer
         * will have to sort it out.
         *
         * We take into account the capacity of the CPU to ensure it fits the
         * requirement of the task - which is only important on heterogeneous
         * systems like big.LITTLE.
         */
        // task_may_not_preempt() is not shown here; per the original note it mainly
        // checks for several kinds of softirq work and returns false when curr can
        // be preempted. curr is the task running on @p's previous CPU.
        may_not_preempt = task_may_not_preempt(curr, cpu);
        // test is true when the previous CPU has a current task that either may not be
        // preempted, or is an RT task that is pinned (nr_cpus_allowed < 2) or whose
        // priority is higher than or equal to @p's (curr->prio <= p->prio).
        test = (curr && (may_not_preempt || (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio))));
    
        /*
         * Respect the sync flag as long as the task can run on this CPU.
         */
        // should_honor_rt_sync() is not shown here; per the original note it holds when
        // this is a sync wakeup from an RT task and the RT task running on the current
        // CPU has lower priority than @p. If so, and the current CPU is in @p's
        // affinity mask, pick the current CPU.
        if (should_honor_rt_sync(this_cpu_rq, p, sync) && cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
            cpu = this_cpu;
            goto out_unlock;
        }
    
        /*
         * The search below runs only when @p should not be placed on its previous
         * CPU (test) or that CPU's capacity does not fit @p.
         *
         * Original author's observation: this condition is likely false, so @p
         * usually stays on its previous CPU. RT tasks therefore have an "affinity"
         * for a previous CPU that still fits, and the search is often skipped.
         */
        if (test || !rt_task_fits_capacity(p, cpu)) {
            // Main CPU selection logic.
            int target = find_lowest_rq(p);
    
            /*
             * Bail out if we were forcing a migration to find a better
             * fitting CPU but our search failed.
             */
            /*
             * If @p could have stayed on its previous CPU (!test) and the CPU that
             * was found does not fit either, keep the previous CPU even though its
             * capacity is also insufficient, i.e. prefer the previous CPU.
             */
            if (!test && target != -1 && !rt_task_fits_capacity(p, target))
                goto out_unlock;
    
            /*
             * If cpu is non-preemptible, prefer remote cpu
             * even if it's running a higher-prio task.
             * Otherwise: Don't bother moving it if the destination CPU is
             * not running a lower priority task.
             */
            /*
             * Switch to the found CPU only if one was found and either the previous
             * CPU's current task may not be preempted, or @p has higher priority than
             * the highest-priority RT task on the target runqueue. Otherwise cpu is
             * left unchanged and the previous CPU is returned.
             */
            if (target != -1 && (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr))
                cpu = target;
        }
    
    out_unlock:
        rcu_read_unlock();
    
    out:
        return cpu;
    }

    2. 函数总结:
    (1) 若是没有选到目标cpu,就返回任务p上次运行的cpu。
    (2) trace_android_rvh_select_task_rq_rt 这个hook中传递了上层的所有参数,vendor可以在这里定制选核逻辑。
    (3) 只有唤醒场景和fork新任务场景的才走选核流程,exec执行场景的选核直接返回之前运行的cpu作为目标cpu。
    (4) 若是被RT任务sync唤醒且当前cpu上正在运行RT任务的优先级比p低,且当前cpu在任务p的亲和性中,就选当前cpu作为目标cpu。
    (5) 若p不能运行在之前运行的cpu上,或p之前运行的cpu算力不满足p的需求了,才会继续选核,否则选p之前运行的cpu。说明RT任务对之前运行的cpu有一定的“亲和性”。
    (6) 主要的选核逻辑在 find_lowest_rq() 中。

    三、find_lowest_rq 函数

    1. select_task_rq_rt 传参为待选核的任务

    /*
     * find_lowest_rq - build a mask of candidate CPUs for @task and pick one.
     *
     * @task: the RT task being placed
     *
     * Returns the chosen CPU, or -1 when no candidate exists (including when
     * the task is allowed on only one CPU or the scratch mask is unavailable).
     */
    static int find_lowest_rq(struct task_struct *task)
    {
        struct sched_domain *sd;
        // Scratch cpumask obtained through this_cpu_cpumask_var_ptr(local_cpu_mask);
        // it is filled in by the cpupri_find*() call below.
        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
        int this_cpu = smp_processor_id(); // CPU executing this code
        int cpu      = -1;
        int ret;
    
        /* Make sure the mask is initialized first */
        if (unlikely(!lowest_mask))
            return -1;
    
        // A task allowed on a single CPU has no alternative target: return right away.
        if (task->nr_cpus_allowed == 1)
            return -1; /* No other targets possible */
    
        /*
         * If we're on asym system ensure we consider the different capacities
         * of the CPUs when searching for the lowest_mask.
         */
        if (static_branch_unlikely(&sched_asym_cpucapacity)) {
            // cpupri_find_fitness() first filters by rt_task_fits_capacity and, if that
            // finds nothing, searches again without the filter. So lowest_mask holds
            // either only fitting CPUs, or CPUs chosen regardless of fitness (in which
            // case, per the original note, the previous CPU is most likely chosen later).
            ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity);
        } else {
            ret = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
        }
    
        // At this point lowest_mask may or may not contain only CPUs whose capacity fits.
        // Vendor hook: may adjust the candidates or directly set the target in cpu.
        trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu); //HOOK
        if (cpu >= 0)
            return cpu;
    
        if (!ret)
            return -1; /* No targets found */
    
        cpu = task_cpu(task); // CPU the task ran on last
    
        /*
         * At this point we have built a mask of CPUs representing the
         * lowest priority tasks in the system.  Now we want to elect
         * the best one based on our affinity and topology.
         *
         * We prioritize the last CPU that the task executed on since
         * it is most likely cache-hot in that location.
         */
        // If the previous CPU is among the candidates, pick it to benefit from a hot cache.
        if (cpumask_test_cpu(cpu, lowest_mask))
            return cpu;
    
        /*
         * Otherwise, we consult the sched_domains span maps to figure
         * out which CPU is logically closest to our hot cache data.
         */
        // If the current CPU is not a candidate, set this_cpu to -1 to disable that shortcut.
        if (!cpumask_test_cpu(this_cpu, lowest_mask))
            this_cpu = -1; /* Skip this_cpu opt if not among lowest */
    
        rcu_read_lock();
        for_each_domain(cpu, sd) { // original note: walks MC --> DIE on the author's platform
            if (sd->flags & SD_WAKE_AFFINE) { // original note: both MC and DIE carry this flag
                int best_cpu;
    
                /* "this_cpu" is cheaper to preempt than a remote processor.*/
                /*
                 * If the current CPU is a candidate and lies within this domain's span
                 * together with the previous CPU, return the current CPU. Original
                 * note: the MC span is the local cluster, the DIE span is all CPUs.
                 */
                if (this_cpu != -1 && cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
                    rcu_read_unlock();
                    return this_cpu;
                }
    
                // Otherwise take the first CPU in the intersection of the candidates and sd's span.
                best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd));
                if (best_cpu < nr_cpu_ids) {
                    rcu_read_unlock();
                    return best_cpu;
                }
            }
        }
        // Original note: on phones the loop above is expected to have returned already,
        // so the code below is not expected to run there.
        
        rcu_read_unlock();
    
        /*
         * And finally, if there were no matches within the domains
         * just give the caller *something* to work with from the compatible
         * locations.
         */
        // Nothing chosen yet: if the current CPU is a candidate, use it.
        if (this_cpu != -1)
            return this_cpu;
    
        // Otherwise pick any CPU from the candidate mask.
        cpu = cpumask_any(lowest_mask);
        if (cpu < nr_cpu_ids)
            return cpu;
    
        return -1;
    }

    2. 函数总结:
    (1) 先调用 cpupri_find_fitness() 把候选cpu放到 lowest_mask 中。由于此函数在选不到候选cpu的时候会舍去 fitness_fn 回调重新选择一次,因此 lowest_mask 中的候选cpu可能都是算力满足待选核任务p需求的,或者都是不满足p需求的。
    (2) trace_android_rvh_find_lowest_rq 允许vendor厂商插入hook来更改候选cpu或指定目标cpu
    (3) 确定候选cpu的lowest_mask后,选择优先级为:
    a. 若p之前运行的cpu在候选cpu中,那么就选之前运行的cpu,以便利用cache-hot特性。
    b. 若当前cpu在候选cpu中,且当前cpu和p之前运行的cpu位于同一cluster,则选当前cpu。
    c. 选候选cpu和sd->span交集的第一个cpu做为目标cpu,即选任务p之前运行的cluster的一个cpu。
    d. 若当前cpu在候选cpu中,则选当前cpu。
    e. 从候选cpu中任选一个cpu(cpumask_any)。

    四、cpupri_find_fitness 函数

    1. find_lowest_rq调用传参(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity)

    cp 取自 task_rq(task)->rd->cpupri,即任务所在 root domain 的 cpupri(每个 root domain 一份,并非严格意义上的全局唯一);p 是待选核任务;lowest_mask 是刚取得、尚未填充的候选cpu掩码;fitness_fn 是回调 rt_task_fits_capacity。

    /*
     * cpupri_find_fitness - find candidate CPUs for @p, optionally filtered by @fitness_fn.
     *
     * @cp:          the cpupri context to search
     * @p:           the task being placed
     * @lowest_mask: output mask of candidate CPUs (may be NULL)
     * @fitness_fn:  optional filter; CPUs for which it returns false are removed
     *
     * Returns 1 when candidates were found, 0 otherwise. If @fitness_fn is given
     * and no level yields a fitting CPU, the search is repeated without it.
     */
    int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
        struct cpumask *lowest_mask, bool (*fitness_fn)(struct task_struct *p, int cpu)) //cpupri.c
    {
        int task_pri = convert_prio(p->prio);
        int idx, cpu;
        // Original note: MAX_RT_PRIO is 100, and only the highest-priority RT task
        // (prio == 0) fails this test — TODO confirm against convert_prio().
        bool drop_nopreempts = task_pri <= MAX_RT_PRIO; //100
    
        // Original note: CPUPRI_NR_PRIORITIES is 102 and convert_prio() yields at most 101.
        BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); //102
    
    #ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
    retry:
    #endif
        // Scan levels below the task's own; per the original note a higher RT
        // priority gives a larger task_pri, so more levels are scanned.
        for (idx = 0; idx < task_pri; idx++) {
            // __cpupri_find() returns 1 when this level yields at least one CPU.
            if (!__cpupri_find(cp, p, lowest_mask, idx, drop_nopreempts))
                continue;
    
            // With no output mask or no filter there is nothing to refine: report success.
            if (!lowest_mask || !fitness_fn)
                return 1;
    
            /* Ensure the capacity of the CPUs fit the task */
            // Remove from lowest_mask every CPU that the filter rejects.
            for_each_cpu(cpu, lowest_mask) {
                if (!fitness_fn(p, cpu))
                    cpumask_clear_cpu(cpu, lowest_mask);
            }
    
            /*
             * If no CPU at the current priority can fit the task
             * continue looking
             */
            if (cpumask_empty(lowest_mask))
                continue;
    
            // Original note: in the common case a successful search returns here.
            return 1;
        }
    
        /*
         * If we can't find any non-preemptible cpu's, retry so we can
         * find the lowest priority target and avoid priority inversion.
         */
    #ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
        // Original note: this retry is unlikely to be taken in practice.
        if (drop_nopreempts) {
            drop_nopreempts = false;
            goto retry;
        }
    #endif
    
        /*
         * If we failed to find a fitting lowest_mask, kick off a new search
         * but without taking into account any fitness criteria this time.
         *
         * This rule favours honouring priority over fitting the task in the
         * correct CPU (Capacity Awareness being the only user now).
         * The idea is that if a higher priority task can run, then it should
         * run even if this ends up being on unfitting CPU.
         *
         * The cost of this trade-off is not entirely clear and will probably
         * be good for some workloads and bad for others.
         *
         * The main idea here is that if some CPUs were overcommitted, we try
         * to spread which is what the scheduler traditionally did. Sys admins
         * must do proper RT planning to avoid overloading the system if they
         * really care.
         */
        /*
         * Still nothing found: search once more through cpupri_find(), which calls
         * this function again with a NULL filter. The task's capacity requirement
         * is then ignored and the first level where __cpupri_find() succeeds is
         * returned.
         * TODO (original author): in this case prefer mid/big cores where possible.
         */
        if (fitness_fn)
            return cpupri_find(cp, p, lowest_mask);
    
        return 0;
    }
    EXPORT_SYMBOL_GPL(cpupri_find_fitness);
    
    
    // cpupri_find - search for candidate CPUs without any fitness filter.
    // Called from cpupri_find_fitness() with (cp, p, lowest_mask); forwards to
    // cpupri_find_fitness() with a NULL fitness_fn and returns its result.
    int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask)
    {
        return cpupri_find_fitness(cp, p, lowest_mask, NULL);
    }
    
    
    /*
     * __cpupri_find - check one priority level (@idx) for CPUs usable by @p.
     *
     * Called from cpupri_find_fitness() with (cp, p, lowest_mask, idx, drop_nopreempts).
     * The caller computes drop_nopreempts as task_pri <= MAX_RT_PRIO; per the
     * caller's note it is therefore true for every task except the highest RT
     * priority (p->prio == 0), and it is cleared before the caller's retry pass.
     *
     * Returns 1 if this level yields at least one usable CPU (and fills
     * @lowest_mask when it is non-NULL), 0 otherwise.
     */
    static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
        struct cpumask *lowest_mask, int idx, bool drop_nopreempts)
    {
        struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
        int skip = 0;
    
        // A zero count marks this level to be skipped, after the read barrier below.
        if (!atomic_read(&(vec)->count))
            skip = 1;
    
        smp_rmb();
    
        /* Need to do the rmb for every iteration */
        if (skip)
            return 0;
    
        // No overlap between the task's allowed CPUs and this level's CPUs.
        if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
            return 0;
    
        if (lowest_mask) {
            // Two ANDs: allowed CPUs & this level's CPUs, then restricted to active CPUs.
            cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
            cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
    
    #ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
            // drop_nopreempt_cpus() is not shown here; presumably it removes CPUs
            // that currently cannot be preempted — TODO confirm.
            if (drop_nopreempts)
                drop_nopreempt_cpus(lowest_mask);
    #endif
    
            /*
             * We have to ensure that we have at least one bit
             * still set in the array, since the map could have
             * been concurrently emptied between the first and
             * second reads of vec->mask.  If we hit this
             * condition, simply act as though we never hit this
             * priority level and continue on.
             */
            if (cpumask_empty(lowest_mask))
                return 0;
        }
    
        return 1;
    }

    2. 函数总结:
    (1) 会先带着过滤回调函数fitness_fn选一次候选cpu,若是没有选到,就取消过滤函数回调重新选择一次。

  • 相关阅读:
    关于Thread ThreadPool Parallel 的一些小测试demo
    VS附加到进程调试
    netcore 实现一个简单的Grpc 服务端和客户端
    CodeSmith 找不到请求的 .Net Framework Data Provider
    ocelot集成consul服务发现
    使用ocelot作为api网关
    关于add migration 报错的问题解决方案
    关于多线程efcore dbcontext 的解决方案。
    docker mysql 容器报too many connections 引发的liunx磁盘扩容操作
    关于liunx 机器脱机环境(netcore)Nuget包迁移的问题
  • 原文地址:https://www.cnblogs.com/hellokitty2/p/15881574.html
Copyright © 2020-2023  润新知