
Scheduler 20 — Load Balancing — An Analysis of the load_balance() Function

I. Overview

1. The load-balancing scenarios include tick balance, nohz idle balance and new idle balance; all of them ultimately converge on the load_balance() function, which performs the actual balancing work. The call paths are sketched below.
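
For orientation, the entry paths converge roughly as follows (a sketch based on a mainline kernel around v5.10; exact function names vary across versions):

scheduler_tick()
    trigger_load_balance()        // raises SCHED_SOFTIRQ (tick balance)
        run_rebalance_domains()
            nohz_idle_balance()   // nohz idle balance on behalf of idle cpus
            rebalance_domains()
                load_balance()
newidle_balance()                 // new idle balance, calls load_balance() directly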


II. Data structures related to load_balance

1. struct lb_env

During load balancing, the lb_env structure represents the context of the current balancing operation:

//fair.c
struct lb_env {
    //the sched domain to be balanced
    struct sched_domain    *sd;

    //the busiest cpu and rq in this sd; the goal of balancing is to pull tasks from it
    struct rq        *src_rq;
    int            src_cpu;

    /*
     * The target CPU of this balance. Balancing tries to pull tasks from the rq of
     * the busiest cpu in the sd to the dst cpu's rq. In the first round the dst cpu
     * is usually the cpu that initiated the balance, but later rounds may, if
     * needed, re-select another cpu from the local group.
     */
    int            dst_cpu;
    struct rq        *dst_rq;

    //cpu mask of the sched group containing the dst cpu: at MC level just the dst cpu itself, at DIE level its cluster
    struct cpumask        *dst_grpmask;

    /*
     * Normally the dst cpu is the cpu that initiated the balance, but when affinity
     * keeps some tasks on src from migrating to the dst cpu so the balance cannot
     * complete, a new cpu is picked from the dst cpu's local group to run a second
     * round of balancing.
     */
    int            new_dst_cpu;

    //idle state of the dst cpu at balance time; it influences how the balance proceeds
    enum cpu_idle_type    idle;

    /*
     * Interpreting this member requires the migration_type member; see
     * calculate_imbalance():
     * migrate_load: the amount of load to migrate
     * migrate_util: the amount of utility to migrate
     * migrate_task: MC: the number of tasks to migrate; DIE: the number of idle cpus the busiest group should gain
     * migrate_misfit: set to 1, migrate one task at a time
     * group_imbalanced: set to 1, migrate one task at a time
     */
    long            imbalance;

    /* The set of CPUs under consideration for load-balancing */
    /*
     * Balancing may take several rounds, and different rounds may involve different
     * cpus; this member records which cpus take part this time.
     */
    struct cpumask        *cpus;

    /*
     * Balance flags, a bit mask. LBF_NOHZ_STATS and LBF_NOHZ_AGAIN are mainly used
     * to update nohz state during balancing. LBF_ALL_PINNED is set when affinity
     * keeps every task on the selected busiest cpu from migrating; the next-busiest
     * cpu is then tried in another round. LBF_NEED_BREAK is mainly used to shorten
     * the interrupt-disabled window during balancing.
     */
    unsigned int        flags;

    /*
     * Once migration is decided, load_balance() iterates over the cfs task list of
     * the src rq to pick the tasks to migrate. loop tracks the iteration count and
     * must not exceed the loop_max member.
     */
    unsigned int        loop;

    /*
     * If many tasks are to be migrated, take a break every sched_nr_migrate_break
     * tasks so the interrupt-disabled critical section stays short.
     */
    unsigned int        loop_break;
    unsigned int        loop_max;

    enum fbq_type        fbq_type;

    /*
     * What kind of migration this sd balance needs: a given amount of load, a given
     * amount of utility, some tasks, or a misfit task. See the explanation of the
     * imbalance member.
     */
    enum migration_type    migration_type;

    //tasks picked for migration are hung on this list
    struct list_head    tasks;

    struct rq_flags        *src_rq_rf;
};
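
For reference, the two knobs that bound one detach_tasks() scan are plain constants in mainline around v5.10 (vendor kernels may differ):

//kernel/sched/core.c
const_debug unsigned int sysctl_sched_nr_migrate = 32;
//kernel/sched/fair.c
#define sched_nr_migrate_break 32

So one pass normally scans at most 32 tasks before either finishing or taking a break.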

2. struct sd_lb_stats

During load balancing, the sd_lb_stats structure holds the load statistics of a sched domain:

struct sd_lb_stats {
    //the busiest sg in this sd; never the local group
    struct sched_group *busiest;
    //marks which group in the sd is the local group, i.e. the group containing the dst cpu
    struct sched_group *local;
    //sum of the load of all sgs in this sd; unless stated otherwise, "load" here means cfs task load
    unsigned long total_load;
    //sum of the cpu capacity of all sgs in this sd (capacity usable by cfs tasks)
    unsigned long total_capacity;
    //average load across all sgs in this sd
    unsigned long avg_load;
    //marks that tasks should first go to a cpu in the same cluster
    unsigned int prefer_sibling;
    //load statistics of the busiest sg in this sd
    struct sg_lb_stats busiest_stat;
    //load statistics of the local sg containing the dst cpu
    struct sg_lb_stats local_stat;
};

3. struct sg_lb_stats

During load balancing, the sg_lb_stats structure holds the load statistics of a sched group:

struct sg_lb_stats {
    /*
     * Average load of all cpus in this sg. Only computed when the sg is in the
     * group_overloaded state, to help size the amount of load to migrate.
     */
    unsigned long avg_load;
    //sum of the load of all cpus in this sg
    unsigned long group_load;
    //sum of the capacity of all cpus in this sg that is usable by cfs tasks
    unsigned long group_capacity;
    //sum of the utilization of all cpus in this sg
    unsigned long group_util;
    //sum of the runnable load of all cpus in this sg
    unsigned long group_runnable;
    //number of all tasks in this sg, including rt and dl tasks
    unsigned int sum_nr_running;
    //number of cfs tasks in this sg
    unsigned int sum_h_nr_running;
    //number of idle cpus in this sg
    unsigned int idle_cpus;
    //number of cpus in this sg
    unsigned int group_weight;
    //the state this sg is in during load balancing
    enum group_type group_type;
    /*
     * Marks that tasks should be migrated to a preferred cpu. update_sg_lb_stats()
     * only sets this when the sd has SD_ASYM_PACKING, so here it never gets set.
     */
    unsigned int group_asym_packing;
    //at least one cpu in this sg has a misfit task; records the largest misfit task load among the sg's cpus
    unsigned long group_misfit_task_load;
};
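
For reference, the group_type values used above are ordered from least to most busy; comparisons such as local->group_type > busiest->group_type in find_busiest_group() rely on this ordering. A sketch from mainline around v5.10 (comments abridged):

enum group_type {
    group_has_spare = 0,  //the group has spare capacity for more tasks
    group_fully_busy,     //fully used, but tasks do not compete for cpu cycles
    group_misfit_task,    //SD_ASYM_CPUCAPACITY only: a task does not fit the cpu's capacity
    group_asym_packing,   //SD_ASYM_PACKING only
    group_imbalanced,     //affinity constraints previously prevented balancing
    group_overloaded,     //tasks compete for cpu cycles
};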

4. struct sched_group_capacity

Describes the capacity information of a sched group:

struct sched_group_capacity {
    //reference count; several sds may share one sg and its sgc
    atomic_t        ref;
    //total capacity in this sg usable by cfs tasks (roughly the sum of its cpus' capacities)
    unsigned long        capacity;
    //smallest single-cpu capacity usable by cfs tasks in this sg
    unsigned long        min_capacity;
    //largest single-cpu capacity usable by cfs tasks in this sg
    unsigned long        max_capacity;
    //time of the next capacity update
    unsigned long        next_update;
    //whether this sg has an imbalance caused by cpu affinity
    int            imbalance;
#ifdef CONFIG_SCHED_DEBUG
    //at MC level the id of each cpu; at DIE level the id of the first cpu of each cluster
    int            id;
#endif
    //the cpus contained in this sg
    unsigned long        cpumask[];
};

III. The load_balance() function

First, an overall look at load_balance(); each part of its logic is examined afterwards.

/*
 * Parameters:
 * this_cpu/this_rq: the cpu that initiates this balance and its rq
 * sd: the scope of this balance, i.e. the goal is to bring the sgs of this sd into balance
 * idle: the idle state of this_cpu when the balance was initiated; it distinguishes new idle balance from tick balance.
 * continue_balancing: balancing starts at the initiating cpu's base domain and walks up to the top sd;
 *     this parameter controls whether balancing continues at the higher sd levels.
 *
 * Return value: the total number of tasks migrated by this balance
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sched_domain *sd_parent = sd->parent; //the parent sd, i.e. the DIE level
    struct sched_group *group;
    struct rq *busiest;
    struct rq_flags rf;
    //a per-cpu global variable; this is its only point of use, filled in just below
    struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

    struct lb_env env = {
        .sd        = sd,
        .dst_cpu    = this_cpu, //the dst cpu is usually the cpu that initiated the balance
        .dst_rq        = this_rq,
        .dst_grpmask    = sched_group_span(sd->groups), //MC: just this cpu; DIE: the cpus of its cluster
        .idle        = idle,
        .loop_break    = sched_nr_migrate_break,
        .cpus        = cpus,
        .fbq_type    = all,
        .tasks        = LIST_HEAD_INIT(env.tasks),
    };

    /*
     * Balance only among active cpus, i.e. cpus that are neither isolated nor offline.
     *
     * Since this is the first round of balancing, all cpus of the sd take part. If
     * problems show up later, e.g. affinity prevents any task migration, the selected
     * busiest cpu is removed and we jump to the redo label for another round.
     *
     * MC: the cpus of one cluster; DIE: all cpus. In other words, if the sd argument
     * is at MC level, balancing stays inside the dst cpu's cluster; if it is at DIE
     * level, balancing runs across the cores of all clusters.
     */
    cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

    //bump the balance count for this idle type, shown in cat /proc/schedstat
    schedstat_inc(sd->lb_count[idle]);

redo:
    //restrict which cpus may initiate balancing
    if (!should_we_balance(&env)) {
        /*if balancing is judged inappropriate, balancing of the higher sd levels is not needed either, so set it to 0*/
        *continue_balancing = 0;
        goto out_balanced;
    }

    /*find the busiest sg in this sd; if none is found, quit balancing at this level*/
    group = find_busiest_group(&env);
    if (!group) {
        schedstat_inc(sd->lb_nobusyg[idle]);
        goto out_balanced;
    }

    /*find the busiest cpu in the busiest sg; if none is found, quit balancing at this level*/
    busiest = find_busiest_queue(&env, group);
    if (!busiest) {
        schedstat_inc(sd->lb_nobusyq[idle]);
        goto out_balanced;
    }

    /*
     * At this point the busiest src cpu is known, the dst cpu is the one that
     * initiated the balance, and the first round of balancing can start.
     * The busiest cpu found must not be the cpu that initiated the balance.
     */
    BUG_ON(busiest == env.dst_rq);

    //bump the statistics counter
    schedstat_add(sd->lb_imbalance[idle], env.imbalance);

    //record the busiest cpu in the lb_env balancing context
    env.src_cpu = busiest->cpu;
    env.src_rq = busiest;

    /*to migrate tasks from the busiest cpu to this cpu, there must be tasks to pull*/
    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        /*
         * Assume the all-pinned flag before pulling tasks; it is cleared in
         * can_migrate_task() once at least one task turns out to be migratable to
         * the dst cpu.
         */
        env.flags |= LBF_ALL_PINNED;
        /* 
         * loop_max is the number of runnable tasks on the src rq that may be scanned.
         * It takes busiest->nr_running but is clamped to sysctl_sched_nr_migrate,
         * since migrating too many tasks at once keeps interrupts disabled too long.
         */
        env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

        /*
         * Unlike redo, this label skips the should-we-balance check and the
         * busiest-cpu search; it just keeps scanning the busiest rq's task list
         * for tasks suitable to migrate.
         */
more_balance:
        rq_lock_irqsave(busiest, &rf);
        env.src_rq_rf = &rf;
        //update busiest->clock
        update_rq_clock(busiest);

        /*
         * cur_ld_moved - load moved in current iteration
         * ld_moved     - cumulative load moved across iterations
         */
        /*
         * At this point it is decided to move some load/util/tasks from the busiest
         * cpu's rq to the dst rq; whether it is load or util, in the end it is
         * expressed as tasks. This function picks suitable tasks off the busiest
         * cpu's rq and hangs them on the lb_env->tasks list. Because of the
         * interrupt-off window, it does not move every task to the dest cpu in one go.
         */
        cur_ld_moved = detach_tasks(&env);

        /*
         * We've detached some tasks from busiest_rq. Every
         * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
         * unlock busiest->lock, and we are able to be sure
         * that nobody can manipulate the tasks in parallel.
         * See task_rq_lock() family for the details.
         */

        rq_unlock(busiest, &rf);

        /*
         * Attach the tasks detached by detach_tasks() to the dst rq. Since
         * detach_tasks/attach_tasks may run for several rounds, ld_moved holds the
         * total number of migrated tasks and cur_ld_moved the number moved this round.
         */
        if (cur_ld_moved) {
            attach_tasks(&env);
            ld_moved += cur_ld_moved;
        }

        local_irq_restore(rf.flags);

        /*
         * During task migration, interrupts on the src cpu (the busiest cpu found)
         * are disabled; to shorten that window, migrating many tasks takes a break
         * now and then: that is the interrupt-off section above.
         * detach_tasks() sets this flag and leaves its loop when the scan count of
         * the src rq exceeds env->loop_break.
         */
        if (env.flags & LBF_NEED_BREAK) {
            env.flags &= ~LBF_NEED_BREAK;
            goto more_balance;
        }

        /*
         * Revisit (affine) tasks on src_cpu that couldn't be moved to
         * us and move them to an alternate dst_cpu in our sched_group
         * where they can run. The upper limit on how many times we
         * iterate on same src_cpu is dependent on number of CPUs in our
         * sched_group.
         *
         * This changes load balance semantics a bit on who can move
         * load to a given_cpu. In addition to the given_cpu itself
         * (or a ilb_cpu acting on its behalf where given_cpu is
         * nohz-idle), we now have balance_cpu in a position to move
         * load to given_cpu. In rare situations, this may cause
         * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
         * _independently_ and at _same_ time to move some load to
         * given_cpu) causing exceess load to be moved to given_cpu.
         * This however should not happen so much in practice and
         * moreover subsequent load balance cycles should correct the
         * excess load moved.
         */
        /*
         * At this point the src rq's task list has been scanned loop_max times;
         * decide whether another round of balancing is needed.
         *
         * LBF_DST_PINNED is set in can_migrate_task() when the dst cpu is not in a
         * task's cpu affinity mask.
         * detach_tasks() above loops until env.imbalance <= 0; otherwise some tasks
         * could not be migrated to the dst cpu.
         *
         * If the sd is still unbalanced and, during the earlier rounds, affinity kept
         * tasks from migrating to the dst cpu, keep searching the src rq and migrate
         * to an alternate dst cpu, i.e. start balancing again. The balancing context
         * switches its dst cpu to the alternate one and loop is cleared so scanning
         * starts over.
         */
        if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

            /* Prevent to re-select dst_cpu via env's CPUs */
            /*
             * Remove the dst cpu from env.cpus so it cannot be picked as dst cpu
             * again; the removed dst cpu no longer takes part in the following
             * rounds that balance affinity-restricted tasks.
             */
            __cpumask_clear_cpu(env.dst_cpu, env.cpus);
            /*
             * env.new_dst_cpu is assigned in detach_task-->can_migrate_task(), with
             * LBF_DST_PINNED marking that a usable new_dst_cpu exists. At MC level
             * the group contains only the dst cpu so it is never assigned; only at
             * DIE level can it be assigned.
             */
            env.dst_rq     = cpu_rq(env.new_dst_cpu);
            env.dst_cpu     = env.new_dst_cpu;
            env.flags    &= ~LBF_DST_PINNED;
            env.loop     = 0;
            env.loop_break     = sched_nr_migrate_break;

            /*
             * Go back to "more_balance" rather than "redo" since we
             * need to continue with same src_cpu.
             */
            goto more_balance;
        }

        /*
         * We failed to reach balance because of affinity.
         */
        //if a parent sd level exists, this round was a MC-level balance
        if (sd_parent) {
            //points at the DIE level
            int *group_imbalance = &sd_parent->groups->sgc->imbalance;
            /*
             * If this level's sd (the MC level) cannot reach balance because of
             * affinity, mark it in the parent sd->sg so that, when the parent sd
             * balances, this sg is judged imbalanced and has a better chance of
             * being picked as the busiest group, resolving the sd's balance problem.
             */
            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                *group_imbalance = 1;
        }

        /* All tasks on this runqueue were pinned by CPU affinity */
        /*
         * If every task on the selected busiest cpu is pinned there by affinity,
         * remove that cpu so the next round no longer considers it. A new src cpu
         * must then be searched for, hence the jump to redo.
        */
        if (unlikely(env.flags & LBF_ALL_PINNED)) {
            __cpumask_clear_cpu(cpu_of(busiest), cpus);
            /*
             * Attempting to continue load balancing at the current
             * sched_domain level only makes sense if there are
             * active CPUs remaining as possible busiest CPUs to
             * pull load from which are not contained within the
             * destination group that is receiving any migrated
             * load.
             */
            //at MC level cpumask_subset() always returns 0, so this jumps; at DIE level balancing continues only when some participating cpus lie outside the dst cpu's cluster.
            if (!cpumask_subset(cpus, env.dst_grpmask)) {
                env.loop = 0;
                env.loop_break = sched_nr_migrate_break;
                goto redo;
            }
            goto out_all_pinned;
        }
    }

    /*
     * At this point the cfs task list of the src rq has been traversed (possibly
     * several times), so the scan of runnable tasks is essentially complete. If that
     * was still not enough, the running task is considered next:
     */
    if (!ld_moved) {
        schedstat_inc(sd->lb_failed[idle]);
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        /*
         * All the work above migrated nothing, so bump the balance-failure count;
         * the count leads to more aggressive balancing later, e.g. migrating
         * cache-hot tasks or starting active balance.
         * New idle balance is filtered out here and only periodic balance counts:
         * new idle balance runs so often that counting its failures would inflate
         * nr_balance_failed and trigger aggressive balancing far too easily.
         */
        if (idle != CPU_NEWLY_IDLE)
            sd->nr_balance_failed++;

        /*
         * Decide whether to start active balance, i.e. whether the task currently
         * running on the src cpu should be migrated to the dst cpu: the effort above
         * could not migrate any runnable task, so the running task is considered.
         */
        if (need_active_balance(&env)) {
            unsigned long flags;

            raw_spin_lock_irqsave(&busiest->lock, flags);

            /*
             * Don't kick the active_load_balance_cpu_stop,
             * if the curr task on busiest CPU can't be moved to this_cpu:
             */
            //before trying, check whether affinity keeps the task currently running on the src cpu from moving to the dst cpu.
            if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
                raw_spin_unlock_irqrestore(&busiest->lock, flags);
                env.flags |= LBF_ALL_PINNED;
                goto out_one_pinned;
            }

            /*
             * ->active_balance synchronizes accesses to
             * ->active_balance_work.  Once set, it's cleared
             * only after active load balance is finished.
             */
            //set the active_balance marker on the busiest rq
            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);

            if (active_balance) {
                /*
                 * Queue a work item to the busiest cpu's stop-class "migration/X"
                 * thread and wake it; the execution flow is:
                 * per-cpu cpu_stopper.thread --> smpboot_thread_fn --> cpu_stopper_thread --> fn(arg) --> active_load_balance_cpu_stop(busiest rq)
                 */
                stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work);
            }

            /* We've kicked active balancing, force task migration. */
            sd->nr_balance_failed = sd->cache_nice_tries+1; //pushes the count just past cache_nice_tries: later rounds may migrate cache-hot tasks without immediately re-triggering active balance
        }
    } else {
        //at least one task was migrated, so reset the balance-failure count
        sd->nr_balance_failed = 0;
    }

    if (likely(!active_balance) || voluntary_active_balance(&env)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * detach_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2; //balance_interval (ms) throttles how often this sd is balanced; doubling it backs off balancing that keeps failing
    }

    goto out;

//jumped to when balancing is judged inappropriate or no busiest group/rq was found
out_balanced:
    /*
     * We reach balance although we may have faced some affinity
     * constraints. Clear the imbalance flag only if other tasks got
     * a chance to move and fix the imbalance.
     *
     * sd_parent only exists when this balance was at MC level. When jumping here,
     * LBF_ALL_PINNED has not had a chance to be set yet.
     */
    if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
        int *group_imbalance = &sd_parent->groups->sgc->imbalance;
        //in a MC-level balance, as long as it is not all-pinned, the flag is cleared again here
        if (*group_imbalance)
            *group_imbalance = 0;
    }

//jumped to when affinity prevents every task on the busiest cpu from migrating to the dst cpu:
out_all_pinned:
    /*
     * We reach balance because all tasks are pinned at this level so
     * we can't migrate them. Let the imbalance flag set so parent level
     * can try to migrate them.
     */
    schedstat_inc(sd->lb_balanced[idle]);

    sd->nr_balance_failed = 0;

//jumped to when the final active-balance attempt finds that affinity keeps even the task running on the src cpu from moving to the dst cpu
out_one_pinned:
    ld_moved = 0;

    /*
     * newidle_balance() disregards balance intervals, so we could
     * repeatedly reach this code, which would lead to balance_interval
     * skyrocketting in a short amount of time. Skip the balance_interval
     * increase logic to avoid that.
     */
    if (env.idle == CPU_NEWLY_IDLE)
        goto out;

    /* tune up the balancing interval */
    if ((env.flags & LBF_ALL_PINNED && sd->balance_interval < MAX_PINNED_INTERVAL) || sd->balance_interval < sd->max_interval)
        sd->balance_interval *= 2;
out:
    return ld_moved;
}

IV. Deciding whether this cpu should balance at all — should_we_balance()

/*0: should not balance, 1: should*/
static int should_we_balance(struct lb_env *env)
{
    struct sched_group *sg = env->sd->groups;
    int cpu;

    /*
     * Ensure the balancing environment is consistent; can happen
     * when the softirq triggers 'during' hotplug.
     */
    //cpus was initialized to the dst cpu's cluster (MC) or to all clusters (DIE)
    if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
        return 0;

    /*
     * In the newly idle case, we will allow all the CPUs
     * to do the newly idle load balance.
     */
    //a new-idle balance is always judged as needing to run
    if (env->idle == CPU_NEWLY_IDLE)
        return 1;

    /* Try to find first idle CPU */
    /* MC: only the cpu that initiated the balance; DIE: all cpus of the initiating cpu's cluster */
    for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
        if (!idle_cpu(cpu))
            continue;

        /* Are we the first idle CPU? */
        /*
         * For the first idle cpu found: if it is the cpu that initiated the balance,
         * balancing is needed; otherwise the group contains another idle cpu and
         * balancing is judged unnecessary.
         */
        return cpu == env->dst_cpu;
    }

    /* Are we the first CPU of this group ? */
    /* If the initiating cpu's cluster has no idle cpu, check whether the first cpu
     * in sg->sgc->cpumask is the initiating cpu. At MC level sg->sgc->cpumask holds
     * only the initiating cpu itself, so this always says "balance"; at DIE level it
     * says "balance" only when the initiating cpu is the first cpu of its cluster.
     *
     * Rationale: in a non-base domain every group has several cpus; if each of them
     * could balance, balancing would be far too dense and would waste CPU. So only
     * the first idle cpu may initiate balancing, and when there is no idle CPU,
     * only the first CPU of the group may.
     */
    return group_balance_cpu(sg) == env->dst_cpu; //returns the first cpu in sg->sgc->cpumask
}

V. Finding the busiest sg — find_busiest_group()

If an imbalance exists, it returns the busiest sg in this sd; it also computes how much runnable load must be moved to restore balance.

/******* find_busiest_group() helpers end here *********************/

/*
 * Decision matrix according to the local and busiest group type:
 *
 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
 * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
 * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
 * misfit_task      force     N/A        N/A    N/A  force      force
 * asym_packing     force     force      N/A    N/A  force      force
 * imbalanced       force     force      N/A    N/A  force      force
 * overloaded       force     force      N/A    N/A  force      avg_load
 *
 * N/A :      Not Applicable because already filtered while updating
 *            statistics.
 * balanced : The system is balanced for these 2 groups.
 * force :    Calculate the imbalance as load migration is probably needed.
 * avg_load : Only if imbalance is significant enough.
 * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
 *            different in groups.
 */

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 *
 * Also calculates the amount of runnable load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 *
 * Return:    - The busiest group if imbalance exists.
 */
static struct sched_group *find_busiest_group(struct lb_env *env)
{
    struct sg_lb_stats *local, *busiest;
    struct sd_lb_stats sds;

    init_sd_lb_stats(&sds);

    /*
     * Compute the various statistics relevant for load balancing at
     * this level.
     */
    /*
     * Load information changes constantly, so before searching for the busiest group
     * the sd's balance statistics must be refreshed so that the search uses the
     * latest data. This function updates the load and capacity of every sg in this
     * sd and produces the statistics of the local group and of the busiest non-local
     * group, on which the balancing decision below is based.
     */
    update_sd_lb_stats(env, &sds);

    /*
     * Before the system becomes overutilized, EAS is in charge. With EAS active the
     * load may deliberately be unbalanced (for power), so no load balancing is done
     * (goto out_balanced) and the result of task placement is trusted.
     */
    if (sched_energy_enabled()) {
        struct root_domain *rd = env->dst_rq->rd;
        int out_balance = 1;

        trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq, &out_balance);
        /*
         * out_balance: this hook lets a vendor balance even when EAS is enabled and
         * the system is not overutilized.
         */
        if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) && out_balance)
            goto out_balanced;
    }

    /*the busiest sg that was found still has to go head to head with the local sg*/
    local = &sds.local_stat;
    busiest = &sds.busiest_stat;

    /* There is no busy sibling group to pull tasks from */
    /*
     * If no busiest group was found, then among the non-local groups of this sd the
     * busiest one has no tasks that could be pulled into the local group, and no
     * balancing is needed.
     */
    if (!sds.busiest)
        goto out_balanced;

    /* Misfit tasks should be dealt with regardless of the avg load */
    /*the busiest group holds a misfit task: balancing is mandatory, pulling the misfit task into the local group*/
    if (busiest->group_type == group_misfit_task)
        goto force_balance;

    /* ASYM feature bypasses nice load balance check */
    if (busiest->group_type == group_asym_packing)
        goto force_balance;

    /*
     * If the busiest group is imbalanced the below checks don't
     * work because they assume all things are equal, which typically
     * isn't true due to cpus_ptr constraints and the like.
     */
    /* the busiest group has an affinity-induced imbalance, flagged when a MC-level balance found it could not balance */
    if (busiest->group_type == group_imbalanced)
        goto force_balance;

    /*
     * If the local group is busier than the selected busiest group don't try and pull any tasks.
     */
    /*
     * If the local group is even busier than the busiest group, no balancing is
     * needed (current balancing can only pull tasks from other groups into the
     * local group).
     */
    if (local->group_type > busiest->group_type)
        goto out_balanced;

    /*
     * When groups are overloaded, use the avg_load to ensure fairness between tasks.
     */
    /*if the local group is overloaded, the balancing decision comes down to comparing avg_load*/
    if (local->group_type == group_overloaded) {
        /*
         * If the local group is more loaded than the selected
         * busiest group don't try to pull any tasks.
         */
        /*if the local group's average load is even higher than the busiest group's, no balancing is needed*/
        if (local->avg_load >= busiest->avg_load)
            goto out_balanced;

        /* XXX broken for overlapping NUMA groups */
        sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / sds.total_capacity;

        /*
         * Don't pull any tasks if this group is already above the
         * domain average load.
         */
        /*if the local group's average load is above the sd average, no balancing is needed either*/
        if (local->avg_load >= sds.avg_load)
            goto out_balanced;

        /*
         * If the busiest group is more loaded, use imbalance_pct to be
         * conservative.
         */
        /*
         * Even if the busiest group's average load is higher than the local group's,
         * if it is not higher by much, balancing is still skipped, since it has its
         * own overhead. The threshold comes from the sd's imbalance_pct.
         *
         * By default, no balancing when busiest->avg_load <= 1.17 * local->avg_load.
         */
        if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load)
            goto out_balanced;
    }

    /* Try to move all excess tasks to child's sibling domain*/
    if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1)
        goto force_balance;

    /*
     * Non-group_overloaded cases ignore average load and mainly look at idle cpus.
     * This handles the case where the busiest group is not overloaded: the other
     * groups in this sd have enough capacity for the current load, so whether to
     * balance depends mainly on the idle-cpu situation.
     */
    if (busiest->group_type != group_overloaded) {
        /*
         * Since the busiest group's current capacity can handle its rq's tasks,
         * there is no point balancing while this CPU is busy: the focus here is
         * idle cpus, i.e. getting more idle cpus to contribute. So if this CPU is
         * not idle, the sd is judged balanced.
         */
        if (env->idle == CPU_NOT_IDLE)
            /*
             * If the busiest group is not overloaded (and as a
             * result the local one too) but this CPU is already
             * busy, let another idle CPU try to pull task.
             */
            goto out_balanced;

        /* if the busiest group has about as many or more idle CPUs than the local group, balancing is unnecessary too */
        if (busiest->group_weight > 1 && local->idle_cpus <= (busiest->idle_cpus + 1))
            /*
             * If the busiest group is not overloaded
             * and there is no imbalance between this and busiest
             * group wrt idle CPUs, it is balanced. The imbalance
             * becomes significant if the diff is greater than 1
             * otherwise we might end up to just move the imbalance
             * on another group. Of course this applies only if
             * there is more than 1 CPU per group.
             */
            goto out_balanced;

        /*if the busiest group has only one running cfs task, balancing is unnecessary as well*/
        if (busiest->sum_h_nr_running == 1)
            /* busiest doesn't have any tasks waiting to run */
            goto out_balanced;
    }

force_balance:
    /* Looks like there is an imbalance. Compute it */
    /* computes the degree of imbalance in the sd */
    calculate_imbalance(env, &sds);

    return env->imbalance ? sds.busiest : NULL;

out_balanced:
    env->imbalance = 0;
    return NULL;
}

By default, if the rd is judged not overutilized, no load balancing is performed; however, a hook allows the vendor to change this logic.

1. init_sd_lb_stats

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
    /*
     * Skimp on the clearing to avoid duplicate work. We can avoid clearing
     * local_stat because update_sg_lb_stats() does a full clear/assignment.
     * We must however set busiest_stat::group_type and
     * busiest_stat::idle_cpus to the worst busiest group because
     * update_sd_pick_busiest() reads these before assignment.
     */
    *sds = (struct sd_lb_stats){
        .busiest = NULL,
        .local = NULL,
        .total_load = 0UL,
        .total_capacity = 0UL,
        .busiest_stat = {
            .idle_cpus = UINT_MAX,
            .group_type = group_has_spare,
        },
    };
}

2. update_sd_lb_stats()

Updates the sg's capacity. At the base domain (the MC domain), the capacity of the CPU that initiated the balance is updated. Note: "CPU capacity" here means the capacity the CPU can offer to cfs tasks, i.e. after subtracting what is lost to thermal pressure and what RT/DL/IRQ consume; see update_cpu_capacity() for details. At a non-base domain (the DIE domain), the local sg (the group containing the initiating CPU) has its capacity updated, which is straightforward: sum the capacities of all sgs of the child domain (the MC domain). The updated capacity is stored in the sg's sgc member.

The first half of this function walks all groups of this sd and refreshes their load statistics. With the loads updated, it settles on two sgs: the local group and the busiest non-local group, which then go head to head.

/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
 */
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
    struct sched_domain *child = env->sd->child;
    struct sched_group *sg = env->sd->groups;
    struct sg_lb_stats *local = &sds->local_stat;
    struct sg_lb_stats tmp_sgs;
    int sg_status = 0;

#ifdef CONFIG_NO_HZ_COMMON
    if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) //TODO: check when this gets updated
        env->flags |= LBF_NOHZ_STATS;
#endif

    do {
        struct sg_lb_stats *sgs = &tmp_sgs;
        int local_group;

        //MC level: the sg is a single cpu; DIE level: the cpus of one cluster
        local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
        /*
         * Capacity need not be refreshed too often, so updates are limited here:
         * 1. only the local group updates capacity;
         * 2. new-idle balances are rate-limited by an interval derived from
         *    balance_interval: jiffies + msecs_to_jiffies(sd->balance_interval);
         * 3. the other idle types may update capacity freely.
         */
        if (local_group) {
            sds->local = sg;
            sgs = local;

            if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update))
                //updates the capacity members in sd->sg->sgc; at DIE level the MC-level ones get updated along the way
                update_group_capacity(env->sd, env->dst_cpu);
        }

        /*the above updated capacity; this updates the sched group's load statistics*/
        update_sg_lb_stats(env, sg, sgs, &sg_status);

        /*
         * While iterating over the groups of the sched domain we need two of them:
         * the local group and the busiest non-local group. Obviously the local group
         * skips the busiest-group contest below.
         */
        if (local_group)
            goto next_group;

        //for a non-local sg, compare with the busiest group found so far; the busier one is selected as the busiest sg
        if (update_sd_pick_busiest(env, sds, sg, sgs)) {
            sds->busiest = sg;
            sds->busiest_stat = *sgs;
        }

next_group:
        /* Now, start updating sd_lb_stats */
        /* accumulate each sg's load and capacity into sds */
        sds->total_load += sgs->group_load;
        sds->total_capacity += sgs->group_capacity;

        //MC level iterates over the cpus of this cluster; DIE level iterates over the clusters
        sg = sg->next;
    } while (sg != env->sd->groups); //the group containing the initiating cpu is the first sg visited


    /* Tag domain that child domain prefers tasks go to siblings first */
    sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; //TODO: what exactly is this for?

#ifdef CONFIG_NO_HZ_COMMON
    if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
        WRITE_ONCE(nohz.next_blocked, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
    }
#endif

    if (env->sd->flags & SD_NUMA) //this flag is not set here, so this does not run
        env->fbq_type = fbq_classify_group(&sds->busiest_stat);

    /*
     * Update the root domain's overload and overutilized state: for the top-level
     * sd, the overload/overutilized state of its sgs must be reflected in the root
     * domain.
     */
    if (!env->sd->parent) {
        //the DIE-level sd; the rd is globally unique
        struct root_domain *rd = env->dst_rq->rd;

        /* update overload indicator if we are at root domain */
        WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);

        /* Update over-utilization (tipping point, U >= 0) indicator */
        WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
        trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);

    } else if (sg_status & SG_OVERUTILIZED) {
        //a MC-level sd only propagates the overutilized state into the rd
        struct root_domain *rd = env->dst_rq->rd;

        WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
        trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
    }
}

2.1. update_group_capacity()

Updates the capacity of a single sg:

void update_group_capacity(struct sched_domain *sd, int cpu)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group, *sdg = sd->groups;
    unsigned long capacity, min_capacity, max_capacity;
    unsigned long interval;

    interval = msecs_to_jiffies(sd->balance_interval);
    interval = clamp(interval, 1UL, max_load_balance_interval);
    sdg->sgc->next_update = jiffies + interval;

    //when called for a MC-level balance, only the MC level needs updating.
    if (!child) {
        update_cpu_capacity(sd, cpu);
        return;
    }

    /*below is the update for a DIE-level call*/
    capacity = 0;
    min_capacity = ULONG_MAX;
    max_capacity = 0;

    //neither MC nor DIE carries this flag
    if (child->flags & SD_OVERLAP) {
        /*
         * SD_OVERLAP domains cannot assume that child groups
         * span the current group.
         */
        for_each_cpu(cpu, sched_group_span(sdg)) {
            unsigned long cpu_cap = capacity_of(cpu);

            capacity += cpu_cap;
            min_capacity = min(cpu_cap, min_capacity);
            max_capacity = max(cpu_cap, max_capacity);
        }
    } else  {
        /*
         * !SD_OVERLAP domains can assume that child groups span the current group.
         */

        group = child->groups;
        do {
            struct sched_group_capacity *sgc = group->sgc;

            capacity += sgc->capacity;
            min_capacity = min(sgc->min_capacity, min_capacity);
            max_capacity = max(sgc->max_capacity, max_capacity);
            //iterate over the cpus within the cluster
            group = group->next;
        } while (group != child->groups);
    }

    sdg->sgc->capacity = capacity; //sum of the capacity usable by cfs tasks across this cluster's cpus
    sdg->sgc->min_capacity = min_capacity; //smallest single-cpu capacity usable by cfs tasks in this cluster
    sdg->sgc->max_capacity = max_capacity; //largest single-cpu capacity usable by cfs tasks in this cluster
}

2.1.1 Updating sd->sg->sgc

static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
    //the capacity left after subtracting rt/dl/irq usage and thermal pressure
    unsigned long capacity = scale_rt_capacity(cpu);
    struct sched_group *sdg = sd->groups;

    //arch_scale_cpu_capacity: returns per_cpu(cpu_scale, cpu), i.e. cat cpu_capacity
    cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);

    if (!capacity)
        capacity = 1;

    trace_android_rvh_update_cpu_capacity(cpu, &capacity);
    cpu_rq(cpu)->cpu_capacity = capacity;
    trace_sched_cpu_capacity_tp(cpu_rq(cpu));

    //mainline assigns the same value to all three; at MC level they stay equal, at DIE level the outer function overwrites them
    sdg->sgc->capacity = capacity;
    sdg->sgc->min_capacity = capacity;
    sdg->sgc->max_capacity = capacity;
}
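
A rough sketch of what scale_rt_capacity() computes (based on mainline around v5.10; treat the exact form as an assumption for other kernels): the capacity left for cfs is the original capacity minus rt/dl utilization and thermal pressure, then scaled down by the irq share:

/* sketch of scale_rt_capacity(), mainline ~v5.10 */
max  = arch_scale_cpu_capacity(cpu);
free = max - (rq->avg_rt.util_avg + rq->avg_dl.util_avg + thermal_load_avg(rq));
capacity = free * (max - rq->avg_irq.util_avg) / max;   /* scale_irq_capacity() */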

2.2. update_sg_lb_stats()

Updates the sg's load statistics:

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_status: Holds flag indicating the status of the sched_group
 */
static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status)
{
    int i, nr_running, local_group;

    memset(sgs, 0, sizeof(*sgs));

    //MC level: just the cpu itself; DIE level: the cpus of one cluster
    local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));

    for_each_cpu_and(i, sched_group_span(group), env->cpus) {
        struct rq *rq = cpu_rq(i);

        if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
            env->flags |= LBF_NOHZ_AGAIN;

        /*
         * A sched group has three kinds of load: load, runnable load and util;
         * summing them over all cpus gives the sched group's load. Besides the
         * PELT load-avg data, the number of cfs tasks and the total number of
         * tasks in the sched group are counted too.
         */
        sgs->group_load += cpu_load(rq); //rq->cfs_rq.avg.load_avg
        sgs->group_util += cpu_util(i); //max(rq->cfs_rq.avg.util_avg, rq->cfs_rq.avg.util_est.enqueued)
        sgs->group_runnable += cpu_runnable(rq); //rq->cfs_rq.avg.runnable_avg
        sgs->sum_h_nr_running += rq->cfs.h_nr_running;

        /*
         * cfs_rq->nr_running counts the scheduling entities on the cfs_rq itself,
         * excluding child runqueues. cfs_rq->h_nr_running counts all scheduling
         * entities, including those on the group cfs_rqs behind group se's.
         * Here however it is rq->nr_running, which also includes rt and dl tasks.
         */
        nr_running = rq->nr_running;
        sgs->sum_nr_running += nr_running;

        /*as soon as one CPU in the sg has 2 or more tasks, the sched group is marked overloaded.*/
        if (nr_running > 1)
            *sg_status |= SG_OVERLOAD;

        /*
         * As soon as one CPU in the sg is overutilized (mainline: util exceeds 80%
         * of the cpu's current capacity), the sg is marked overutilized.
         */
        if (cpu_overutilized(i))
            *sg_status |= SG_OVERUTILIZED;

#ifdef CONFIG_NUMA_BALANCING
        sgs->nr_numa_running += rq->nr_numa_running;
        sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
        /*
         * No need to call idle_cpu() if nr_running is not 0
         */
        /*count the idle cpus in this sched group*/
        if (!nr_running && idle_cpu(i)) {
            sgs->idle_cpus++;
            /* Idle cpu can't have misfit task */
            continue;
        }

        /*
         * When the sd spans CPUs of different capacity (DIE), a cpu running just one
         * task is still marked overloaded if that task is a misfit task, and the
         * largest misfit task load in the sched group is recorded. Note: idle cpus
         * need no misfit check, and neither does the local group, because within the
         * same cluster capacity is equal and a misfit task cannot be pulled to this
         * cpu anyway.
         */
        if (local_group)
            continue;

        /* Check for a misfit task on the cpu */
        //only the DIE level has this flag; rq->misfit_task_load describes the task currently running on the rq
        if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) {
            sgs->group_misfit_task_load = rq->misfit_task_load;
            *sg_status |= SG_OVERLOAD;
        }
    }

    /* Check if dst CPU is idle and preferred to this group */
    //neither MC nor DIE sets SD_ASYM_PACKING, so this does not run
    if (env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
            sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
        sgs->group_asym_packing = 1;
    }

    //these two lines record the sg's total capacity and cpu count; again, capacity here means capacity usable by cfs tasks.
    sgs->group_capacity = group->sgc->capacity;
    sgs->group_weight = group->group_weight;

    //classify whether and how the sg is overloaded; sd_init() initializes imbalance_pct to 117 for both MC and DIE
    sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);

    /* Computing avg_load makes sense only when group is overloaded */
    /*
     * Compute the sg's average load (only in the group_overloaded state). When
     * overloaded, the average load identifies the busier group; clusters differ in
     * capacity, so their avg_load differs too.
     */
    if (sgs->group_type == group_overloaded)
        sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity;
}

2.3. update_sd_pick_busiest()

The sg currently being visited is compared against the previously selected busiest sg; whichever is busier becomes the busiest sg and is recorded in sds->busiest.

/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected busiest group. %false otherwise.
 */
static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds,
    struct sched_group *sg, struct sg_lb_stats *sgs)
{
    struct sg_lb_stats *busiest = &sds->busiest_stat;

    /* Make sure that there is at least one task to pull */
    if (!sgs->sum_h_nr_running)
        return false;

    /*
     * Don't try to pull misfit tasks we can't help.
     * We can use max_capacity here as reduction in capacity on some
     * CPUs in the group should either be possible to resolve
     * internally or be covered by avg_load imbalance (eventually).
     */
    if (sgs->group_type == group_misfit_task &&
            (!group_smaller_max_cpu_capacity(sg, sds->local) || sds->local_stat.group_type != group_has_spare))
        return false;

    //the sg described by sgs carries the heavier load
    if (sgs->group_type > busiest->group_type)
        return true;

    if (sgs->group_type < busiest->group_type)
        return false;

    /*
     * The candidate and the current busiest group are the same type of
     * group. Let check which one is the busiest according to the type.
     */
    /* from here on the two sgs have the same group_type; each type breaks the tie differently */
    switch (sgs->group_type) {
    case group_overloaded:
        /* Select the overloaded group with highest avg_load. */
        /* for the heaviest state the tie-break is avg_load; since avg_load scales load by capacity, at equal load the group with less current capacity counts as busier */
        if (sgs->avg_load <= busiest->avg_load)
            return false;
        break;

    case group_imbalanced:
        /*
         * Select the 1st imbalanced group as we don't have any way to
         * choose one more than another.
         * (the second-heaviest type: simply keep the first one found)
         */
        return false;

    case group_asym_packing:
        /* 
         * Prefer to move from lowest priority CPU's work 
         * (the third-heaviest type; sched_asym_prefer(a, b) is true when cpu a's id is smaller than cpu b's)
         */
        if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
            return false;
        break;

    case group_misfit_task:
        /*
         * If we have more than one misfit sg go with the biggest misfit.
         * (the fourth-heaviest type; compare the util of the running tasks, the larger one is busier)
         */
        if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
            return false;
        break;

    case group_fully_busy:
        /*
         * Select the fully busy group with highest avg_load. In theory there is no
         * need to pull tasks from such a group, since the tasks have all the compute
         * capacity they need, but overall throughput may still improve by reducing
         * contention on shared hardware resources.
         * XXX for now avg_load is not computed and always 0 so we select the 1st one.
         *
         * again a contest on avg_load: the larger one is busier
         */
        if (sgs->avg_load <= busiest->avg_load)
            return false;
        break;

    case group_has_spare:
        /*
         * Select not overloaded group with lowest number of idle cpus
         * and highest number of running tasks. We could also compare
         * the spare capacity which is more stable but it can end up
         * that the group has less spare capacity but finally more idle
         * CPUs which means less opportunity to pull tasks.
         *
         * The sg with fewer idle cpus is the busier one; with an equal number of
         * idle cpus, the sg with more running tasks is the busier one.
         */
        if (sgs->idle_cpus > busiest->idle_cpus)
            return false;
        else if ((sgs->idle_cpus == busiest->idle_cpus) && (sgs->sum_nr_running <= busiest->sum_nr_running))
            return false;

        break;
    }

    /*
     * Candidate sg has no more than one task per CPU and has higher
     * per-CPU capacity. Migrating tasks to less capable CPUs may harm
     * throughput. Maximize throughput, power/energy consequences are not
     * considered.
     */
    //only takes effect at DIE level
    if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) && 
            (group_smaller_min_cpu_capacity(sds->local, sg)))
        return false;

    return true;
}

#define fits_capacity(cap, max)    ((cap) * 1280 < (max) * 1024)

/*
 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
 * per-CPU capacity than sched_group ref.
 */
static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
    return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
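
A worked example of the 80% margin encoded here: fits_capacity(cap, max) expands to cap * 1280 < max * 1024, i.e. cap < 0.8 * max. With sg->sgc->min_capacity = 500 and ref->sgc->min_capacity = 1024: 500 * 1280 = 640000 < 1024 * 1024 = 1048576, so the sg counts as having the smaller per-CPU capacity. The same fits_capacity() margin is what cpu_overutilized() uses when it compares a cpu's util against capacity_of(cpu) (the "80%" mentioned in update_sg_lb_stats() above).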

VI. Finding the busiest cpu in the busiest group — find_busiest_queue()

The find_busiest_queue() function looks for the busiest cpu of the busiest group. The method depends on the migrate type determined for the busiest group above; each type locates the busiest cpu differently:
migrate_load: the busiest cpu is the one with the largest cpu load / cpu capacity
migrate_util: the busiest cpu is the one with the largest util
migrate_task: the busiest cpu is the one with the most tasks
migrate_misfit: the busiest cpu is the one with the heaviest misfit task load.

Once the busiest CPU is found, both source and target of the migration are known, and the tasks can be moved afterwards via detach tasks and attach tasks.

/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */
static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group)
{
    struct rq *busiest = NULL, *rq;
    unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
    unsigned int busiest_nr = 0;
    int i, done = 0;

    trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus, &busiest, &done);
    if (done)
        return busiest;

    for_each_cpu_and(i, sched_group_span(group), env->cpus) {
        unsigned long capacity, load, util;
        unsigned int nr_running;
        enum fbq_type rt;

        rq = cpu_rq(i);
        rt = fbq_classify_rq(rq); //simply returns regular=0

        /*
         * We classify groups/runqueues into three groups:
         *  - regular: there are !numa tasks
         *  - remote:  there are numa tasks that run on the 'wrong' node
         *  - all:     there is no distinction
         *
         * In order to avoid migrating ideally placed numa tasks,
         * ignore those when there's better options.
         *
         * If we ignore the actual busiest queue to migrate another
         * task, the next balance pass can still reduce the busiest
         * queue by moving tasks around inside the node.
         *
         * If we cannot move enough load due to this classification
         * the next pass will adjust the group classification and
         * allow migration of more tasks.
         *
         * Both cases only affect the total convergence complexity.
         */
        if (rt > env->fbq_type)
            continue;

        capacity = capacity_of(i); //the cpu's current capacity
        nr_running = rq->cfs.h_nr_running;

        /*
         * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
         * eventually lead to active_balancing high->low capacity.
         * Higher per-CPU capacity is considered better than balancing
         * average load.
         */
        //for a DIE-level balance, skip a busiest-group cpu when the dst cpu has less capacity than it and it is running only one task
        if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && nr_running == 1)
            continue;

        switch (env->migration_type) {
        case migrate_load:
            /*
             * When comparing with load imbalance, use cpu_load() which is not scaled with the CPU capacity.
             */
            load = cpu_load(rq); //return cfs_rq->avg.load_avg;

            //skip a cpu that has a single task whose load exceeds the imbalance while its cfs capacity is still sufficient
            if (nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd))
                break;

            /*
             * For the load comparisons with the other CPUs, consider the cpu_load() scaled with the CPU
             * capacity, so that the load can be moved away from the CPU that is potentially running at a
             * lower capacity.
             * Thus we're looking for max(load_i / capacity_i), crosswise multiplication to rid ourselves of
             * the division works out to: load_i * capacity_j > load_j * capacity_i;
             * where j is our previous maximum.
             *
             * i.e. the busiest cpu is the one satisfying load/capacity > busiest_load/busiest_capacity
             */
            if (load * busiest_capacity > busiest_load * capacity) {
                busiest_load = load;
                busiest_capacity = capacity;
                busiest = rq;
            }
            break;

        case migrate_util:
            util = cpu_util(cpu_of(rq));

            /*
             * Don't try to pull utilization from a CPU with one running task. Whatever its utilization,
             * we will fail to detach the task.
             * (with a single task, leave it to active balance)
             */
            if (nr_running <= 1)
                continue;

            //the cpu with the largest util is the busiest
            if (busiest_util < util) {
                busiest_util = util;
                busiest = rq;
            }
            break;

        case migrate_task:
            //the cpu with the most runnable+running tasks is the busiest
            if (busiest_nr < nr_running) {
                busiest_nr = nr_running;
                busiest = rq;
            }
            break;

        case migrate_misfit:
            /*
             * For ASYM_CPUCAPACITY domains with misfit tasks we simply seek the "biggest" misfit task.
             */
            //the cpu whose misfit task has the largest load_avg is the busiest
            if (rq->misfit_task_load > busiest_load) {
                busiest_load = rq->misfit_task_load;
                busiest = rq;
            }

            break;

        }
    }

    return busiest;
}


static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
    /* true when rq->cpu_capacity < rq->cpu_capacity_orig * 100 / sd->imbalance_pct */
    return ((rq->cpu_capacity * sd->imbalance_pct) < (rq->cpu_capacity_orig * 100));
}
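
With the default imbalance_pct of 117, check_cpu_capacity() fires when cpu_capacity < cpu_capacity_orig * 100 / 117, i.e. when more than roughly 14.5% of the cpu's original capacity is consumed by rt/dl/irq or thermal pressure; for cpu_capacity_orig = 1024 that means cpu_capacity below about 875.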

VII. detach_tasks() — detaching tasks from the busiest rq

At this point it has been decided to move a certain amount of load/util/tasks from the busiest cpu's rq to the dst rq; whether it is load or util, it ultimately has to be turned into tasks. This function picks suitable tasks off the busiest cpu's rq and hangs them on the lb_env->tasks list. Because of the interrupt-off window, it does not migrate all tasks to the dest cpu in a single pass.

/*
 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
static int detach_tasks(struct lb_env *env)
{
    struct list_head *tasks = &env->src_rq->cfs_tasks;
    unsigned long util, load;
    struct task_struct *p;
    int detached = 0;

    lockdep_assert_held(&env->src_rq->lock);

    //balancing is already complete
    if (env->imbalance <= 0)
        return 0;

    /*
     * The src rq's cfs_tasks list holds all cfs tasks of that rq. The main logic of
     * detach_tasks() is to walk this list, find the tasks best suited for migration
     * to the target cpu's rq, and hang them on the lb_env->tasks list.
     *
     * To reach balance a task may be examined several times, i.e. the tasks list may
     * be scanned more than once!
     */
    while (!list_empty(tasks)) {
        /*
         * We don't want to steal all, otherwise we may be treated likewise,
         * which could at worst lead to a livelock crash.
         */
        /*
         * During idle balance there is no point pulling the only task off the src
         * cpu, otherwise tasks may just bounce back and forth between two CPUs.
         */
        if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
            break;

        /*
         * Take one task from the tail of src_rq->cfs_tasks (just peek, not remove).
         * The head of this list holds the most recently run tasks, so taking from
         * the tail tends to pick cache-cold tasks. Tasks found unsuitable last time
         * have been moved to the head of this list.
         */
        p = list_last_entry(tasks, struct task_struct, se.group_node);

        /*
         * Leave the loop once every task on the src rq has been visited or the loop
         * limit env->loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running) is
         * reached; if more tasks must be moved, this function is entered again, to
         * keep the interrupt-off critical section on the src cpu small.
         */
        env->loop++;
        /* We've more or less seen every task there is, call it quits */
        /*TODO: if env->loop_max equals env->loop_break, LBF_NEED_BREAK can never be set; is this logic sound?*/
        if (env->loop > env->loop_max)
            break;

        /* take a breather every nr_migrate tasks */
        /*
         * When the src rq has many tasks and a lot must be migrated to reach balance,
         * migration proceeds in chunks (pausing every sched_nr_migrate_break tasks)
         * to split one large interrupt-off critical section into several small ones
         * and protect system latency.
         */
        if (env->loop > env->loop_break) {
            env->loop_break += sched_nr_migrate_break;
            //the outer load_balance() checks this flag and jumps back to the task-detach logic on the src rq
            env->flags |= LBF_NEED_BREAK;
            break;
        }

        /*if the task is not suitable for migration, move it to the head of cfs_tasks*/
        if (!can_migrate_task(p, env))
            goto next; //give up migrating this task

        /* from here on, task p can be migrated to the dst cpu */
        /*
         * The checks below decide whether migrating this task reaches balance
         */
        switch (env->migration_type) {
        case migrate_load:
            /*
             * Depending of the number of CPUs and tasks and the
             * cgroup hierarchy, task_h_load() can return a null
             * value. Make sure that env->imbalance decreases
             * otherwise detach_tasks() will stop only after
             * detaching up to loop_max tasks.
             */
             /*compute the task's load, with a minimum load of 1.*/
            load = max_t(unsigned long, task_h_load(p), 1);

            /*
             * The LB_MIN feature filters out small tasks; it defaults to false. If
             * LB_MIN is true, tasks with load below 16 do not take part in balancing.
             */
            if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
                goto next;

            /*
             * Make sure that we don't migrate too much load.
             * Nevertheless, let relax the constraint if
             * scheduler fails to find a good waiting task to migrate.
             *
             * Do not migrate more load than env->imbalance; as migration failures
             * add up, this constraint is relaxed a little.
             *
             * i.e. skip when (load >> env->sd->nr_balance_failed) > env->imbalance
             */
            if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
                goto next;

            env->imbalance -= load;
            break;

        case migrate_util:
            /*
             * For a migrate_util migration, the task's util against env->imbalance
             * decides whether enough utility has been moved. Note that the task's
             * util_est is used here, without considering uclamp.
             */
            util = task_util_est(p);

            if (util > env->imbalance)
                goto next;

            env->imbalance -= util;
            break;

        case migrate_task:
            /*
             * migrate_task migrations care about neither load nor utility, only the
             * number of migrated tasks; in this type env->imbalance holds the number
             * of tasks to migrate
             */
            env->imbalance--;
            break;

        case migrate_misfit:
            /* This is not a misfit task */
            /*migration is done once a misfit task is found; non-misfit tasks are not migrated */
            if (task_fits_capacity(p, capacity_of(env->src_cpu)))
                goto next;

            env->imbalance = 0;
            break;
        }

        /*
         * Reaching here means task p must be migrated (non-migratable ones jumped to
         * the next label); only now is it taken off tasks (env->src_rq->cfs_tasks)
         * and hung on the env->tasks list.
         */
        detach_task(p, env);
        list_add(&p->se.group_node, &env->tasks); //insert at the head

        detached++;

#ifdef CONFIG_PREEMPTION
        /*
         * NEWIDLE balancing is a source of latency, so preemptible
         * kernels will stop after the first task is detached to minimize
         * the critical section.
         */
        /* new idle balance is a source of scheduling latency, so for new idle
         * balance only one task is migrated each time
         */
        if (env->idle == CPU_NEWLY_IDLE)
            break;
#endif

        /*
         * We only want to steal up to the prescribed amount of load/util/tasks.
         */
        /* once the migration target is reached, stop walking the src rq's cfs task list */
        if (env->imbalance <= 0)
            break;

        continue;
next:
        /*unsuitable tasks are moved to the head of the list, since scanning proceeds from the tail*/
        list_move(&p->se.group_node, tasks);
    }

    /*
     * Right now, this is one of only two places we collect this stat
     * so we can safely collect detach_one_task() stats here rather
     * than inside detach_one_task().
     */
    schedstat_add(env->sd->lb_gained[env->idle], detached);

    return detached;
}

1. can_migrate_task()

Decides whether a task may be migrated to the target CPU:

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
    int tsk_cache_hot;
    int can_migrate = 1;

    lockdep_assert_held(&env->src_rq->lock);

    trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
    if (!can_migrate)
        return 0;

    /*
     * We do not migrate tasks that are:
     * 1) throttled_lb_pair, or
     * 2) cannot be migrated to this CPU due to cpus_ptr, or
     * 3) running (obviously), or
     * 4) are cache-hot on their current CPU.
     */
    /*
     * If task p's task group is throttled on the src cpu or on the dest cpu, the
     * task must not be migrated, otherwise the throttling logic breaks.
     */
    if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
        return 0;

    /* Disregard per-cpu kthreads; they are where they need to be. */
    if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p))
        return 0;

    //the dst cpu is not in task p's cpu affinity mask
    if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
        int cpu;

        //count migrations that failed because of cpu affinity
        schedstat_inc(p->se.statistics.nr_failed_migrations_affine);

        /*
         * The task cannot run on the dest cpu because of affinity, so set
         * LBF_SOME_PINNED to record that at least one task could not migrate due to affinity
         */
        env->flags |= LBF_SOME_PINNED;

        /*
         * Remember if this task can be migrated to any other CPU in
         * our sched_group. We may want to revisit it if we couldn't
         * meet load balance goals by pulling other tasks on src_cpu.
         *
         * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
         * already computed one in current iteration.
         */
        /*
         * The logic below tries to pick an alternate dst cpu; if one is already set,
         * return right away. A newidle balance needs no alternate CPU either, since
         * its whole point is migrating one task to this idle cpu.
         */
        if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
            return 0;

        /* Prevent to re-select dst_cpu via env's CPUs: */
        /*
         * Pick the alternate CPU so a later second round can migrate the task there.
         * At MC level there is only the dst cpu; at DIE level all cpus of the dst
         * cpu's cluster.
         */
        for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
            if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
                env->flags |= LBF_DST_PINNED;
                env->new_dst_cpu = cpu;
                break;
            }
        }

        return 0;
    }
    }

    /* From here on, the dst cpu is known to be in p->cpus_ptr */

    /* Record that we found at least one task that could run on dst_cpu (affinity-wise), and clear the all-pinned flag accordingly */
    env->flags &= ~LBF_ALL_PINNED;

    /* A running task does not take part in migration here; pulling the running task is the job of the later active-migration path */
    if (task_running(env->src_rq, p)) { // returns p->on_cpu; on_cpu is cleared once a task is switched out, so a preempted but runnable task does not count as running here
        schedstat_inc(p->se.statistics.nr_failed_migrations_running);
        return 0;
    }

    /*
     * Aggressive migration if:
     * 1) destination numa is preferred
     * 2) task is cache cold, or
     * 3) too many balance attempts have failed.
     */
    /*
     * Decide whether the task is cache-hot, mainly from when it last ran
     * on the src cpu (sysctl_sched_migration_cost, default 0.5ms, is the
     * threshold): if it last started running long ago, its cache
     * footprint has most likely been evicted and it can be treated as
     * cache-cold. A task that is the src cpu's next or last buddy is
     * considered cache-hot.
     */
    tsk_cache_hot = migrate_degrades_locality(p, env); // returns -1 when CONFIG_NUMA_BALANCING is not configured
    if (tsk_cache_hot == -1)
        tsk_cache_hot = task_hot(p, env);

    /*
     * Normally only cache-cold tasks are migrated, but if too many rounds
     * have failed to balance the load, cache-hot tasks are migrated as
     * well. sd_init() initializes cache_nice_tries to 1 for both MC and
     * DIE. nr_balance_failed is incremented in load_balance() when a
     * non-newidle balance fails to migrate even one task.
     */
    if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
        if (tsk_cache_hot == 1) {
            // the previous attempts migrated nothing, so this time even cache-hot tasks are moved
            schedstat_inc(env->sd->lb_hot_gained[env->idle]);
            schedstat_inc(p->se.statistics.nr_forced_migrations);
        }
        return 1;
    }

    schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
    return 0;
}
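
task_hot() implements the heuristics just described. A trimmed, annotated version of the mainline source (around v5.10):

static int task_hot(struct task_struct *p, struct lb_env *env)
{
    s64 delta;

    lockdep_assert_held(&env->src_rq->lock);

    if (p->sched_class != &fair_sched_class)
        return 0;

    if (unlikely(task_has_idle_policy(p)))
        return 0;

    // buddy candidates are cache hot
    if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
            (&p->se == cfs_rq_of(&p->se)->next ||
             &p->se == cfs_rq_of(&p->se)->last))
        return 1;

    if (sysctl_sched_migration_cost == -1)
        return 1;
    if (sysctl_sched_migration_cost == 0)
        return 0;

    // cache-hot iff the task last started running within the threshold (default 0.5ms)
    delta = rq_clock_task(env->src_rq) - p->se.exec_start;

    return delta < (s64)sysctl_sched_migration_cost;
}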

八、attach_tasks()——attach the tasks detached from the busiest rq to the dst cpu

attach_tasks() simply walks the env->tasks list, removing each task and enqueueing it on the dst cpu's rq.

/*
 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their new rq.
 */
static void attach_tasks(struct lb_env *env)
{
    struct list_head *tasks = &env->tasks;
    struct task_struct *p;
    struct rq_flags rf;

    rq_lock(env->dst_rq, &rf);
    update_rq_clock(env->dst_rq);

    while (!list_empty(tasks)) {
        p = list_first_entry(tasks, struct task_struct, se.group_node);
        list_del_init(&p->se.group_node);

        attach_task(env->dst_rq, p);
    }

    rq_unlock(env->dst_rq, &rf);
}
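
attach_task(), called above for each task, enqueues it on the new rq and checks whether it should preempt the task currently running there. From the mainline source:

static void attach_task(struct rq *rq, struct task_struct *p)
{
    lockdep_assert_held(&rq->lock);

    BUG_ON(task_rq(p) != rq);
    // enqueue without another clock update (attach_tasks() already did one)
    activate_task(rq, p, ENQUEUE_NOCLOCK);
    // the migrated task may preempt whatever is running on the dst cpu
    check_preempt_curr(rq, p, 0);
}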

九、need_active_balance() decides whether active balance is needed

Decides whether to start an active balance, i.e. whether the task currently running on the src cpu should be migrated to the dst cpu: all the effort above failed to pull a runnable task, so the running one is considered next. Any of the following triggers it:

(1) a large share of the busiest cpu's capacity is consumed by non-CFS work, and the dst cpu has a set margin of spare capacity over the busiest cpu;
(2) migration_type == migrate_misfit;
(3) the sd has failed to migrate runnable tasks more than sd->cache_nice_tries + 2 times.

static int need_active_balance(struct lb_env *env)
{
    struct sched_domain *sd = env->sd;

    if (voluntary_active_balance(env))
        return 1;

    /*
     * For a non-newidle balance, incremented whenever not even one
     * runnable task could be migrated.
     * sd_init: cache_nice_tries is initialized to 1 for both MC and DIE.
     */
    return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}


static inline bool voluntary_active_balance(struct lb_env *env)
{
    struct sched_domain *sd = env->sd;

    if (asym_active_balance(env))
        return 1;

    /*
     * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
     * It's worth migrating the task if the src_cpu's capacity is reduced
     * because of other sched_class or IRQs if more capacity stays
     * available on dst_cpu.
     */
    if ((env->idle != CPU_NOT_IDLE) &&
        (env->src_rq->cfs.h_nr_running == 1)) {
        // with imbalance_pct = 117: src_rq->cpu_capacity < ~85.5% of src_rq->cpu_capacity_orig, and capacity_of(dst_cpu) > 1.17 * capacity_of(src_cpu)
        if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
            return 1;
    }

    if (env->migration_type == migrate_misfit)
        return 1;

    return 0;
}
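
The capacity test annotated above comes from check_cpu_capacity(). From the mainline source; with imbalance_pct = 117 it fires once less than roughly 85.5% (100/117) of the original capacity is left for CFS:

static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
    // cpu_capacity: capacity left for CFS; cpu_capacity_orig: full capacity
    return ((rq->cpu_capacity * sd->imbalance_pct) <
                (rq->cpu_capacity_orig * 100));
}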

static inline bool asym_active_balance(struct lb_env *env)
{
    /*
     * ASYM_PACKING needs to force migrate tasks from busy but
     * lower priority CPUs in order to pack all tasks in the
     * highest priority CPUs.
     */
    // neither DIE nor MC enables SD_ASYM_PACKING here, so this always returns false
    return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
           sched_asym_prefer(env->dst_cpu, env->src_cpu);
}

十、active_load_balance_cpu_stop()——active migration

Active migration is kicked off via stop_one_cpu_nowait(): a work item is queued for the busiest cpu's stop-class "migration/X" thread, which is then woken up. The execution flow is:

per-cpu cpu_stopper.thread --> smpboot_thread_fn --> cpu_stopper_thread --> fn(arg) --> active_load_balance_cpu_stop(busiest rq)

In other words, the active-balance function runs in a stop-class thread, the highest-priority thread on the cpu.
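
The caller side in load_balance() looks roughly like the sketch below (trimmed from the mainline source; flags here is a local of load_balance(), and error paths are omitted). It marks the busiest rq as having an active balance in flight, records the dst cpu in push_cpu, then queues the stopper work:

    raw_spin_lock_irqsave(&busiest->lock, flags);

    /*
     * ->active_balance synchronizes accesses to ->active_balance_work;
     * only one active balance may be in flight per busiest rq.
     */
    if (!busiest->active_balance) {
        busiest->active_balance = 1;
        busiest->push_cpu = this_cpu; // becomes target_cpu below
        active_balance = 1;
    }
    raw_spin_unlock_irqrestore(&busiest->lock, flags);

    if (active_balance) {
        // queue the work on busiest cpu's stopper thread and wake it up
        stop_one_cpu_nowait(cpu_of(busiest),
                active_load_balance_cpu_stop, busiest,
                &busiest->active_balance_work);
    }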

/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
//fair.c
static int active_load_balance_cpu_stop(void *data)
{
    struct rq *busiest_rq = data;
    int busiest_cpu = cpu_of(busiest_rq);
    int target_cpu = busiest_rq->push_cpu; // i.e. the dst cpu
    struct rq *target_rq = cpu_rq(target_cpu);
    struct sched_domain *sd;
    struct task_struct *p = NULL;
    struct rq_flags rf;

    rq_lock_irq(busiest_rq, &rf); // interrupts off on the src cpu
    /*
     * Between queueing the stop-work and running it is a hole in which
     * CPUs can become inactive. We should not move tasks from or to
     * inactive CPUs.
     */
    if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
        goto out_unlock;

    /* Make sure the requested CPU hasn't gone down in the meantime: */
    /*
     * busiest_rq->active_balance was set to 1 in load_balance() before
     * the active balance was triggered.
     * busiest_cpu == smp_processor_id() should always hold, since this
     * runs in busiest_cpu's per-cpu "migration/X" thread.
     */
    if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance))
        goto out_unlock;

    /* Is there any task to move? */
    /* Only the stop-class "migration/X" thread itself is running, nothing else; note rq->nr_running also counts preempted tasks */
    if (busiest_rq->nr_running <= 1)
        goto out_unlock;

    /*
     * This condition is "impossible", if it occurs we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-CPU setup.
     */
    BUG_ON(busiest_rq == target_rq);

    /* Search for an sd spanning us and the target CPU. */
    rcu_read_lock();
    // if the MC level matches, the two cpus share a cluster; otherwise they do not
    for_each_domain(target_cpu, sd) {
        if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
            break;
    }

    if (likely(sd)) {
        struct lb_env env = {
            .sd            = sd,
            .dst_cpu    = target_cpu,
            .dst_rq        = target_rq,
            .src_cpu    = busiest_rq->cpu,
            .src_rq        = busiest_rq,
            .idle        = CPU_IDLE,
            /*
             * can_migrate_task() doesn't need to compute new_dst_cpu
             * for active balancing. Since we have CPU_IDLE, but no
             * @dst_grpmask we need to make that test go away with lying
             * about DST_PINNED.
             */
            .flags        = LBF_DST_PINNED,
            .src_rq_rf    = &rf,
        };

        // account the number of active balances
        schedstat_inc(sd->alb_count);
        update_rq_clock(busiest_rq);

        p = detach_one_task(&env);
        if (p) {
            schedstat_inc(sd->alb_pushed);
            /* Active balancing done, reset the failure counter. */
            sd->nr_balance_failed = 0;
        } else {
            schedstat_inc(sd->alb_failed);
        }
    }
    rcu_read_unlock();
out_unlock:
    busiest_rq->active_balance = 0;
    rq_unlock(busiest_rq, &rf);

    if (p)
        attach_one_task(target_rq, p);

    local_irq_enable();

    return 0;
}

1. detach_one_task()

Used by active balance; dequeues exactly one task from the src rq.

/*
 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
 * part of active balancing operations within "domain".
 *
 * Returns a task if successful and NULL otherwise.
 */
static struct task_struct *detach_one_task(struct lb_env *env)
{
    struct task_struct *p;

    lockdep_assert_held(&env->src_rq->lock);

    // walk from tail to head (tasks found non-migratable during the runnable-task pass were moved to the list head)
    list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) {
        if (!can_migrate_task(p, env))
            continue;

        detach_task(p, env);

        /*
         * Right now, this is only the second place where
         * lb_gained[env->idle] is updated (other is detach_tasks)
         * so we can safely collect stats here rather than
         * inside detach_tasks().
         */
        schedstat_inc(env->sd->lb_gained[env->idle]);
        return p;
    }
    return NULL;
}

2. attach_one_task()

Used by active balance to attach a single task to the dst cpu.

/*
 * attach_one_task() -- attaches the task returned from detach_one_task() to
 * its new rq.
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
    struct rq_flags rf;

    rq_lock(rq, &rf);
    update_rq_clock(rq);
    attach_task(rq, p);
    rq_unlock(rq, &rf);
}