Linux核心原始碼—CFS排程(4.20.17)
cfs_rq
每個 cpu 都有一個對應的執行佇列 rq,在 rq 中維護著不同調度策略的排程佇列。
/*
 * Abridged view of the per-CPU run queue; "..." marks members elided by
 * the article. Only the embedded per-policy sub-queues are shown.
 */
struct rq {
	...
	struct cfs_rq cfs;	/* CFS scheduling queue */
	struct rt_rq rt;	/* presumably the real-time queue -- TODO confirm */
	struct dl_rq dl;	/* presumably the deadline queue -- TODO confirm */
	...
};
cfs 的排程佇列通過紅黑樹維護。在 cfs_rq 的資料結構中,struct rb_root_cached tasks_timeline 包含了紅黑樹的根 struct rb_root rb_root,以及最左節點(即 vruntime 最小的節點)的快取 struct rb_node *rb_leftmost。
struct cfs_rq { struct load_weight load; //CFS執行佇列的負載權重值 unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; u64 exec_clock; u64 min_vruntime; #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif struct rb_root_cached tasks_timeline; //紅黑樹,維護排程實體 /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; //當前執行的排程實體 struct sched_entity *next; //下一個排程實體 struct sched_entity *last; //佇列中最後的排程實體 struct sched_entity *skip; //跳過的排程實體 #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; #endif #ifdef CONFIG_SMP /* * CFS load tracking */ struct sched_avg avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; unsigned long runnable_sum; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; /* * h_load = weight * f(tg) * * Where f(tg) is the recursive weight fraction assigned to * this group. */ unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. * This list is used during load balance. 
*/ int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; int expires_seq; u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock; u64 throttled_clock_task; u64 throttled_clock_task_time; int throttled; int throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ };
vruntime
那麼 CFS 是根據什麼來對任務進行排序呢?答案是虛擬執行時間 vruntime。
update_curr 函式(/kernel/sched/fair.c)實現了 vruntime 的更新,其步驟是先計算出當前程序自上次更新以來的執行時間 delta_exec,再根據當前程序的權重對 delta_exec 進行加權運算,並把結果累加到 vruntime 上。
static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; //獲取當前排程實體 u64 now = rq_clock_task(rq_of(cfs_rq)); //獲取當前時間 u64 delta_exec; if (unlikely(!curr)) return; delta_exec = now - curr->exec_start; //計算當前程序已執行的時間,exec_start是排程實體的開始執行時間 if (unlikely((s64)delta_exec <= 0)) return; curr->exec_start = now; schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; //修改排程實體已執行總時間 schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); //修改排程實體虛擬執行時間 update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { //如果排程實體是task,也要給它的排程組記錄執行時間 struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); }
calc_delta_fair(delta_exec, curr) 實現了虛擬執行時間的計算:
虛擬執行時間 = delta_exec * NICE_0_LOAD / 當前程序的權重
而具體在 __calc_delta 中,是通過 (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 實現的:lw->inv_weight 是預先算好的權重倒數(約為 2^32 / lw->weight),因此可以用定點乘法加右移來取代除法,既不需要浮點運算,也避免了昂貴的 64 位元除法。
從公式可以得出,如果一個程序的虛擬執行時間越小,說明它實際執行的時間越少,或者它的權重越大,那麼就應該具有更高的優先度。紅黑樹以排程實體的 vruntime 作為鍵值排序,每次選擇 vruntime 最小的程序執行;該節點就是樹中的最左節點(不一定是葉子節點),並快取在 struct rb_node *rb_leftmost 中。
/*
 * Scale a real execution delta into a virtual one for @se.
 *
 * An entity whose load weight equals NICE_0_LOAD gets @delta back
 * unchanged; any other weight goes through __calc_delta(), which is
 * passed NICE_0_LOAD and the entity's load.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	/* Uncommon case: the weight differs from the nice-0 reference. */
	if (unlikely(se->load.weight != NICE_0_LOAD))
		return __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
程序選擇
在程序變為可執行狀態(被喚醒)或者是通過 fork() 呼叫第一次建立程序時,需要將程序插入紅黑樹,呼叫 __enqueue_entity 實現這一過程。刪除節點也是同樣的道理。
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; //紅黑樹根節點 struct rb_node *parent = NULL; struct sched_entity *entry; bool leftmost = true; /* * Find the right place in the rbtree: */ while (*link) { parent = *link; entry = rb_entry(parent, struct sched_entity, run_node); //rb_entry 只是 container_of 的封裝而已,找到首地址 /* * We dont care about collisions. Nodes with * the same key stay together. */ if (entity_before(se, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(&se->run_node, parent, link); //在紅黑樹中插入節點 rb_insert_color_cached(&se->run_node, //設定節點的顏色 &cfs_rq->tasks_timeline, leftmost); }
程序排程
程序排程的主要入口點是函式 schedule(/kernel/sched/core.c),它通過 pick_next_task() 函式選擇下一個程序,如果選出來的程序與當前執行程序不一致,則呼叫 context_switch() 函式進行上下文切換。
/*
 * Abridged excerpt of __schedule(); "..." marks code (including the
 * local variable declarations) elided by the article.
 *
 * Picks the next task for this CPU's run queue and switches to it when
 * it differs from the task that was running.
 */
static void __sched notrace __schedule(bool preempt)
{
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;	/* task currently running on this rq */
	...
	next = pick_next_task(rq, prev, &rf);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();

	if (likely(prev != next)) {
		...
		/* A different task was picked: switch context to it. */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		/* Same task keeps running: clear the skip flags and unlock. */
		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
		rq_unlock_irq(rq, &rf);
	}
	...
}
pick_next_task() 函式的實現並不複雜,這裡用到了一點優化:如果所有的可執行程序都在 cfs 中(且 prev 屬於 fair 或 idle 排程類),那麼就可以直接呼叫 cfs 的 pick_next_task();否則就需要按照排程類(sched_class)的優先順序由高到低依次嘗試,直到某個排程類選出程序為止。
static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ if (likely((prev->sched_class == &idle_sched_class || prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; return p; } } /* The idle class should always have a runnable task: */ BUG(); }
References: