Linux核心原始碼—CFS排程（4.20.17）

阿新 • • 發佈：2022-03-08

cfs_rq

每個 cpu 都有一個對應的執行佇列 rq，在 rq 中維護著不同調度策略的排程佇列。

/*
 * Per-CPU run queue (abridged excerpt: "..." marks members omitted here).
 * It embeds one sub-queue per scheduling class; only three are shown.
 */
struct rq {
        ...
    struct cfs_rq       cfs;	/* run queue for the fair (CFS) class */
    struct rt_rq        rt;	/* run queue for the real-time class */
    struct dl_rq        dl;	/* run queue for the deadline class */
        ...   
};

cfs的排程佇列通過紅黑樹維護，在 cfs_rq 的資料結構中，struct rb_root_cached tasks_timeline 包含了紅黑樹 struct rb_root rb_root 和最左節點快取 struct rb_node *rb_leftmost（最左節點沒有左子節點，但不一定是葉子節點）。

/*
 * CFS run queue: load/accounting counters, the red-black tree of scheduling
 * entities (tasks_timeline), pointers to notable entities, and optional
 * state compiled in by the CONFIG_SMP, CONFIG_FAIR_GROUP_SCHED and
 * CONFIG_CFS_BANDWIDTH blocks below.
 */
struct cfs_rq {
	struct load_weight	load;  // load weight of this CFS run queue
	unsigned long		runnable_weight;
	unsigned int		nr_running;
	unsigned int		h_nr_running;

	u64			exec_clock;
	u64			min_vruntime;
#ifndef CONFIG_64BIT
	/* Extra copy of min_vruntime, present only on non-64-bit builds. */
	u64			min_vruntime_copy;
#endif

	struct rb_root_cached	tasks_timeline;  // red-black tree holding the scheduling entities

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity	*curr;  // currently running scheduling entity
	struct sched_entity	*next;  // "next" entity -- NOTE(review): presumably a hint for the next pick; confirm in fair.c
	struct sched_entity	*last;  // "last" entity -- NOTE(review): exact meaning not shown here; confirm in fair.c
	struct sched_entity	*skip;  // entity to be skipped -- NOTE(review): confirm semantics in fair.c

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int		nr_spread_over;
#endif

#ifdef CONFIG_SMP
	/*
	 * CFS load tracking
	 */
	struct sched_avg	avg;
#ifndef CONFIG_64BIT
	u64			load_last_update_time_copy;
#endif
	/* Sums pending for removal, protected by their own lock. */
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
	unsigned long		tg_load_avg_contrib;
	long			propagate;
	long			prop_runnable_sum;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long		h_load;
	u64			last_h_load_update;
	struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	 */
	int			on_list;
	struct list_head	leaf_cfs_rq_list;
	struct task_group	*tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
	/* Bandwidth-control (runtime quota / throttling) state. */
	int			runtime_enabled;
	int			expires_seq;
	u64			runtime_expires;
	s64			runtime_remaining;

	u64			throttled_clock;
	u64			throttled_clock_task;
	u64			throttled_clock_task_time;
	int			throttled;
	int			throttle_count;
	struct list_head	throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

vruntime

那麼 CFS 是根據什麼來對任務進行排序呢？答案是虛擬執行時間 vruntime。

update_curr 函式（/kernel/sched/fair.c）實現了 vruntime 的更新，其步驟是先計算出當前程序自上次更新以來的執行時間 delta_exec，再根據當前排程實體的權重對 delta_exec 進行加權運算，並將結果累加到 vruntime 上。

/*
 * Charge the time that has elapsed since the last update to the entity
 * currently running on @cfs_rq: advance its real and virtual runtime,
 * refresh the queue's min_vruntime and feed the accounting hooks.
 * Does nothing when no entity is running or no time has elapsed.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;  // entity currently running on this queue
	u64 now = rq_clock_task(rq_of(cfs_rq));  // current task clock of the owning rq
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;  // time run since exec_start was last stamped
	/* Bail out if the clock did not advance (or went backwards as s64). */
	if (unlikely((s64)delta_exec <= 0))
		return;

	/* Restamp so the next call only charges the time after 'now'. */
	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;  // accumulate total real runtime of the entity
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);  // advance virtual runtime by the weighted delta
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {  // for a task, also account the runtime to its cgroup and thread group
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

calc_delta_fair(delta_exec, curr) 實現了虛擬執行時間的計算：

虛擬執行時間 = delta_exec * NICE_0_LOAD / 當前程序的權重

而具體在 __calc_delta 中，是通過 (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 實現的：inv_weight 是預先計算好的權重倒數（約為 2^32 / lw->weight），先乘以該倒數再右移 WMULT_SHIFT 位，用乘法和移位代替了開銷較大的除法運算。

從公式可以得出，如果一個程序的虛擬執行時間越小，說明實際執行的時間越少或者是程序的權重越大，那麼就應該具有更高的優先度。而紅黑樹以程序的 vruntime 值作為鍵值排序，每次選擇 vruntime 最小的程序執行，該節點的指標快取在 struct rb_node *rb_leftmost（最左節點）中。

/*
 * Convert a real execution delta into the amount to add to @se's vruntime.
 *
 * An entity whose load weight equals NICE_0_LOAD gets @delta back unchanged;
 * any other weight is scaled through __calc_delta() with NICE_0_LOAD as the
 * reference weight.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	/* Uncommon case: weight differs from the nice-0 reference. */
	if (unlikely(se->load.weight != NICE_0_LOAD))
		return __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

程序選擇

在程序變為可執行狀態（被喚醒）或者是通過 fork() 呼叫第一次建立程序時，需要將程序插入紅黑樹，呼叫 __enqueue_entity 實現這一過程。刪除節點也是同樣的道理。

/*
 * Insert @se into the rbtree of @cfs_rq.
 *
 * Walks down from the root, going left whenever entity_before() says @se
 * sorts ahead of the node being visited and right otherwise, then links the
 * node at the empty slot found. Equal keys are not special-cased: such nodes
 * simply end up next to each other.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **slot = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *other;
	bool is_leftmost = true;

	/* Descend until an empty child slot is reached. */
	while (*slot) {
		parent = *slot;
		/* rb_entry() maps the rb_node back to its enclosing entity. */
		other = rb_entry(parent, struct sched_entity, run_node);

		if (entity_before(se, other)) {
			slot = &parent->rb_left;
			continue;
		}

		/* Any right turn means @se cannot be the leftmost node. */
		slot = &parent->rb_right;
		is_leftmost = false;
	}

	/* Hook the node into the slot, then let the rbtree code fix up the tree. */
	rb_link_node(&se->run_node, parent, slot);
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, is_leftmost);
}

程序排程

程序排程的主要入口點是函式 schedule(/kernel/sched/core.c)，它通過 pick_next_task() 函式選擇下一個程序，如果選出來的程序與當前執行程序不一致，則呼叫 context_switch() 函式進行上下文切換。

/*
 * Core of the scheduler entry point (abridged excerpt: local declarations
 * and several steps are elided and marked with "...").
 *
 * Looks up this CPU's run queue, asks pick_next_task() for the task to run
 * next, and switches context only when that task differs from the one
 * currently running.
 */
static void __sched notrace __schedule(bool preempt)
{
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;  // task currently running on this run queue

        ...

	next = pick_next_task(rq, prev, &rf);
	/* The reschedule request has been honoured; clear the pending flags. */
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();

	if (likely(prev != next)) {
                ...
		rq = context_switch(rq, prev, next, &rf);
	} else {
		/* Same task keeps running: clear the clock-skip flags and unlock. */
		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
		rq_unlock_irq(rq, &rf);
	}
        ...
}

pick_next_task() 函式的實現並不複雜，這裡用到了一點優化：如果上一個程序 prev 屬於 cfs 或 idle 排程類，並且所有的可執行程序都在 cfs 中（rq->nr_running == rq->cfs.h_nr_running），那麼就可以直接呼叫 cfs 的 pick_next_task()；否則就需要按照排程類的優先順序依次選擇。

/*
 * Select the task that should run next on @rq.
 *
 * Fast path: when @prev belongs to the idle or fair class and every runnable
 * task on @rq is accounted in the CFS queue, the fair class is asked
 * directly, falling back to the idle class if it has nothing to offer.
 * The fast path is restricted to those @prev classes because a higher class
 * would otherwise lose its chance to pull in more work from other CPUs.
 *
 * Slow path: every scheduling class is tried in for_each_class() order and
 * the first task returned wins. A class returning RETRY_TASK restarts the
 * walk from the beginning.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;

	/* @prev is in neither the idle nor the fair class: take the full walk. */
	if (unlikely(prev->sched_class != &idle_sched_class &&
		     prev->sched_class != &fair_sched_class))
		goto again;

	/* Some runnable task lives outside CFS: take the full walk. */
	if (unlikely(rq->nr_running != rq->cfs.h_nr_running))
		goto again;

	p = fair_sched_class.pick_next_task(rq, prev, rf);
	if (unlikely(p == RETRY_TASK))
		goto again;

	if (likely(p))
		return p;

	/* Assumes fair_sched_class->next == idle_sched_class */
	return idle_sched_class.pick_next_task(rq, prev, rf);

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev, rf);
		if (!p)
			continue;

		if (unlikely(p == RETRY_TASK))
			goto again;

		return p;
	}

	/* The idle class should always have a runnable task: */
	BUG();
}

References：

Linux核心原始碼—CFS排程（4.20.17）

cfs_rq

vruntime

程序選擇

程序排程

Linux核心原始碼—CFS排程（4.20.17）

Linux核心原始碼—程序排程（4.20.17）

Linux 核心：裝置樹（4）裝置樹中各個節點是誰轉換的

Dirty-Pipe Linux核心提權漏洞（CVE-2022-0847）

深入Linux 核心架構之 CFS linux核心分析——CFS（完全公平排程演算法）

探索SpringBoot-一起看看Spring核心原始碼之refresh（九）

探索SpringBoot-一起看看Spring核心原始碼之BeanFactory（七）

linux核心學習---準備工作（針對VM虛擬機器ubuntu系統磁碟空間不足進行硬碟容量擴充套件）

鴻蒙核心原始碼分析(排程機制篇)|解讀鴻蒙原始碼

Linux核心原始碼分析之set_arch (一)

Linux核心原始碼分析之setup_arch (二)

二本畢業三年，憑什麼拿到騰訊T8offer？——Linux核心面試題整理（含答案）

HBase 原始碼學習 ---- Flush（4）

原裝進口 + 幸福口感：德運全脂牛奶 20 斤 79 元新低（4 元 / 斤）

Linux 核心：裝置樹（1）dtb格式

MySQL優化原理分析及優化方案總結，linux核心原始碼詳解

Linux 核心中斷體系結構（1）

每日構造/DP（4.20）

linux高效能伺服器程式設計---第五章Linux網路程式設計基礎API （4）

Netty原始碼研究筆記（4）——EventLoop系列

Linux核心原始碼—CFS排程（4.20.17）

cfs_rq

vruntime

程序選擇

程序排程

相關推薦