Kubernetes 23 -- kube-scheduler Source Code -- Prioritization Process Analysis
kubernetes/pkg/scheduler/core/generic_scheduler.go
Analysis of the prioritization (scoring) process
Entry point of the prioritization step:
priorityList, err := PrioritizeNodes(pod, g.cachedNodeInfoMap, metaPrioritiesInterface, g.prioritizers, filteredNodes, g.extenders)
Function definition:
func PrioritizeNodes(
    pod *v1.Pod,
    nodeNameToInfo map[string]*schedulercache.NodeInfo,
    meta interface{},
    priorityConfigs []algorithm.PriorityConfig,
    nodes []*v1.Node,
    extenders []algorithm.SchedulerExtender,
) (schedulerapi.HostPriorityList, error)
Function description:
// PrioritizeNodes prioritizes the nodes by running the individual priority functions in parallel.
// Each priority function is expected to set a score of 0-10
// 0 is the lowest priority score (least preferred node) and 10 is the highest
// Each priority function can also have its own weight
// The node scores returned by the priority function are multiplied by the weights to get weighted scores
// All scores are finally combined (added) to get the total weighted scores of all nodes
Each priority function returns a score of 0-10 per node and has its own weight. The priority functions run in parallel, and each node's final score is the weighted sum of the individual scores; the function returns the prioritized node list.
func PrioritizeNodes(
    pod *v1.Pod,
    nodeNameToInfo map[string]*schedulercache.NodeInfo,
    meta interface{},
    priorityConfigs []algorithm.PriorityConfig,
    nodes []*v1.Node,
    extenders []algorithm.SchedulerExtender,
) (schedulerapi.HostPriorityList, error) {
    // If no priority configs are provided, then the EqualPriority function is applied
    // This is required to generate the priority list in the required format
    if len(priorityConfigs) == 0 && len(extenders) == 0 {
        result := make(schedulerapi.HostPriorityList, 0, len(nodes))
        for i := range nodes {
            hostPriority, err := EqualPriorityMap(pod, meta, nodeNameToInfo[nodes[i].Name])
            if err != nil {
                return nil, err
            }
            result = append(result, hostPriority)
        }
        return result, nil
    }

    var (
        mu   = sync.Mutex{}
        wg   = sync.WaitGroup{}
        errs []error
    )
    appendError := func(err error) {
        mu.Lock()
        defer mu.Unlock()
        errs = append(errs, err)
    }

    results := make([]schedulerapi.HostPriorityList, len(priorityConfigs), len(priorityConfigs))

    // DEPRECATED: we can remove this when all priorityConfigs implement the
    // Map-Reduce pattern.
    for i := range priorityConfigs {
        if priorityConfigs[i].Function != nil {
            wg.Add(1)
            go func(index int) {
                defer wg.Done()
                var err error
                results[index], err = priorityConfigs[index].Function(pod, nodeNameToInfo, nodes)
                if err != nil {
                    appendError(err)
                }
            }(i)
        } else {
            results[i] = make(schedulerapi.HostPriorityList, len(nodes))
        }
    }

    workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), func(index int) {
        nodeInfo := nodeNameToInfo[nodes[index].Name]
        for i := range priorityConfigs {
            if priorityConfigs[i].Function != nil {
                continue
            }
            var err error
            results[i][index], err = priorityConfigs[i].Map(pod, meta, nodeInfo)
            if err != nil {
                appendError(err)
                results[i][index].Host = nodes[index].Name
            }
        }
    })

    for i := range priorityConfigs {
        if priorityConfigs[i].Reduce == nil {
            continue
        }
        wg.Add(1)
        go func(index int) {
            defer wg.Done()
            if err := priorityConfigs[index].Reduce(pod, meta, nodeNameToInfo, results[index]); err != nil {
                appendError(err)
            }
            if klog.V(10) {
                for _, hostPriority := range results[index] {
                    klog.Infof("%v -> %v: %v, Score: (%d)", util.GetPodFullName(pod), hostPriority.Host, priorityConfigs[index].Name, hostPriority.Score)
                }
            }
        }(i)
    }

    // Wait for all computations to be finished.
    wg.Wait()
    if len(errs) != 0 {
        return schedulerapi.HostPriorityList{}, errors.NewAggregate(errs)
    }

    // Summarize all scores.
    result := make(schedulerapi.HostPriorityList, 0, len(nodes))
    for i := range nodes {
        result = append(result, schedulerapi.HostPriority{Host: nodes[i].Name, Score: 0})
        for j := range priorityConfigs {
            result[i].Score += results[j][i].Score * priorityConfigs[j].Weight
        }
    }

    if len(extenders) != 0 && nodes != nil {
        combinedScores := make(map[string]int, len(nodeNameToInfo))
        for i := range extenders {
            if !extenders[i].IsInterested(pod) {
                continue
            }
            wg.Add(1)
            go func(extIndex int) {
                defer wg.Done()
                prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
                if err != nil {
                    // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
                    return
                }
                mu.Lock()
                for i := range *prioritizedList {
                    host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
                    if klog.V(10) {
                        klog.Infof("%v -> %v: %v, Score: (%d)", util.GetPodFullName(pod), host, extenders[extIndex].Name(), score)
                    }
                    combinedScores[host] += score * weight
                }
                mu.Unlock()
            }(i)
        }
        // wait for all go routines to finish
        wg.Wait()
        for i := range result {
            result[i].Score += combinedScores[result[i].Host]
        }
    }

    if klog.V(10) {
        for i := range result {
            klog.Infof("Host %s => Score %d", result[i].Host, result[i].Score)
        }
    }
    return result, nil
}
If no priority configs (and no extenders) are provided, the EqualPriority function is applied:
// If no priority configs are provided, then the EqualPriority function is applied
// This is required to generate the priority list in the required format
if len(priorityConfigs) == 0 && len(extenders) == 0 {
    result := make(schedulerapi.HostPriorityList, 0, len(nodes))
    for i := range nodes {
        hostPriority, err := EqualPriorityMap(pod, meta, nodeNameToInfo[nodes[i].Name])
        if err != nil {
            return nil, err
        }
        result = append(result, hostPriority)
    }
    return result, nil
}
// EqualPriorityMap is a prioritizer function that gives an equal weight of one to all nodes
func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
    node := nodeInfo.Node()
    if node == nil {
        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
    }
    return schedulerapi.HostPriority{
        Host:  node.Name,
        Score: 1,
    }, nil
}
So in this path every node gets the same default HostPriority score of 1.
Next, every node is scored by each priority function. The legacy style runs each priority function over all nodes at once, asynchronously in its own goroutine:
// DEPRECATED: we can remove this when all priorityConfigs implement the
// Map-Reduce pattern.
for i := range priorityConfigs {
    if priorityConfigs[i].Function != nil {
        wg.Add(1)
        go func(index int) {
            defer wg.Done()
            var err error
            results[index], err = priorityConfigs[index].Function(pod, nodeNameToInfo, nodes)
            if err != nil {
                appendError(err)
            }
        }(i)
    } else {
        results[i] = make(schedulerapi.HostPriorityList, len(nodes))
    }
}
// PriorityFunction is a function that computes scores for all nodes.
// DEPRECATED
// Use Map-Reduce pattern for priority functions.
type PriorityFunction func(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodes []*v1.Node) (schedulerapi.HostPriorityList, error)
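For contrast with the Map-Reduce style described next, here is a minimal, self-contained sketch of what a legacy-style priority looks like: one call receives every candidate node and returns one HostPriority per node. The NodeInfo/HostPriority types and the fewestPodsPriority rule below are simplified stand-ins for illustration, not the scheduler's real implementations.

package main

import "fmt"

// Simplified stand-ins for the scheduler's types.
type NodeInfo struct {
    Name    string
    NumPods int
}

type HostPriority struct {
    Host  string
    Score int
}

type HostPriorityList []HostPriority

// fewestPodsPriority mimics the legacy PriorityFunction shape: it scores
// all nodes in a single call (hypothetical rule: fewer pods => higher score).
func fewestPodsPriority(nodes []NodeInfo) (HostPriorityList, error) {
    const maxPriority = 10
    result := make(HostPriorityList, 0, len(nodes))
    for _, n := range nodes {
        score := maxPriority - n.NumPods
        if score < 0 {
            score = 0
        }
        result = append(result, HostPriority{Host: n.Name, Score: score})
    }
    return result, nil
}

func main() {
    nodes := []NodeInfo{{"node-a", 3}, {"node-b", 8}, {"node-c", 12}}
    list, _ := fewestPodsPriority(nodes)
    fmt.Println(list) // [{node-a 7} {node-b 2} {node-c 0}]
}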
The newer style uses the Map-Reduce pattern. The map phase is driven by workqueue.ParallelizeUntil with 16 parallel workers (goroutines, not separate processes):
workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), func(index int) {
    nodeInfo := nodeNameToInfo[nodes[index].Name]
    for i := range priorityConfigs {
        if priorityConfigs[i].Function != nil {
            continue
        }
        var err error
        results[i][index], err = priorityConfigs[i].Map(pod, meta, nodeInfo)
        if err != nil {
            appendError(err)
            results[i][index].Host = nodes[index].Name
        }
    }
})
for i := range priorityConfigs {
    if priorityConfigs[i].Reduce == nil {
        continue
    }
    wg.Add(1)
    go func(index int) {
        defer wg.Done()
        if err := priorityConfigs[index].Reduce(pod, meta, nodeNameToInfo, results[index]); err != nil {
            appendError(err)
        }
        if klog.V(10) {
            for _, hostPriority := range results[index] {
                klog.Infof("%v -> %v: %v, Score: (%d)", util.GetPodFullName(pod), hostPriority.Host, priorityConfigs[index].Name, hostPriority.Score)
            }
        }
    }(i)
}
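The map phase above relies on workqueue.ParallelizeUntil to fan the per-node work out to 16 workers. Conceptually it behaves like the following self-contained sketch, which uses only the standard library to process a fixed number of indices with a bounded number of goroutines (the per-index work is just a placeholder for priorityConfigs[i].Map):

package main

import (
    "fmt"
    "sync"
)

// parallelize runs doWorkPiece for every index in [0, pieces) using at most
// `workers` goroutines, roughly mirroring workqueue.ParallelizeUntil.
func parallelize(workers, pieces int, doWorkPiece func(int)) {
    indexCh := make(chan int, pieces)
    for i := 0; i < pieces; i++ {
        indexCh <- i
    }
    close(indexCh)

    var wg sync.WaitGroup
    for w := 0; w < workers; w++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for idx := range indexCh {
                doWorkPiece(idx)
            }
        }()
    }
    wg.Wait()
}

func main() {
    nodes := []string{"node-a", "node-b", "node-c", "node-d"}
    scores := make([]int, len(nodes))
    parallelize(16, len(nodes), func(index int) {
        // Placeholder for the real per-node map work.
        scores[index] = len(nodes[index])
    })
    fmt.Println(scores)
}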
The map and reduce function types are defined as follows:
// PriorityMapFunction is a function that computes per-node results for a given node.
// TODO: Figure out the exact API of this method.
// TODO: Change interface{} to a specific type.
type PriorityMapFunction func(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error)
// PriorityReduceFunction is a function that aggregated per-node results and computes
// final scores for all nodes.
// TODO: Figure out the exact API of this method.
// TODO: Change interface{} to a specific type.
type PriorityReduceFunction func(pod *v1.Pod, meta interface{}, nodeNameToInfo map[string]*schedulercache.NodeInfo, result schedulerapi.HostPriorityList) error
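To make the Map-Reduce split concrete, here is a minimal, self-contained sketch of a priority written in that style, using simplified types: the map step produces a raw per-node value and the reduce step rescales all values into the 0-10 range. The mostFreeMemory* names and the normalization rule are illustrative assumptions, not code from the scheduler.

package main

import "fmt"

// Simplified stand-ins for the scheduler's types.
type NodeInfo struct {
    Name       string
    FreeMemory int64 // bytes
}

type HostPriority struct {
    Host  string
    Score int
}

type HostPriorityList []HostPriority

const maxPriority = 10

// Map step: compute a raw per-node value (here: free memory in MiB).
func mostFreeMemoryMap(nodeInfo NodeInfo) (HostPriority, error) {
    return HostPriority{Host: nodeInfo.Name, Score: int(nodeInfo.FreeMemory >> 20)}, nil
}

// Reduce step: rescale the raw values so the best node ends up at maxPriority.
func mostFreeMemoryReduce(result HostPriorityList) error {
    var max int
    for _, hp := range result {
        if hp.Score > max {
            max = hp.Score
        }
    }
    if max == 0 {
        return nil
    }
    for i := range result {
        result[i].Score = result[i].Score * maxPriority / max
    }
    return nil
}

func main() {
    nodes := []NodeInfo{{"node-a", 4 << 30}, {"node-b", 2 << 30}, {"node-c", 1 << 30}}
    result := make(HostPriorityList, len(nodes))
    for i, n := range nodes {
        result[i], _ = mostFreeMemoryMap(n)
    }
    _ = mostFreeMemoryReduce(result)
    fmt.Println(result) // [{node-a 10} {node-b 5} {node-c 2}]
}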
Once all computations have finished, the total score of each node is summed up:
// Wait for all computations to be finished.
wg.Wait()
if len(errs) != 0 {
    return schedulerapi.HostPriorityList{}, errors.NewAggregate(errs)
}

// Summarize all scores.
result := make(schedulerapi.HostPriorityList, 0, len(nodes))
for i := range nodes {
    result = append(result, schedulerapi.HostPriority{Host: nodes[i].Name, Score: 0})
    for j := range priorityConfigs {
        result[i].Score += results[j][i].Score * priorityConfigs[j].Weight
    }
}
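As a worked example of the weighted sum with made-up numbers: two priorities with weights 1 and 2 score two nodes, and each node's final score is the sum of score * weight over all priorities. A self-contained sketch:

package main

import "fmt"

type HostPriority struct {
    Host  string
    Score int
}

func main() {
    nodes := []string{"node-a", "node-b"}
    // results[j][i]: score of node i under priority j (made-up values).
    results := [][]HostPriority{
        {{"node-a", 8}, {"node-b", 3}}, // e.g. a least-requested style priority
        {{"node-a", 2}, {"node-b", 9}}, // e.g. a balanced-allocation style priority
    }
    weights := []int{1, 2}

    for i, name := range nodes {
        total := 0
        for j := range results {
            total += results[j][i].Score * weights[j]
        }
        // node-a: 8*1 + 2*2 = 12, node-b: 3*1 + 9*2 = 21
        fmt.Printf("Host %s => Score %d\n", name, total)
    }
}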
Scoring by scheduler extensions is implemented via the SchedulerExtender interface:
if len(extenders) != 0 && nodes != nil {
    combinedScores := make(map[string]int, len(nodeNameToInfo))
    for i := range extenders {
        if !extenders[i].IsInterested(pod) {
            continue
        }
        wg.Add(1)
        go func(extIndex int) {
            defer wg.Done()
            prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
            if err != nil {
                // Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
                return
            }
            mu.Lock()
            for i := range *prioritizedList {
                host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
                if klog.V(10) {
                    klog.Infof("%v -> %v: %v, Score: (%d)", util.GetPodFullName(pod), host, extenders[extIndex].Name(), score)
                }
                combinedScores[host] += score * weight
            }
            mu.Unlock()
        }(i)
    }
    // wait for all go routines to finish
    wg.Wait()
    for i := range result {
        result[i].Score += combinedScores[result[i].Host]
    }
}
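The extender contribution follows the same idea: every interested extender returns its own prioritized list together with a weight, score * weight is accumulated per host into combinedScores, and the total is added on top of the built-in scores. A small self-contained sketch with made-up numbers:

package main

import "fmt"

type HostPriority struct {
    Host  string
    Score int
}

func main() {
    // Built-in weighted scores computed earlier.
    result := []HostPriority{{"node-a", 12}, {"node-b", 21}}

    // One hypothetical extender: its per-host scores and its weight.
    extenderScores := map[string]int{"node-a": 5, "node-b": 1}
    extenderWeight := 3

    combinedScores := make(map[string]int)
    for host, score := range extenderScores {
        combinedScores[host] += score * extenderWeight
    }
    for i := range result {
        result[i].Score += combinedScores[result[i].Host]
    }
    fmt.Println(result) // [{node-a 27} {node-b 24}]
}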
The SchedulerExtender interface deserves a closer look in a later post.
Analysis of the priority function implementations
The priority functions are implemented under kubernetes/pkg/scheduler/algorithm/priorities.
1. BalancedResourceAllocation: balanced resource usage; it scores a node by the variance of its CPU, memory, and volume utilization.
// BalancedResourceAllocationMap favors nodes with balanced resource usage rate.
// BalancedResourceAllocationMap should **NOT** be used alone, and **MUST** be used together
// with LeastRequestedPriority. It calculates the difference between the cpu and memory fraction
// of capacity, and prioritizes the host based on how close the two metrics are to each other.
balancedResourcePriority = &ResourceAllocationPriority{"BalancedResourceAllocation", balancedResourceScorer}
The implementation:
func balancedResourceScorer(requested, allocable *schedulercache.Resource, includeVolumes bool, requestedVolumes int, allocatableVolumes int) int64 {
    cpuFraction := fractionOfCapacity(requested.MilliCPU, allocable.MilliCPU)
    memoryFraction := fractionOfCapacity(requested.Memory, allocable.Memory)
    // This to find a node which has most balanced CPU, memory and volume usage.
    if includeVolumes && utilfeature.DefaultFeatureGate.Enabled(features.BalanceAttachedNodeVolumes) && allocatableVolumes > 0 {
        volumeFraction := float64(requestedVolumes) / float64(allocatableVolumes)
        if cpuFraction >= 1 || memoryFraction >= 1 || volumeFraction >= 1 {
            // if requested >= capacity, the corresponding host should never be preferred.
            return 0
        }
        // Compute variance for all the three fractions.
        mean := (cpuFraction + memoryFraction + volumeFraction) / float64(3)
        variance := float64((((cpuFraction - mean) * (cpuFraction - mean)) + ((memoryFraction - mean) * (memoryFraction - mean)) + ((volumeFraction - mean) * (volumeFraction - mean))) / float64(3))
        // Since the variance is between positive fractions, it will be positive fraction. 1-variance lets the
        // score to be higher for node which has least variance and multiplying it with 10 provides the scaling
        // factor needed.
        return int64((1 - variance) * float64(schedulerapi.MaxPriority))
    }

    if cpuFraction >= 1 || memoryFraction >= 1 {
        // if requested >= capacity, the corresponding host should never be preferred.
        return 0
    }
    // Upper and lower boundary of difference between cpuFraction and memoryFraction are -1 and 1
    // respectively. Multiplying the absolute value of the difference by 10 scales the value to
    // 0-10 with 0 representing well balanced allocation and 10 poorly balanced. Subtracting it from
    // 10 leads to the score which also scales from 0 to 10 while 10 representing well balanced.
    diff := math.Abs(cpuFraction - memoryFraction)
    return int64((1 - diff) * float64(schedulerapi.MaxPriority))
}

func fractionOfCapacity(requested, capacity int64) float64 {
    if capacity == 0 {
        return 1
    }
    return float64(requested) / float64(capacity)
}
There are two modes: the first also takes volume utilization into account, the second uses only CPU and memory utilization. In both cases the per-resource utilization fractions are computed, then their mean, then the variance:
cpuFraction := fractionOfCapacity(requested.MilliCPU, allocable.MilliCPU)
memoryFraction := fractionOfCapacity(requested.Memory, allocable.Memory)
volumeFraction := float64(requestedVolumes) / float64(allocatableVolumes)
mean := (cpuFraction + memoryFraction + volumeFraction) / float64(3)
variance := float64((((cpuFraction - mean) * (cpuFraction - mean)) + ((memoryFraction - mean) * (memoryFraction - mean)) + ((volumeFraction - mean) * (volumeFraction - mean))) / float64(3))
return int64((1 - variance) * float64(schedulerapi.MaxPriority))
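As a worked example with made-up fractions for the three-resource case: cpuFraction = 0.5, memoryFraction = 0.7 and volumeFraction = 0.6 give a mean of 0.6, a variance of about 0.0067, and therefore a score of int64((1 - 0.0067) * 10) = 9. A self-contained sketch of the same arithmetic (only the scoring math, without the feature-gate and type plumbing of the real scorer):

package main

import "fmt"

const maxPriority = 10

// balancedScore reproduces the variance-based scoring shown above for the
// three-fraction (CPU, memory, volume) case.
func balancedScore(cpuFraction, memoryFraction, volumeFraction float64) int64 {
    if cpuFraction >= 1 || memoryFraction >= 1 || volumeFraction >= 1 {
        return 0 // requested >= capacity: never prefer this node
    }
    mean := (cpuFraction + memoryFraction + volumeFraction) / 3
    variance := ((cpuFraction-mean)*(cpuFraction-mean) +
        (memoryFraction-mean)*(memoryFraction-mean) +
        (volumeFraction-mean)*(volumeFraction-mean)) / 3
    return int64((1 - variance) * float64(maxPriority))
}

func main() {
    fmt.Println(balancedScore(0.5, 0.7, 0.6)) // 9
    fmt.Println(balancedScore(0.6, 0.6, 0.6)) // perfectly balanced => 10
    fmt.Println(balancedScore(0.1, 0.9, 0.5)) // skewed usage => 8
}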
2. ImageLocalityPriority: scores a node by the requested container images it already holds (weighted by their size), because images that are missing must be pulled from a registry, which takes considerable time.
// ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images.
// It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10
// based on the total size of those images.
// - If none of the images are present, this node will be given the lowest priority.
// - If some of the images are present on a node, the larger their sizes' sum, the higher the node's priority.
func ImageLocalityPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
    node := nodeInfo.Node()
    if node == nil {
        return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
    }

    var score int
    if priorityMeta, ok := meta.(*priorityMetadata); ok {
        score = calculatePriority(sumImageScores(nodeInfo, pod.Spec.Containers, priorityMeta.totalNumNodes))
    } else {
        // if we are not able to parse priority meta data, skip this priority
        score = 0
    }

    return schedulerapi.HostPriority{
        Host:  node.Name,
        Score: score,
    }, nil
}
Score of a single image (its size weighted by how widely it is spread across nodes):
spread := float64(imageState.NumNodes) / float64(totalNumNodes)
int64(float64(imageState.Size) * spread)
Total score of all requested images already present on a node:
sum += scaledImageScore(state, totalNumNodes)
Scale the summed score into the 0-10 range:
int(int64(schedulerapi.MaxPriority) * (sumScores - minThreshold) / (maxThreshold - minThreshold))
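Putting the three steps together with made-up image sizes, using simplified stand-alone versions of the two helpers and clamping the sum into the threshold range (the 23 MB / 1000 MB thresholds match what the upstream constants were around this release, but treat the exact values as an assumption):

package main

import "fmt"

const (
    mb           int64 = 1024 * 1024
    maxPriority  int64 = 10
    minThreshold int64 = 23 * mb   // assumed lower bound
    maxThreshold int64 = 1000 * mb // assumed upper bound
)

// scaledImageScore: image size weighted by how widely the image is spread
// across the cluster (numNodes / totalNumNodes).
func scaledImageScore(imageSize int64, numNodes, totalNumNodes int) int64 {
    spread := float64(numNodes) / float64(totalNumNodes)
    return int64(float64(imageSize) * spread)
}

// calculatePriority: clamp the summed score into [minThreshold, maxThreshold]
// and rescale it to the 0..maxPriority range.
func calculatePriority(sumScores int64) int {
    if sumScores < minThreshold {
        sumScores = minThreshold
    } else if sumScores > maxThreshold {
        sumScores = maxThreshold
    }
    return int(maxPriority * (sumScores - minThreshold) / (maxThreshold - minThreshold))
}

func main() {
    totalNumNodes := 10
    // Two requested images already on the node: 500 MB present on 2 nodes, 200 MB present on 5 nodes.
    sum := scaledImageScore(500*mb, 2, totalNumNodes) + scaledImageScore(200*mb, 5, totalNumNodes)
    fmt.Println(calculatePriority(sum)) // 500*0.2 + 200*0.5 = 200 MB => score 1
}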
3. LeastResourceAllocation: the average of the CPU and memory free-capacity scores.
leastResourcePriority = &ResourceAllocationPriority{"LeastResourceAllocation", leastResourceScorer}
func leastResourceScorer(requested, allocable *schedulercache.Resource, includeVolumes bool, requestedVolumes int, allocatableVolumes int) int64 {
    return (leastRequestedScore(requested.MilliCPU, allocable.MilliCPU) +
        leastRequestedScore(requested.Memory, allocable.Memory)) / 2
}
The free-capacity score for a single resource:
func leastRequestedScore(requested, capacity int64) int64 {
    if capacity == 0 {
        return 0
    }
    if requested > capacity {
        return 0
    }
    return ((capacity - requested) * int64(schedulerapi.MaxPriority)) / capacity
}
The free-capacity ratio, scaled to 0-10:
((capacity - requested) * int64(schedulerapi.MaxPriority)) / capacity
The node score is the average of the CPU and memory scores:
(leastRequestedScore(requested.MilliCPU, allocable.MilliCPU) + leastRequestedScore(requested.Memory, allocable.Memory)) / 2
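A worked example with made-up requests: 2000m CPU requested out of 8000m and 4 GiB memory requested out of 16 GiB each yield a per-resource score of 7 (integer division), so the node scores 7 overall. A self-contained sketch:

package main

import "fmt"

const maxPriority int64 = 10

// leastRequestedScore: the fraction of capacity left unrequested, scaled to 0..10.
func leastRequestedScore(requested, capacity int64) int64 {
    if capacity == 0 || requested > capacity {
        return 0
    }
    return (capacity - requested) * maxPriority / capacity
}

func main() {
    cpuScore := leastRequestedScore(2000, 8000)    // millicores: (6000*10)/8000 = 7
    memScore := leastRequestedScore(4<<30, 16<<30) // bytes: (12Gi*10)/16Gi = 7
    fmt.Println((cpuScore + memScore) / 2)         // (7+7)/2 = 7
}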
The remaining priority functions can be explored under:
kubernetes/pkg/scheduler/algorithm/priorities