Skip to content

Commit 1ab5c99

Browse files
authored
koord-scheduler: support default preferredCPUBindPolicy for LSE/LSR Pod if not specified (#354)
Signed-off-by: Joseph <joseph.t.lee@outlook.com>
1 parent 171ad3e commit 1ab5c99

File tree

8 files changed

+159
-49
lines changed

8 files changed

+159
-49
lines changed

apis/extension/resource.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ type ResourceStatus struct {
6464
type CPUBindPolicy = schedulingconfig.CPUBindPolicy
6565

6666
const (
67-
// CPUBindPolicyNone does not perform any bind policy
68-
CPUBindPolicyNone CPUBindPolicy = schedulingconfig.CPUBindPolicyNone
67+
// CPUBindPolicyDefault performs the default bind policy that is specified in the koord-scheduler configuration
68+
CPUBindPolicyDefault CPUBindPolicy = schedulingconfig.CPUBindPolicyDefault
6969
// CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores
7070
CPUBindPolicyFullPCPUs CPUBindPolicy = schedulingconfig.CPUBindPolicyFullPCPUs
7171
// CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores
@@ -85,7 +85,7 @@ type CPUSharedPool struct {
8585
// GetResourceSpec parses ResourceSpec from annotations
8686
func GetResourceSpec(annotations map[string]string) (*ResourceSpec, error) {
8787
resourceSpec := &ResourceSpec{
88-
PreferredCPUBindPolicy: schedulingconfig.CPUBindPolicyNone,
88+
PreferredCPUBindPolicy: schedulingconfig.CPUBindPolicyDefault,
8989
}
9090
data, ok := annotations[AnnotationResourceSpec]
9191
if !ok {

apis/scheduling/config/types.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,17 @@ type ScoringStrategy struct {
7373
type NodeNUMAResourceArgs struct {
7474
metav1.TypeMeta
7575

76-
PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"`
77-
NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"`
78-
ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"`
76+
DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"`
77+
NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"`
78+
ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"`
7979
}
8080

8181
// CPUBindPolicy defines the CPU binding policy
8282
type CPUBindPolicy string
8383

8484
const (
85-
// CPUBindPolicyNone does not perform any bind policy
86-
CPUBindPolicyNone CPUBindPolicy = "None"
85+
// CPUBindPolicyDefault performs the default bind policy that is specified in the koord-scheduler configuration
86+
CPUBindPolicyDefault CPUBindPolicy = "Default"
8787
// CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores
8888
CPUBindPolicyFullPCPUs CPUBindPolicy = "FullPCPUs"
8989
// CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores

apis/scheduling/config/v1beta2/defaults.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ func SetDefaults_LoadAwareSchedulingArgs(obj *LoadAwareSchedulingArgs) {
7474

7575
// SetDefaults_NodeNUMAResourceArgs sets the default parameters for NodeNUMANodeResource plugin.
7676
func SetDefaults_NodeNUMAResourceArgs(obj *NodeNUMAResourceArgs) {
77-
if obj.PreferredCPUBindPolicy == "" {
78-
obj.PreferredCPUBindPolicy = defaultPreferredCPUBindPolicy
77+
if obj.DefaultCPUBindPolicy == "" {
78+
obj.DefaultCPUBindPolicy = defaultPreferredCPUBindPolicy
7979
}
8080
if obj.NUMAAllocateStrategy == "" {
8181
obj.NUMAAllocateStrategy = defaultNUMAAllocateStrategy

apis/scheduling/config/v1beta2/types.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,9 @@ type ScoringStrategy struct {
7373
type NodeNUMAResourceArgs struct {
7474
metav1.TypeMeta `json:",inline"`
7575

76-
PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"`
77-
NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"`
78-
ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"`
76+
DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"`
77+
NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"`
78+
ScoringStrategy *ScoringStrategy `json:"scoringStrategy,omitempty"`
7979
}
8080

8181
// CPUBindPolicy defines the CPU binding policy

apis/scheduling/config/v1beta2/zz_generated.conversion.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/proposals/scheduling/20220530-fine-grained-cpu-orchestration.md

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ reviewers:
1111
- "@stormgbs"
1212
- "@zwzhang0107"
1313
creation-date: 2022-05-30
14-
last-updated: 2022-06-24
14+
last-updated: 2022-07-11
1515
status: provisional
1616

1717
---
@@ -190,8 +190,8 @@ type ResourceSpec struct {
190190
type CPUBindPolicy string
191191

192192
const (
193-
// CPUBindPolicyNone does not perform any bind policy
194-
CPUBindPolicyNone CPUBindPolicy = "None"
193+
// CPUBindPolicyDefault performs the default bind policy that is specified in the koord-scheduler configuration
194+
CPUBindPolicyDefault CPUBindPolicy = "Default"
195195
// CPUBindPolicyFullPCPUs favor cpuset allocation that pack in few physical cores
196196
CPUBindPolicyFullPCPUs CPUBindPolicy = "FullPCPUs"
197197
// CPUBindPolicySpreadByPCPUs favor cpuset allocation that evenly allocate logical cpus across physical cores
@@ -203,8 +203,8 @@ const (
203203
type CPUExclusivePolicy string
204204

205205
const (
206-
// CPUExclusivePolicyNone does not perform any exclusive policy
207-
CPUExclusivePolicyNone CPUExclusivePolicy = "None"
206+
// CPUExclusivePolicyDefault performs the default exclusive policy that is specified in the koord-scheduler configuration
207+
CPUExclusivePolicyDefault CPUExclusivePolicy = "Default"
208208
// CPUExclusivePolicyPCPULevel represents mutual exclusion in the physical core dimension
209209
CPUExclusivePolicyPCPULevel CPUExclusivePolicy = "PCPULevel"
210210
// CPUExclusivePolicyNUMANodeLevel indicates mutual exclusion in the NUMA topology dimension
@@ -213,13 +213,13 @@ const (
213213
```
214214

215215
- The `CPUBindPolicy` defines the CPU binding policy. The specific values are defined as follows:
216-
- `CPUBindPolicyNone` or empty value does not perform any bind policy. It is completely determined by the scheduler plugin configuration.
216+
- `CPUBindPolicyDefault` or an empty value performs the default bind policy that is specified in the koord-scheduler configuration.
217217
- `CPUBindPolicyFullPCPUs` is a bin-packing policy, similar to the `full-pcpus-only=true` option defined by the kubelet, that allocates full physical cores. However, if the number of remaining logical CPUs in the node is sufficient but the number of full physical cores is insufficient, the allocation will continue. This policy can effectively avoid the noisy neighbor problem.
218218
- `CPUBindPolicySpreadByPCPUs` is a spread policy. If the node has Hyper-Threading enabled, when this policy is adopted, the scheduler will evenly allocate logical CPUs across physical cores. For example, the current node has 8 physical cores and 16 logical CPUs. When a Pod requires 8 logical CPUs and the `CPUBindPolicySpreadByPCPUs` policy is adopted, the scheduler will allocate a logical CPU from each physical core. This policy is mainly used by some latency-sensitive applications with multiple different peak-to-valley characteristics. It not only allows the application to fully use the CPU at certain times, but also keeps it from being disturbed by the application on the same physical core. So the noisy neighbor problem may arise when using this policy.
219219
- `CPUBindPolicyConstrainedBurst` is a special policy that mainly helps K8s Burstable/Koordinator LS Pods get better performance. When using the policy, koord-scheduler is filtering out Nodes that have NUMA Nodes with suitable CPU Shared Pool by Pod Limit. After the scheduling is successful, the scheduler will update `scheduling.koordinator.sh/resource-status` in the Pod, declaring the `CPU Shared Pool` to be bound. The koordlet binds the CPU Shared Pool of the corresponding NUMA Node according to the `CPU Shared Pool`.
220220
- If `kubelet.koordinator.sh/cpu-manager-policy` in `NodeResourceTopology` has option `full-pcpus-only=true`, or `node.koordinator.sh/cpu-bind-policy` in the Node with the value `PCPUOnly`, the koord-scheduler will check whether the number of CPU requests of the Pod meets the `SMT-alignment` requirements, so as to avoid being rejected by the kubelet after scheduling. koord-scheduler will avoid such nodes if the Pod uses the `CPUBindPolicySpreadByPCPUs` policy or the number of logical CPUs mapped to the number of physical cores is not an integer.
221221
- The `CPUExclusivePolicy` defines the CPU exclusive policy, it can help users to avoid noisy neighbor problems. The specific values are defined as follows:
222-
- `CPUExclusivePolicyNone` or empty value does not perform any isolate policy. It is completely determined by the scheduler plugin configuration.
222+
- `CPUExclusivePolicyDefault` or an empty value performs the default exclusive policy that is specified in the koord-scheduler configuration.
223223
- `CPUExclusivePolicyPCPULevel`. When allocating logical CPUs, try to avoid physical cores that have already been applied for by the same exclusive policy. It is a supplement to the `CPUBindPolicySpreadByPCPUs` policy.
224224
- `CPUExclusivePolicyNUMANodeLevel`. When allocating logical CPUs, try to avoid NUMA Nodes that have already been applied for by the same exclusive policy. If there is no NUMA Node that satisfies the policy, downgrade to `PCPU` policy.
225225

@@ -646,8 +646,7 @@ The following is an approximate brief algorithm logic:
646646
type CPUOrchestrationPluginArgs struct {
647647
metav1.TypeMeta
648648

649-
PreferredCPUBindPolicy CPUBindPolicy `json:"preferredCPUBindPolicy,omitempty"`
650-
PreferredCPUExclusivePolicy CPUExclusivePolicy `json:"preferredCPUExclusivePolicy,omitempty"`
649+
DefaultCPUBindPolicy CPUBindPolicy `json:"defaultCPUBindPolicy,omitempty"`
651650
NUMATopologyAlignmentPolicy NUMATopologyAlignmentPolicy `json:"numaTopologyAlignmentPolicy,omitempty"`
652651
NUMAAllocateStrategy NUMAAllocateStrategy `json:"numaAllocateStrategy,omitempty"`
653652

@@ -677,11 +676,10 @@ type ScoringStrategy struct {
677676
}
678677
```
679678

680-
- `CPUBindPolicy` represents the default bind policy. If not set, use `PCPUFirst` as default value.
681-
- `CPUExclusivePolicy` represents the default exclusive policy. There is no default value.
679+
- `DefaultCPUBindPolicy` represents the default bind policy. If not set, use `FullPCPUs` as default value.
682680
- `NUMATopologyAlignmentPolicy` represents the default NUMA topology alignment policy. If not set, use `BestEffort` as default value.
683-
- `ScoringStrategy` represents the node resource scoring strategy. If not set, use `MostAllocated` as default value.
684681
- `NUMAAllocateStrategy` represents the default NUMA allocate strategy. If not set, use `MostAllocated` as default value.
682+
- `ScoringStrategy` represents the node resource scoring strategy. If not set, use `MostAllocated` as default value.
685683

686684
## Alternatives
687685

@@ -702,3 +700,4 @@ type ScoringStrategy struct {
702700
- Add details about how to process newly created K8s Guaranteed Pod
703701
- Support Burstable Pod statically bind CPU
704702
- 2022-06-24: Fix typo
703+
- 2022-07-11: Adjust CPUBindPolicyNone to CPUBindPolicyDefault

pkg/scheduler/plugins/nodenumaresource/plugin.go

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,11 @@ func New(args runtime.Object, handle framework.Handle) (framework.Plugin, error)
105105
func (p *Plugin) Name() string { return Name }
106106

107107
type preFilterState struct {
108-
skip bool
109-
resourceSpec *extension.ResourceSpec
110-
numCPUsNeeded int
111-
allocatedCPUs CPUSet
108+
skip bool
109+
resourceSpec *extension.ResourceSpec
110+
preferredCPUBindPolicy extension.CPUBindPolicy
111+
numCPUsNeeded int
112+
allocatedCPUs CPUSet
112113
}
113114

114115
func (s *preFilterState) Clone() framework.StateData {
@@ -132,8 +133,12 @@ func (p *Plugin) PreFilter(ctx context.Context, cycleState *framework.CycleState
132133
qosClass := extension.GetPodQoSClass(pod)
133134
priorityClass := extension.GetPriorityClass(pod)
134135
if (qosClass == extension.QoSLSE || qosClass == extension.QoSLSR) && priorityClass == extension.PriorityProd {
135-
if resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs ||
136-
resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicySpreadByPCPUs {
136+
preferredCPUBindPolicy := resourceSpec.PreferredCPUBindPolicy
137+
if preferredCPUBindPolicy == "" || preferredCPUBindPolicy == schedulingconfig.CPUBindPolicyDefault {
138+
preferredCPUBindPolicy = p.pluginArgs.DefaultCPUBindPolicy
139+
}
140+
if preferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs ||
141+
preferredCPUBindPolicy == schedulingconfig.CPUBindPolicySpreadByPCPUs {
137142
requests, _ := resourceapi.PodRequestsAndLimits(pod)
138143
requestedCPU := requests.Cpu().MilliValue()
139144
if requestedCPU%1000 != 0 {
@@ -143,6 +148,7 @@ func (p *Plugin) PreFilter(ctx context.Context, cycleState *framework.CycleState
143148
if requestedCPU > 0 {
144149
state.skip = false
145150
state.resourceSpec = resourceSpec
151+
state.preferredCPUBindPolicy = preferredCPUBindPolicy
146152
state.numCPUsNeeded = int(requestedCPU / 1000)
147153
}
148154
}
@@ -226,11 +232,11 @@ func (p *Plugin) Score(ctx context.Context, cycleState *framework.CycleState, po
226232
return 0, nil
227233
}
228234

229-
score := p.calcScore(state.numCPUsNeeded, state.resourceSpec, numaInfo)
235+
score := p.calcScore(state.numCPUsNeeded, state.preferredCPUBindPolicy, numaInfo)
230236
return score, nil
231237
}
232238

233-
func (p *Plugin) calcScore(numCPUsNeeded int, resourceSpec *extension.ResourceSpec, numaInfo *nodeNUMAInfo) int64 {
239+
func (p *Plugin) calcScore(numCPUsNeeded int, preferredCPUBindPolicy extension.CPUBindPolicy, numaInfo *nodeNUMAInfo) int64 {
234240
availableCPUs, allocated := getAvailableCPUsFunc(numaInfo)
235241
acc := newCPUAccumulator(
236242
numaInfo.cpuTopology,
@@ -242,7 +248,7 @@ func (p *Plugin) calcScore(numCPUsNeeded int, resourceSpec *extension.ResourceSp
242248
)
243249

244250
var freeCPUs [][]int
245-
if resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs {
251+
if preferredCPUBindPolicy == schedulingconfig.CPUBindPolicyFullPCPUs {
246252
if numCPUsNeeded <= numaInfo.cpuTopology.CPUsPerNode() {
247253
freeCPUs = acc.freeCoresInNode(true)
248254
} else if numCPUsNeeded <= numaInfo.cpuTopology.CPUsPerSocket() {
@@ -347,7 +353,7 @@ func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState,
347353
availableCPUs,
348354
allocated,
349355
state.numCPUsNeeded,
350-
state.resourceSpec.PreferredCPUBindPolicy,
356+
state.preferredCPUBindPolicy,
351357
false,
352358
p.pluginArgs.NUMAAllocateStrategy,
353359
)
@@ -410,6 +416,19 @@ func (p *Plugin) PreBind(ctx context.Context, cycleState *framework.CycleState,
410416
}
411417
pod.Annotations[extension.AnnotationResourceStatus] = string(data)
412418

419+
// Write back ResourceSpec annotation if LSR Pod hasn't specified CPUBindPolicy
420+
if state.resourceSpec.PreferredCPUBindPolicy == "" ||
421+
state.resourceSpec.PreferredCPUBindPolicy == schedulingconfig.CPUBindPolicyDefault {
422+
resourceSpec := &extension.ResourceSpec{
423+
PreferredCPUBindPolicy: p.pluginArgs.DefaultCPUBindPolicy,
424+
}
425+
data, err = json.Marshal(resourceSpec)
426+
if err != nil {
427+
return framework.NewStatus(framework.Error, err.Error())
428+
}
429+
pod.Annotations[extension.AnnotationResourceSpec] = string(data)
430+
}
431+
413432
patchBytes, err := generatePodPatch(podOriginal, pod)
414433
if err != nil {
415434
return framework.NewStatus(framework.Error, err.Error())

0 commit comments

Comments
 (0)