Skip to content

Commit 363200d

Browse files
committed
koord-scheduler: support Reservation allocate policy
Signed-off-by: Joseph <joseph.t.lee@outlook.com>
1 parent 9449d56 commit 363200d

File tree

20 files changed

+2485
-421
lines changed

20 files changed

+2485
-421
lines changed

apis/scheduling/v1alpha1/reservation_types.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,31 @@ type ReservationSpec struct {
6262
// +kubebuilder:default=true
6363
// +optional
6464
AllocateOnce *bool `json:"allocateOnce,omitempty"`
65+
// AllocatePolicy represents the allocation policy of reserved resources that Reservation expects.
66+
// +kubebuilder:validation:Enum=Aligned;Restricted
67+
// +optional
68+
AllocatePolicy ReservationAllocatePolicy `json:"allocatePolicy,omitempty"`
6569
}
6670

71+
type ReservationAllocatePolicy string
72+
73+
const (
74+
// ReservationAllocatePolicyDefault means that there is no restriction on the policy of reserved resources,
75+
// and allocated from the Reservation first, and if it is insufficient, it is allocated from the node.
76+
ReservationAllocatePolicyDefault ReservationAllocatePolicy = ""
77+
// ReservationAllocatePolicyAligned indicates that the Pod allocates resources from the Reservation first.
78+
// If the remaining resources of the Reservation are insufficient, it can be allocated from the node,
79+
// but it is required to strictly follow the resource specifications of the Pod.
80+
// This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
81+
ReservationAllocatePolicyAligned ReservationAllocatePolicy = "Aligned"
82+
// ReservationAllocatePolicyRestricted means that the resources
83+
// requested by the Pod overlap with the resources reserved by the Reservation,
84+
// then these intersection resources can only be allocated from the Reservation,
85+
// but resources declared in Pods but not reserved in Reservations can be allocated from Nodes.
86+
// ReservationAllocatePolicyRestricted includes the semantics of ReservationAllocatePolicyAligned.
87+
ReservationAllocatePolicyRestricted ReservationAllocatePolicy = "Restricted"
88+
)
89+
6790
// ReservationTemplateSpec describes the data a Reservation should have when created from a template
6891
type ReservationTemplateSpec struct {
6992
// Standard object's metadata.

config/crd/bases/scheduling.koordinator.sh_reservations.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ spec:
5959
only available for the first owner who allocates successfully and
6060
are not allocatable to other owners anymore. Defaults to true.
6161
type: boolean
62+
allocatePolicy:
63+
description: AllocatePolicy represents the allocation policy of reserved
64+
resources that Reservation expects.
65+
enum:
66+
- Aligned
67+
- Restricted
68+
type: string
6269
expires:
6370
description: Expired timestamp when the reservation is expected to
6471
expire. If both `expires` and `ttl` are set, `expires` is checked

docs/proposals/scheduling/20220609-resource-reservation.md

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ reviewers:
88
- "@jasonliu747"
99
- "@zwzhang0107"
1010
creation-date: 2022-06-09
11-
last-updated: 2023-05-09
11+
last-updated: 2023-05-18
1212
---
1313
# Resource Reservation
1414

@@ -32,6 +32,7 @@ last-updated: 2023-05-09
3232
- [Story 5](#story-5)
3333
- [API](#api)
3434
- [Reservation Affinity](#reservation-affinity)
35+
- [Reservation Allocate Policy](#reservation-allocate-policy)
3536
- [Implementation Details](#implementation-details)
3637
- [Schedule Reservations](#schedule-reservations)
3738
- [Allocate Reserved Resources](#allocate-reserved-resources)
@@ -156,9 +157,30 @@ type ReservationSpec struct {
156157
// and are not allocatable to other owners anymore. Defaults to true.
157158
// +kubebuilder:default=true
158159
// +optional
159-
AllocateOnce bool `json:"allocateOnce,omitempty"`
160+
AllocateOnce *bool `json:"allocateOnce,omitempty"`
161+
// AllocatePolicy represents the allocation policy of reserved resources that Reservation expects.
162+
// +kubebuilder:validation:Enum=Aligned;Restricted
163+
// +optional
164+
AllocatePolicy ReservationAllocatePolicy `json:"allocatePolicy,omitempty"`
160165
}
161166

167+
type ReservationAllocatePolicy string
168+
169+
const (
170+
// ReservationAllocatePolicyDefault means that there is no restriction on the policy of reserved resources,
171+
// and allocated from the Reservation first, and if it is insufficient, it is allocated from the node.
172+
ReservationAllocatePolicyDefault ReservationAllocatePolicy = ""
173+
// ReservationAllocatePolicyAligned indicates that the Pod allocates resources from the Reservation first.
174+
// If the remaining resources of the Reservation are insufficient, it can be allocated from the node,
175+
// but it is required to strictly follow the resource specifications of the Pod.
176+
// This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
177+
ReservationAllocatePolicyAligned ReservationAllocatePolicy = "Aligned"
178+
// ReservationAllocatePolicyRestricted means that the resources requested by the Pod overlap with the resources reserved by the Reservation,
179+
// then these intersection resources can only be allocated from the Reservation, but other resources can be allocated from the node.
180+
// ReservationAllocatePolicyRestricted includes the semantics of ReservationAllocatePolicyAligned.
181+
ReservationAllocatePolicyRestricted ReservationAllocatePolicy = "Restricted"
182+
)
183+
162184
// ReservationTemplateSpec describes the data a Reservation should have when created from a template
163185
type ReservationTemplateSpec struct {
164186
// Standard object's metadata.
@@ -272,27 +294,40 @@ The Annotation key is `scheduling.koordinator.sh/reservation-affinity`, and the
272294
```go
273295
// ReservationAffinity represents the constraints of Pod selection Reservation
274296
type ReservationAffinity struct {
275-
// If the affinity requirements specified by this field are not met at
276-
// scheduling time, the pod will not be scheduled onto the node.
277-
// If the affinity requirements specified by this field cease to be met
278-
// at some point during pod execution (e.g. due to an update), the system
279-
// may or may not try to eventually evict the pod from its node.
280-
RequiredDuringSchedulingIgnoredDuringExecution *ReservationAffinitySelector `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
281-
// ReservationSelector is a selector which must be true for the pod to fit on a reservation.
282-
// Selector which must match a reservation's labels for the pod to be scheduled on that node.
283-
ReservationSelector map[string]string `json:"reservationSelector,omitempty"`
297+
// If the affinity requirements specified by this field are not met at
298+
// scheduling time, the pod will not be scheduled onto the node.
299+
// If the affinity requirements specified by this field cease to be met
300+
// at some point during pod execution (e.g. due to an update), the system
301+
// may or may not try to eventually evict the pod from its node.
302+
RequiredDuringSchedulingIgnoredDuringExecution *ReservationAffinitySelector `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
303+
// ReservationSelector is a selector which must be true for the pod to fit on a reservation.
304+
// Selector which must match a reservation's labels for the pod to be scheduled on that node.
305+
ReservationSelector map[string]string `json:"reservationSelector,omitempty"`
284306
}
285307

286308
// ReservationAffinitySelector represents the union of the results of one or more label queries
287309
// over a set of reservations; that is, it represents the OR of the selectors represented
288310
// by the reservation selector terms.
289311
type ReservationAffinitySelector struct {
290-
// Required. A list of reservation selector terms. The terms are ORed.
291-
// Reuse corev1.NodeSelectorTerm to avoid defining too many repeated definitions.
292-
ReservationSelectorTerms []corev1.NodeSelectorTerm `json:"reservationSelectorTerms,omitempty"`
312+
// Required. A list of reservation selector terms. The terms are ORed.
313+
// Reuse corev1.NodeSelectorTerm to avoid defining too many repeated definitions.
314+
ReservationSelectorTerms []corev1.NodeSelectorTerm `json:"reservationSelectorTerms,omitempty"`
293315
}
294316
```
295317

318+
#### Reservation Allocate Policy
319+
320+
When the scheduler allocates resources reserved by a Reservation to a Pod, it is possible to jointly allocate resources from multiple Reservations. For example, Reservation A reserves 4000m of CPU, and Reservation B reserves 8Gi of memory. Both A and B can be used by Pod 1, so Pod 1 will use the resources of Reservation A and B at the same time during scheduling. Although the scheduler can ensure that the overall resources of the node will not be oversold, this can still be confusing from the perspective of the Reservation; and although this scenario is rare, we have not considered removing support for it.
321+
At the same time, to respect the resource boundaries between Reservations, the `AllocatePolicy` field is newly added to support the `Aligned` and `Restricted` policies.
322+
323+
- `Aligned` indicates that the Pod allocates resources from the Reservation first. If the remaining resources of the Reservation are insufficient, it can be allocated from the node, but it is required to strictly follow the resource specifications of the Pod. This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
324+
325+
- `Restricted` means that when the resources requested by the Pod overlap with the resources reserved by the Reservation, these overlapping resources can only be allocated from the Reservation, while the remaining resources can be allocated from the node. Restricted includes the semantics of ReservationAllocatePolicyAligned.
326+
327+
If a node has multiple Reservations with Aligned or Restricted policies, and these Reservations cannot satisfy the pod’s request at all during filtering, the node will be filtered out.
328+
329+
The default policy (currently none) cannot coexist with the Aligned and Restricted policies.
330+
296331
### Implementation Details
297332

298333
#### Schedule Reservations
@@ -452,6 +487,7 @@ Reserving resources with [`pause` pods with very low assigned priority](https://
452487
- [X] 08/08/2022: Update allocateOnce API
453488
- [X] 25/03/2023: Update the API and E2E Tests
454489
- [X] 09/05/2023: Add Reservation Affinity API
490+
- [X] 18/05/2023: Add Reservation Allocate Policy API
455491

456492
## References
457493

pkg/scheduler/plugins/deviceshare/allocator.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ type AllocatorFactoryFn func(options AllocatorOptions) Allocator
4141

4242
type Allocator interface {
4343
Name() string
44-
Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleFreeDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error)
44+
Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDevices, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error)
4545
Reserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations)
4646
Unreserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations)
4747
}
@@ -70,8 +70,8 @@ func (a *defaultAllocator) Name() string {
7070
return defaultAllocatorName
7171
}
7272

73-
func (a *defaultAllocator) Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleFreeDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
74-
return nodeDevice.tryAllocateDevice(podRequest, required, preferred, preemptibleFreeDevices)
73+
func (a *defaultAllocator) Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDeviceResources, preemptibleDeviceResources map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
74+
return nodeDevice.tryAllocateDevice(podRequest, required, preferred, requiredDeviceResources, preemptibleDeviceResources)
7575
}
7676

7777
func (a *defaultAllocator) Reserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations) {

pkg/scheduler/plugins/deviceshare/device_cache.go

Lines changed: 58 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ func (n *nodeDevice) updateAllocateSet(deviceType schedulingv1alpha1.DeviceType,
259259
}
260260
}
261261

262-
func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredDevices, preferredDevices map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
262+
func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDeviceResources, preemptibleDeviceResources map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
263263
allocateResult := make(apiext.DeviceAllocations)
264264

265265
for deviceType := range DeviceResourceNames {
@@ -268,7 +268,16 @@ func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredD
268268
if !hasDeviceResource(podRequest, deviceType) {
269269
break
270270
}
271-
if err := n.tryAllocateDeviceByType(podRequest, deviceType, requiredDevices[deviceType], preferredDevices[deviceType], allocateResult, preemptibleDevices); err != nil {
271+
err := n.tryAllocateDeviceByType(
272+
podRequest,
273+
deviceType,
274+
required[deviceType],
275+
preferred[deviceType],
276+
allocateResult,
277+
requiredDeviceResources[deviceType],
278+
preemptibleDeviceResources[deviceType],
279+
)
280+
if err != nil {
272281
return nil, err
273282
}
274283
default:
@@ -279,40 +288,26 @@ func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredD
279288
return allocateResult, nil
280289
}
281290

282-
func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, deviceType schedulingv1alpha1.DeviceType, required, preferred sets.Int, allocateResult apiext.DeviceAllocations, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) error {
291+
func (n *nodeDevice) tryAllocateDeviceByType(
292+
podRequest corev1.ResourceList,
293+
deviceType schedulingv1alpha1.DeviceType,
294+
required sets.Int,
295+
preferred sets.Int,
296+
allocateResult apiext.DeviceAllocations,
297+
requiredDeviceResources deviceResources,
298+
preemptibleDeviceResources deviceResources,
299+
) error {
283300
podRequest = quotav1.Mask(podRequest, DeviceResourceNames[deviceType])
284301
nodeDeviceTotal := n.deviceTotal[deviceType]
285302
if len(nodeDeviceTotal) == 0 {
286303
return fmt.Errorf("node does not have enough %v", deviceType)
287304
}
288305

289-
// freeDevices is the rest of the whole machine, or is the rest of the reservation
290-
freeDevices := n.deviceFree[deviceType]
291-
deviceUsed := n.deviceUsed[deviceType]
292-
// preemptible represent preemptible devices, which may be a complete device instance or part of an instance's resources
293-
preemptible := preemptibleDevices[deviceType]
294-
var mergedFreeDevices deviceResources
295-
if len(preemptible) > 0 {
296-
mergedFreeDevices = make(deviceResources)
297-
for minor, v := range preemptible {
298-
used := quotav1.SubtractWithNonNegativeResult(deviceUsed[minor], v)
299-
remaining := quotav1.SubtractWithNonNegativeResult(nodeDeviceTotal[minor], used)
300-
if !quotav1.IsZero(remaining) {
301-
mergedFreeDevices[minor] = remaining
302-
}
303-
}
304-
}
305-
306-
// The merging logic is executed only when there is a device that can be preempted,
307-
// and the remaining idle devices are merged together to participate in the allocation
308-
if len(mergedFreeDevices) > 0 {
309-
for minor, v := range freeDevices {
310-
res := mergedFreeDevices[minor]
311-
if res == nil {
312-
mergedFreeDevices[minor] = v.DeepCopy()
313-
}
314-
}
315-
freeDevices = mergedFreeDevices
306+
var freeDevices deviceResources
307+
if len(requiredDeviceResources) > 0 {
308+
freeDevices = requiredDeviceResources
309+
} else {
310+
freeDevices = n.calcFreeWithPreemptible(deviceType, preemptibleDeviceResources)
316311
}
317312

318313
if deviceType == schedulingv1alpha1.GPU {
@@ -352,11 +347,11 @@ func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, dev
352347
satisfiedDeviceCount := 0
353348
orderedDeviceResources := sortDeviceResourcesByMinor(freeDevices, preferred)
354349
for _, deviceResource := range orderedDeviceResources {
355-
// Skip unhealthy Device instances with zero resources
356-
if quotav1.IsZero(deviceResource.resources) {
350+
if required.Len() > 0 && !required.Has(deviceResource.minor) {
357351
continue
358352
}
359-
if len(required) > 0 && !required.Has(deviceResource.minor) {
353+
// Skip unhealthy Device instances with zero resources
354+
if quotav1.IsZero(deviceResource.resources) {
360355
continue
361356
}
362357
if satisfied, _ := quotav1.LessThanOrEqual(podRequestPerCard, deviceResource.resources); satisfied {
@@ -375,6 +370,36 @@ func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, dev
375370
return fmt.Errorf("node does not have enough %v", deviceType)
376371
}
377372

373+
func (n *nodeDevice) calcFreeWithPreemptible(deviceType schedulingv1alpha1.DeviceType, preemptible deviceResources) deviceResources {
374+
deviceFree := n.deviceFree[deviceType]
375+
deviceUsed := n.deviceUsed[deviceType]
376+
deviceTotal := n.deviceTotal[deviceType]
377+
var mergedFreeDevices deviceResources
378+
if len(preemptible) > 0 {
379+
mergedFreeDevices = make(deviceResources)
380+
for minor, v := range preemptible {
381+
used := quotav1.SubtractWithNonNegativeResult(deviceUsed[minor], v)
382+
remaining := quotav1.SubtractWithNonNegativeResult(deviceTotal[minor], used)
383+
if !quotav1.IsZero(remaining) {
384+
mergedFreeDevices[minor] = remaining
385+
}
386+
}
387+
}
388+
389+
// The merging logic is executed only when there is a device that can be preempted,
390+
// and the remaining idle devices are merged together to participate in the allocation
391+
if len(mergedFreeDevices) > 0 {
392+
for minor, v := range deviceFree {
393+
res := mergedFreeDevices[minor]
394+
if res == nil {
395+
mergedFreeDevices[minor] = v.DeepCopy()
396+
}
397+
}
398+
deviceFree = mergedFreeDevices
399+
}
400+
return deviceFree
401+
}
402+
378403
type nodeDeviceCache struct {
379404
lock sync.Mutex
380405
// nodeDeviceInfos stores nodeDevice for each node

0 commit comments

Comments
 (0)