Skip to content

Commit 363200d

Browse files
committed
koord-scheduler: support Reservation allocate policy
Signed-off-by: Joseph <joseph.t.lee@outlook.com>
1 parent 9449d56 commit 363200d

File tree

20 files changed

+2485
-421
lines changed

20 files changed

+2485
-421
lines changed

apis/scheduling/v1alpha1/reservation_types.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,31 @@ type ReservationSpec struct {
6262
// +kubebuilder:default=true
6363
// +optional
6464
AllocateOnce *bool `json:"allocateOnce,omitempty"`
65+
// AllocatePolicy represents the allocation policy of reserved resources that Reservation expects.
66+
// +kubebuilder:validation:Enum=Aligned;Restricted
67+
// +optional
68+
AllocatePolicy ReservationAllocatePolicy `json:"allocatePolicy,omitempty"`
6569
}
6670

71+
type ReservationAllocatePolicy string
72+
73+
const (
74+
// ReservationAllocatePolicyDefault means that there is no restriction on the policy of reserved resources,
75+
// and allocated from the Reservation first, and if it is insufficient, it is allocated from the node.
76+
ReservationAllocatePolicyDefault ReservationAllocatePolicy = ""
77+
// ReservationAllocatePolicyAligned indicates that the Pod allocates resources from the Reservation first.
78+
// If the remaining resources of the Reservation are insufficient, it can be allocated from the node,
79+
// but it is required to strictly follow the resource specifications of the Pod.
80+
// This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
81+
ReservationAllocatePolicyAligned ReservationAllocatePolicy = "Aligned"
82+
// ReservationAllocatePolicyRestricted means that the resources
83+
// requested by the Pod overlap with the resources reserved by the Reservation,
84+
// then these intersection resources can only be allocated from the Reservation,
85+
// but resources declared in Pods but not reserved in Reservations can be allocated from Nodes.
86+
// ReservationAllocatePolicyRestricted includes the semantics of ReservationAllocatePolicyAligned.
87+
ReservationAllocatePolicyRestricted ReservationAllocatePolicy = "Restricted"
88+
)
89+
6790
// ReservationTemplateSpec describes the data a Reservation should have when created from a template
6891
type ReservationTemplateSpec struct {
6992
// Standard object's metadata.

config/crd/bases/scheduling.koordinator.sh_reservations.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ spec:
5959
only available for the first owner who allocates successfully and
6060
are not allocatable to other owners anymore. Defaults to true.
6161
type: boolean
62+
allocatePolicy:
63+
description: AllocatePolicy represents the allocation policy of reserved
64+
resources that Reservation expects.
65+
enum:
66+
- Aligned
67+
- Restricted
68+
type: string
6269
expires:
6370
description: Expired timestamp when the reservation is expected to
6471
expire. If both `expires` and `ttl` are set, `expires` is checked

docs/proposals/scheduling/20220609-resource-reservation.md

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ reviewers:
88
- "@jasonliu747"
99
- "@zwzhang0107"
1010
creation-date: 2022-06-09
11-
last-updated: 2023-05-09
11+
last-updated: 2023-05-18
1212
---
1313
# Resource Reservation
1414

@@ -32,6 +32,7 @@ last-updated: 2023-05-09
3232
- [Story 5](#story-5)
3333
- [API](#api)
3434
- [Reservation Affinity](#reservation-affinity)
35+
- [Reservation Allocate Policy](#reservation-allocate-policy)
3536
- [Implementation Details](#implementation-details)
3637
- [Schedule Reservations](#schedule-reservations)
3738
- [Allocate Reserved Resources](#allocate-reserved-resources)
@@ -156,9 +157,30 @@ type ReservationSpec struct {
156157
// and are not allocatable to other owners anymore. Defaults to true.
157158
// +kubebuilder:default=true
158159
// +optional
159-
AllocateOnce bool `json:"allocateOnce,omitempty"`
160+
AllocateOnce *bool `json:"allocateOnce,omitempty"`
161+
// AllocatePolicy represents the allocation policy of reserved resources that Reservation expects.
162+
// +kubebuilder:validation:Enum=Aligned;Restricted
163+
// +optional
164+
AllocatePolicy ReservationAllocatePolicy `json:"allocatePolicy,omitempty"`
160165
}
161166

167+
type ReservationAllocatePolicy string
168+
169+
const (
170+
// ReservationAllocatePolicyDefault means that there is no restriction on the policy of reserved resources,
171+
// and allocated from the Reservation first, and if it is insufficient, it is allocated from the node.
172+
ReservationAllocatePolicyDefault ReservationAllocatePolicy = ""
173+
// ReservationAllocatePolicyAligned indicates that the Pod allocates resources from the Reservation first.
174+
// If the remaining resources of the Reservation are insufficient, it can be allocated from the node,
175+
// but it is required to strictly follow the resource specifications of the Pod.
176+
// This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
177+
ReservationAllocatePolicyAligned ReservationAllocatePolicy = "Aligned"
178+
// ReservationAllocatePolicyRestricted means that the resources requested by the Pod overlap with the resources reserved by the Reservation,
179+
// then these intersection resources can only be allocated from the Reservation, but other resources can be allocated from the node.
180+
// ReservationAllocatePolicyRestricted includes the semantics of ReservationAllocatePolicyAligned.
181+
ReservationAllocatePolicyRestricted ReservationAllocatePolicy = "Restricted"
182+
)
183+
162184
// ReservationTemplateSpec describes the data a Reservation should have when created from a template
163185
type ReservationTemplateSpec struct {
164186
// Standard object's metadata.
@@ -272,27 +294,40 @@ The Annotation key is `scheduling.koordinator.sh/reservation-affinity`, and the
272294
```go
273295
// ReservationAffinity represents the constraints of Pod selection Reservation
274296
type ReservationAffinity struct {
275-
// If the affinity requirements specified by this field are not met at
276-
// scheduling time, the pod will not be scheduled onto the node.
277-
// If the affinity requirements specified by this field cease to be met
278-
// at some point during pod execution (e.g. due to an update), the system
279-
// may or may not try to eventually evict the pod from its node.
280-
RequiredDuringSchedulingIgnoredDuringExecution *ReservationAffinitySelector `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
281-
// ReservationSelector is a selector which must be true for the pod to fit on a reservation.
282-
// Selector which must match a reservation's labels for the pod to be scheduled on that node.
283-
ReservationSelector map[string]string `json:"reservationSelector,omitempty"`
297+
// If the affinity requirements specified by this field are not met at
298+
// scheduling time, the pod will not be scheduled onto the node.
299+
// If the affinity requirements specified by this field cease to be met
300+
// at some point during pod execution (e.g. due to an update), the system
301+
// may or may not try to eventually evict the pod from its node.
302+
RequiredDuringSchedulingIgnoredDuringExecution *ReservationAffinitySelector `json:"requiredDuringSchedulingIgnoredDuringExecution,omitempty"`
303+
// ReservationSelector is a selector which must be true for the pod to fit on a reservation.
304+
// Selector which must match a reservation's labels for the pod to be scheduled on that node.
305+
ReservationSelector map[string]string `json:"reservationSelector,omitempty"`
284306
}
285307

286308
// ReservationAffinitySelector represents the union of the results of one or more label queries
287309
// over a set of reservations; that is, it represents the OR of the selectors represented
288310
// by the reservation selector terms.
289311
type ReservationAffinitySelector struct {
290-
// Required. A list of reservation selector terms. The terms are ORed.
291-
// Reuse corev1.NodeSelectorTerm to avoid defining too many repeated definitions.
292-
ReservationSelectorTerms []corev1.NodeSelectorTerm `json:"reservationSelectorTerms,omitempty"`
312+
// Required. A list of reservation selector terms. The terms are ORed.
313+
// Reuse corev1.NodeSelectorTerm to avoid defining too many repeated definitions.
314+
ReservationSelectorTerms []corev1.NodeSelectorTerm `json:"reservationSelectorTerms,omitempty"`
293315
}
294316
```
295317

318+
#### Reservation Allocate Policy
319+
320+
When the scheduler allocates resources reserved by a Reservation to a Pod, it is possible to jointly allocate resources from multiple Reservations. For example, Reservation A reserves 4000m of CPU, and Reservation B reserves 8Gi of memory. Both A and B can be used by Pod 1, so Pod 1 will use the resources of Reservation A and B at the same time during scheduling. Although the scheduler can ensure that the overall resources of the node will not be oversold, this can still be confusing from the perspective of the Reservation; and although this scenario is rare, we have not considered removing support for it.
321+
At the same time, to respect the resource boundaries between Reservations, the `AllocatePolicy` field is newly added to support the `Aligned` and `Restricted` policies.
322+
323+
- `Aligned` indicates that the Pod allocates resources from the Reservation first. If the remaining resources of the Reservation are insufficient, it can be allocated from the node, but it is required to strictly follow the resource specifications of the Pod. This can be used to avoid the problem that a Pod uses multiple Reservations at the same time.
324+
325+
- `Restricted` means that when the resources requested by the Pod overlap with the resources reserved by the Reservation, these overlapping resources can only be allocated from the Reservation, while the remaining resources can be allocated from the node. Restricted includes the semantics of ReservationAllocatePolicyAligned.
326+
327+
If a node has multiple Reservations with Aligned or Restricted policies, and these Reservations cannot satisfy the pod’s request at all during filtering, the node will be filtered out.
328+
329+
The default policy (currently none) cannot coexist with the Aligned and Restricted policies.
330+
296331
### Implementation Details
297332

298333
#### Schedule Reservations
@@ -452,6 +487,7 @@ Reserving resources with [`pause` pods with very low assigned priority](https://
452487
- [X] 08/08/2022: Update allocateOnce API
453488
- [X] 25/03/2023: Update the API and E2E Tests
454489
- [X] 09/05/2023: Add Reservation Affinity API
490+
- [X] 18/05/2023: Add Reservation Allocate Policy API
455491

456492
## References
457493

pkg/scheduler/plugins/deviceshare/allocator.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ type AllocatorFactoryFn func(options AllocatorOptions) Allocator
4141

4242
type Allocator interface {
4343
Name() string
44-
Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleFreeDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error)
44+
Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDevices, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error)
4545
Reserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations)
4646
Unreserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations)
4747
}
@@ -70,8 +70,8 @@ func (a *defaultAllocator) Name() string {
7070
return defaultAllocatorName
7171
}
7272

73-
func (a *defaultAllocator) Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleFreeDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
74-
return nodeDevice.tryAllocateDevice(podRequest, required, preferred, preemptibleFreeDevices)
73+
func (a *defaultAllocator) Allocate(nodeName string, pod *corev1.Pod, podRequest corev1.ResourceList, nodeDevice *nodeDevice, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDeviceResources, preemptibleDeviceResources map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
74+
return nodeDevice.tryAllocateDevice(podRequest, required, preferred, requiredDeviceResources, preemptibleDeviceResources)
7575
}
7676

7777
func (a *defaultAllocator) Reserve(pod *corev1.Pod, nodeDevice *nodeDevice, allocations apiext.DeviceAllocations) {

pkg/scheduler/plugins/deviceshare/device_cache.go

Lines changed: 58 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ func (n *nodeDevice) updateAllocateSet(deviceType schedulingv1alpha1.DeviceType,
259259
}
260260
}
261261

262-
func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredDevices, preferredDevices map[schedulingv1alpha1.DeviceType]sets.Int, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
262+
func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, required, preferred map[schedulingv1alpha1.DeviceType]sets.Int, requiredDeviceResources, preemptibleDeviceResources map[schedulingv1alpha1.DeviceType]deviceResources) (apiext.DeviceAllocations, error) {
263263
allocateResult := make(apiext.DeviceAllocations)
264264

265265
for deviceType := range DeviceResourceNames {
@@ -268,7 +268,16 @@ func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredD
268268
if !hasDeviceResource(podRequest, deviceType) {
269269
break
270270
}
271-
if err := n.tryAllocateDeviceByType(podRequest, deviceType, requiredDevices[deviceType], preferredDevices[deviceType], allocateResult, preemptibleDevices); err != nil {
271+
err := n.tryAllocateDeviceByType(
272+
podRequest,
273+
deviceType,
274+
required[deviceType],
275+
preferred[deviceType],
276+
allocateResult,
277+
requiredDeviceResources[deviceType],
278+
preemptibleDeviceResources[deviceType],
279+
)
280+
if err != nil {
272281
return nil, err
273282
}
274283
default:
@@ -279,40 +288,26 @@ func (n *nodeDevice) tryAllocateDevice(podRequest corev1.ResourceList, requiredD
279288
return allocateResult, nil
280289
}
281290

282-
func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, deviceType schedulingv1alpha1.DeviceType, required, preferred sets.Int, allocateResult apiext.DeviceAllocations, preemptibleDevices map[schedulingv1alpha1.DeviceType]deviceResources) error {
291+
func (n *nodeDevice) tryAllocateDeviceByType(
292+
podRequest corev1.ResourceList,
293+
deviceType schedulingv1alpha1.DeviceType,
294+
required sets.Int,
295+
preferred sets.Int,
296+
allocateResult apiext.DeviceAllocations,
297+
requiredDeviceResources deviceResources,
298+
preemptibleDeviceResources deviceResources,
299+
) error {
283300
podRequest = quotav1.Mask(podRequest, DeviceResourceNames[deviceType])
284301
nodeDeviceTotal := n.deviceTotal[deviceType]
285302
if len(nodeDeviceTotal) == 0 {
286303
return fmt.Errorf("node does not have enough %v", deviceType)
287304
}
288305

289-
// freeDevices is the rest of the whole machine, or is the rest of the reservation
290-
freeDevices := n.deviceFree[deviceType]
291-
deviceUsed := n.deviceUsed[deviceType]
292-
// preemptible represent preemptible devices, which may be a complete device instance or part of an instance's resources
293-
preemptible := preemptibleDevices[deviceType]
294-
var mergedFreeDevices deviceResources
295-
if len(preemptible) > 0 {
296-
mergedFreeDevices = make(deviceResources)
297-
for minor, v := range preemptible {
298-
used := quotav1.SubtractWithNonNegativeResult(deviceUsed[minor], v)
299-
remaining := quotav1.SubtractWithNonNegativeResult(nodeDeviceTotal[minor], used)
300-
if !quotav1.IsZero(remaining) {
301-
mergedFreeDevices[minor] = remaining
302-
}
303-
}
304-
}
305-
306-
// The merging logic is executed only when there is a device that can be preempted,
307-
// and the remaining idle devices are merged together to participate in the allocation
308-
if len(mergedFreeDevices) > 0 {
309-
for minor, v := range freeDevices {
310-
res := mergedFreeDevices[minor]
311-
if res == nil {
312-
mergedFreeDevices[minor] = v.DeepCopy()
313-
}
314-
}
315-
freeDevices = mergedFreeDevices
306+
var freeDevices deviceResources
307+
if len(requiredDeviceResources) > 0 {
308+
freeDevices = requiredDeviceResources
309+
} else {
310+
freeDevices = n.calcFreeWithPreemptible(deviceType, preemptibleDeviceResources)
316311
}
317312

318313
if deviceType == schedulingv1alpha1.GPU {
@@ -352,11 +347,11 @@ func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, dev
352347
satisfiedDeviceCount := 0
353348
orderedDeviceResources := sortDeviceResourcesByMinor(freeDevices, preferred)
354349
for _, deviceResource := range orderedDeviceResources {
355-
// Skip unhealthy Device instances with zero resources
356-
if quotav1.IsZero(deviceResource.resources) {
350+
if required.Len() > 0 && !required.Has(deviceResource.minor) {
357351
continue
358352
}
359-
if len(required) > 0 && !required.Has(deviceResource.minor) {
353+
// Skip unhealthy Device instances with zero resources
354+
if quotav1.IsZero(deviceResource.resources) {
360355
continue
361356
}
362357
if satisfied, _ := quotav1.LessThanOrEqual(podRequestPerCard, deviceResource.resources); satisfied {
@@ -375,6 +370,36 @@ func (n *nodeDevice) tryAllocateDeviceByType(podRequest corev1.ResourceList, dev
375370
return fmt.Errorf("node does not have enough %v", deviceType)
376371
}
377372

373+
func (n *nodeDevice) calcFreeWithPreemptible(deviceType schedulingv1alpha1.DeviceType, preemptible deviceResources) deviceResources {
374+
deviceFree := n.deviceFree[deviceType]
375+
deviceUsed := n.deviceUsed[deviceType]
376+
deviceTotal := n.deviceTotal[deviceType]
377+
var mergedFreeDevices deviceResources
378+
if len(preemptible) > 0 {
379+
mergedFreeDevices = make(deviceResources)
380+
for minor, v := range preemptible {
381+
used := quotav1.SubtractWithNonNegativeResult(deviceUsed[minor], v)
382+
remaining := quotav1.SubtractWithNonNegativeResult(deviceTotal[minor], used)
383+
if !quotav1.IsZero(remaining) {
384+
mergedFreeDevices[minor] = remaining
385+
}
386+
}
387+
}
388+
389+
// The merging logic is executed only when there is a device that can be preempted,
390+
// and the remaining idle devices are merged together to participate in the allocation
391+
if len(mergedFreeDevices) > 0 {
392+
for minor, v := range deviceFree {
393+
res := mergedFreeDevices[minor]
394+
if res == nil {
395+
mergedFreeDevices[minor] = v.DeepCopy()
396+
}
397+
}
398+
deviceFree = mergedFreeDevices
399+
}
400+
return deviceFree
401+
}
402+
378403
type nodeDeviceCache struct {
379404
lock sync.Mutex
380405
// nodeDeviceInfos stores nodeDevice for each node

0 commit comments

Comments
 (0)