Skip to content

Commit b38c13a

Browse files
raulchen and peterxcli
authored and committed
[data] fix resource allocator not respecting max resource requirement (ray-project#59412)
This PR adds a cap on the total budget allocation by max_resource_usage in ReservationOpResourceAllocator. Previously, the reservation was capped by max_resource_usage, but the total budget (reservation + shared resources) could exceed the max. This could lead to operators being allocated more resources than they can use, while other operators are starved. Changes Cap total allocation by max_resource_usage: - Total allocation = max(total_reserved, op_usage) + op_shared - We now cap op_shared so that total allocation ≤ max_resource_usage - Excess shared resources remain in remaining_shared for other operators Redistribute remaining shared resources: - After the allocation loop, any remaining shared resources (from caps) are given to the most downstream uncapped operator - An operator is "uncapped" if its max_resource_usage == ExecutionResources.inf() - If all operators are capped, remaining resources are not redistributed Tests - test_budget_capped_by_max_resource_usage: Tests that capped operators don't receive excess shared resources, and remaining goes to uncapped downstream op - test_budget_capped_by_max_resource_usage_all_capped: Tests that when all operators are capped, remaining shared resources are not redistributed --------- Signed-off-by: Hao Chen <chenh1024@gmail.com> Signed-off-by: peterxcli <peterxcli@gmail.com>
1 parent fa2c4de commit b38c13a

File tree

2 files changed

+171
-1
lines changed

2 files changed

+171
-1
lines changed

python/ray/data/_internal/execution/resource_manager.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,14 @@ def get_allocation(self, op: PhysicalOperator) -> Optional[ExecutionResources]:
789789
# TODO fix
790790
return ExecutionResources.zero()
791791

792+
def _get_total_reserved(self, op: PhysicalOperator) -> ExecutionResources:
793+
"""Get total reserved resources for an operator, including outputs reservation."""
794+
op_reserved = self._op_reserved[op]
795+
reserved_for_outputs = self._reserved_for_op_outputs[op]
796+
return op_reserved.copy(
797+
object_store_memory=op_reserved.object_store_memory + reserved_for_outputs
798+
)
799+
792800
def max_task_output_bytes_to_read(
793801
self,
794802
op: PhysicalOperator,
@@ -888,6 +896,20 @@ def update_budgets(
888896
remaining_shared
889897
):
890898
op_shared = op_shared.add(to_borrow)
899+
900+
# Cap op_shared so that total allocation doesn't exceed max_resource_usage.
901+
# Total allocation = max(total_reserved, op_usage) + op_shared
902+
# This ensures excess resources stay in remaining_shared for other operators.
903+
_, max_resource_usage = op.min_max_resource_requirements()
904+
if max_resource_usage != ExecutionResources.inf():
905+
total_reserved = self._get_total_reserved(op)
906+
op_usage = self._resource_manager.get_op_usage(op)
907+
current_allocation = total_reserved.max(op_usage)
908+
max_shared = max_resource_usage.subtract(current_allocation).max(
909+
ExecutionResources.zero()
910+
)
911+
op_shared = op_shared.min(max_shared)
912+
891913
remaining_shared = remaining_shared.subtract(op_shared)
892914
assert remaining_shared.is_non_negative(), (
893915
remaining_shared,
@@ -896,7 +918,10 @@ def update_budgets(
896918
to_borrow,
897919
)
898920

899-
if op.min_max_resource_requirements()[1].gpu > 0:
921+
if (
922+
max_resource_usage != ExecutionResources.inf()
923+
and max_resource_usage.gpu > 0
924+
):
900925
# If an operator needs GPU, we just allocate all GPUs to it.
901926
# TODO(hchen): allocate resources across multiple GPU operators.
902927

@@ -915,6 +940,17 @@ def update_budgets(
915940
self._op_budgets[op].add(op_shared).copy(gpu=target_num_gpu)
916941
)
917942

943+
# Give any remaining shared resources to the most downstream uncapped op.
944+
# This can happen when some ops have their shared allocation capped.
945+
if eligible_ops and not remaining_shared.is_zero():
946+
for op in reversed(eligible_ops):
947+
_, max_resource_usage = op.min_max_resource_requirements()
948+
if max_resource_usage == ExecutionResources.inf():
949+
self._op_budgets[op] = self._op_budgets[op].add(
950+
remaining_shared.copy(gpu=0)
951+
)
952+
break
953+
918954
# A materializing operator like `AllToAllOperator` waits for all its input
919955
# operator's outputs before processing data. This often forces the input
920956
# operator to exceed its object store memory budget. To prevent deadlock, we

python/ray/data/tests/test_resource_manager.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,140 @@ def test_does_not_reserve_more_than_max_resource_usage(self):
712712
cpu=1, object_store_memory=1
713713
)
714714

715+
def test_budget_capped_by_max_resource_usage(self, restore_data_context):
716+
"""Test that the total allocation is capped by max_resource_usage.
717+
718+
Total allocation = max(total_reserved, op_usage) + op_shared
719+
We cap op_shared so that total allocation <= max_resource_usage.
720+
Excess shared resources should remain available for other operators.
721+
"""
722+
DataContext.get_current().op_resource_reservation_enabled = True
723+
DataContext.get_current().op_resource_reservation_ratio = 0.5
724+
725+
o1 = InputDataBuffer(DataContext.get_current(), [])
726+
o2 = mock_map_op(o1, incremental_resource_usage=ExecutionResources(1, 0, 10))
727+
o3 = mock_map_op(o2, incremental_resource_usage=ExecutionResources(1, 0, 10))
728+
729+
# o2 has a small max CPU, so its CPU shared allocation will be capped.
730+
# o3 has unlimited max_resource_usage.
731+
o2.min_max_resource_requirements = MagicMock(
732+
return_value=(
733+
ExecutionResources.zero(),
734+
ExecutionResources(cpu=4, object_store_memory=float("inf")),
735+
)
736+
)
737+
o3.min_max_resource_requirements = MagicMock(
738+
return_value=(
739+
ExecutionResources.zero(),
740+
ExecutionResources.inf(),
741+
)
742+
)
743+
744+
topo = build_streaming_topology(o3, ExecutionOptions())
745+
746+
global_limits = ExecutionResources(cpu=20, object_store_memory=400)
747+
748+
op_usages = {
749+
o1: ExecutionResources.zero(),
750+
o2: ExecutionResources(cpu=2, object_store_memory=40),
751+
o3: ExecutionResources(cpu=2, object_store_memory=40),
752+
}
753+
754+
resource_manager = ResourceManager(
755+
topo, ExecutionOptions(), MagicMock(), DataContext.get_current()
756+
)
757+
resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
758+
resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
759+
resource_manager._mem_op_outputs = {o1: 0, o2: 0, o3: 0}
760+
resource_manager.get_global_limits = MagicMock(return_value=global_limits)
761+
762+
allocator = resource_manager._op_resource_allocator
763+
assert isinstance(allocator, ReservationOpResourceAllocator)
764+
allocator.update_budgets(limits=global_limits)
765+
766+
# All tuples below are (cpu, object_store_memory).
767+
#
768+
# Reservation phase:
769+
# - default_reserved per op = global_limits * 0.5 / 2 = (5, 100)
770+
# - reserved_for_outputs per op = 100 / 2 = 50
771+
# - o2's reserved_for_tasks is capped by max (4, inf) -> (4, 50)
772+
# - o3's reserved_for_tasks = (5, 50)
773+
# - total_shared = global_limits - o2_total_reserved - o3_total_reserved
774+
# = (20, 400) - (4, 100) - (5, 100) = (11, 200)
775+
#
776+
# Budget phase (first loop calculates reserved_remaining):
777+
# - o2: reserved_remaining = reserved_for_tasks - usage = (4, 50) - (2, 40) = (2, 10)
778+
# - o3: reserved_remaining = (5, 50) - (2, 40) = (3, 10)
779+
#
780+
# Shared allocation (second loop, reversed order):
781+
# - o3: op_shared = remaining_shared / 2 = (5.5, 100), no cap
782+
# budget = reserved_remaining + op_shared = (3, 10) + (5.5, 100) = (8.5, 110)
783+
# - o2: op_shared = (5.5, 100), CPU capped to (0, 100)
784+
# budget = (2, 10) + (0, 100) = (2, 110)
785+
# remaining_shared = (5.5, 0)
786+
# - After loop, remaining (5.5, 0) given to most downstream op (o3):
787+
# o3 budget = (8.5, 110) + (5.5, 0) = (14, 110)
788+
assert allocator._op_budgets[o2] == ExecutionResources(
789+
cpu=2, object_store_memory=110
790+
)
791+
assert allocator._op_budgets[o3] == ExecutionResources(
792+
cpu=14, object_store_memory=110
793+
)
794+
795+
def test_budget_capped_by_max_resource_usage_all_capped(self, restore_data_context):
796+
"""Test when all operators are capped, remaining shared resources are not given."""
797+
DataContext.get_current().op_resource_reservation_enabled = True
798+
DataContext.get_current().op_resource_reservation_ratio = 0.5
799+
800+
o1 = InputDataBuffer(DataContext.get_current(), [])
801+
o2 = mock_map_op(o1, incremental_resource_usage=ExecutionResources(1, 0, 10))
802+
o3 = mock_map_op(o2, incremental_resource_usage=ExecutionResources(1, 0, 10))
803+
804+
# Both operators are capped.
805+
o2.min_max_resource_requirements = MagicMock(
806+
return_value=(
807+
ExecutionResources.zero(),
808+
ExecutionResources(cpu=4, object_store_memory=float("inf")),
809+
)
810+
)
811+
o3.min_max_resource_requirements = MagicMock(
812+
return_value=(
813+
ExecutionResources.zero(),
814+
ExecutionResources(cpu=4, object_store_memory=float("inf")),
815+
)
816+
)
817+
818+
topo = build_streaming_topology(o3, ExecutionOptions())
819+
820+
global_limits = ExecutionResources(cpu=20, object_store_memory=400)
821+
822+
op_usages = {
823+
o1: ExecutionResources.zero(),
824+
o2: ExecutionResources(cpu=2, object_store_memory=40),
825+
o3: ExecutionResources(cpu=2, object_store_memory=40),
826+
}
827+
828+
resource_manager = ResourceManager(
829+
topo, ExecutionOptions(), MagicMock(), DataContext.get_current()
830+
)
831+
resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
832+
resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
833+
resource_manager._mem_op_outputs = {o1: 0, o2: 0, o3: 0}
834+
resource_manager.get_global_limits = MagicMock(return_value=global_limits)
835+
836+
allocator = resource_manager._op_resource_allocator
837+
allocator.update_budgets(limits=global_limits)
838+
839+
# Both ops are capped (max cpu=4), so remaining CPU is not given to any op.
840+
# o2: reserved_remaining (2, 10) + capped op_shared (0, 100) = (2, 110)
841+
# o3: reserved_remaining (2, 10) + capped op_shared (0, 100) = (2, 110)
842+
assert allocator._op_budgets[o2] == ExecutionResources(
843+
cpu=2, object_store_memory=110
844+
)
845+
assert allocator._op_budgets[o3] == ExecutionResources(
846+
cpu=2, object_store_memory=110
847+
)
848+
715849
def test_only_handle_eligible_ops(self, restore_data_context):
716850
"""Test that we only handle non-completed map ops."""
717851
DataContext.get_current().op_resource_reservation_enabled = True

0 commit comments

Comments
 (0)