Skip to content

[Subnet Prioritization] Support capacity-optimized-prioritized and prioritized Allocation Strategy #671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 9, 2025
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
430c9a3
[Subnet Prioritization] Add SingleAvailabilityZone parameter to ec2 f…
Allenz5 Jun 11, 2025
2d21a5a
[Subnet Prioritization] Add Priority parameter to ec2 fleet call
Allenz5 Jun 11, 2025
7f70084
[Subnet Prioritization] Extend test_evaluate_launch_params to test pa…
Allenz5 Jun 17, 2025
01d3219
[Subnet Prioritization] Update CHANGELOG.md
Allenz5 Jun 17, 2025
c905f70
Merge branch 'aws:develop' into develop
Allenz5 Jun 17, 2025
4f2a6cb
[Subnet Prioritization] Check AllocationStrategy is prioritized|capac…
Allenz5 Jun 18, 2025
ddd8e41
Merge branch 'develop' of github.com:Allenz5/aws-parallelcluster-node…
Allenz5 Jun 18, 2025
56309ea
[Subnet Prioritization] Update CHANGELOG.md
Allenz5 Jun 20, 2025
cf11486
[Subnet Prioritization] Reformat the code to make it more clear
Allenz5 Jun 20, 2025
b028b40
[Subnet Prioritization] Reformat the code to make it more clear
Allenz5 Jun 20, 2025
a0a4775
[Subnet Prioritization] Add a valid condition to check that 'prioriti…
Allenz5 Jun 24, 2025
1efee32
[Subnet Prioritization] Remove EnableSingleAvailabilityZone parameter
Allenz5 Jul 7, 2025
e57dab8
[Subnet Prioritization] Update unit tests to test that priority is co…
Allenz5 Jul 9, 2025
f3cfc65
[Subnet Prioritization] Update unit tests to test that priority is co…
Allenz5 Jul 9, 2025
552dd0a
Merge branch 'develop' into develop
himani2411 Jul 9, 2025
7bbae42
[Subnet Prioritization] Update CHANGELOG.md
Allenz5 Jul 9, 2025
ef6f189
Merge remote-tracking branch 'origin/develop' into develop
Allenz5 Jul 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG

This file is used to list changes made in each version of the aws-parallelcluster-node package.

3.14.0
------

**CHANGES**
- Support the `prioritized` and `capacity-optimized-prioritized` Allocation Strategies and the `EnableSingleAvailabilityZone` parameter

3.13.2
------

Expand Down
28 changes: 24 additions & 4 deletions src/slurm_plugin/fleet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,18 @@ def _evaluate_template_overrides(self) -> list:
if self._compute_resource_config.get("MaxPrice"):
overrides.update({"MaxPrice": str(self._compute_resource_config["MaxPrice"])})

priority = 0.0
for instance_type in self._compute_resource_config["Instances"]:
subnet_ids = self._compute_resource_config["Networking"]["SubnetIds"]
subnet_ids = self._compute_resource_config.get("Networking", {}).get("SubnetIds", [])
for subnet_id in subnet_ids:
overrides.update({"InstanceType": instance_type["InstanceType"], "SubnetId": subnet_id})
if self._uses_subnet_prioritization():
overrides.update(
{"InstanceType": instance_type["InstanceType"], "SubnetId": subnet_id, "Priority": priority}
)
else:
overrides.update({"InstanceType": instance_type["InstanceType"], "SubnetId": subnet_id})
template_overrides.append(copy.deepcopy(overrides))
priority += 1.0
return template_overrides

def _uses_single_instance_type(self):
Expand All @@ -312,13 +319,26 @@ def _uses_single_az(self):
subnet_ids = self._compute_resource_config.get("Networking", {}).get("SubnetIds", [])
return len(subnet_ids) == 1

def _uses_subnet_prioritization(self):
    """
    Tell whether the compute resource is configured with a priority-based allocation strategy.

    Reads "AllocationStrategy" from the compute resource configuration and returns True only when its value
    is "prioritized" or "capacity-optimized-prioritized". A missing key yields False.
    """
    # Look the strategy up once and use a membership test rather than repeating the same .get() call.
    # A tuple (not a set) keeps plain == comparison semantics for any configured value.
    allocation_strategy = self._compute_resource_config.get("AllocationStrategy")
    return allocation_strategy in ("prioritized", "capacity-optimized-prioritized")

def _evaluate_launch_params(self, count):
"""Evaluate parameters to be passed to create_fleet call."""
try:
enable_single_availability_zone = self._compute_resource_config.get("Networking", {}).get(
"SingleAvailabilityZone", None
)
if enable_single_availability_zone is None or (
enable_single_availability_zone and self._uses_subnet_prioritization() is False
):
enable_single_availability_zone = self._uses_single_az()

common_launch_options = {
"SingleInstanceType": self._uses_single_instance_type(),
"SingleAvailabilityZone": self._uses_single_az(), # If using Multi-AZ (by specifying multiple subnets),
# set SingleAvailabilityZone to False
"SingleAvailabilityZone": enable_single_availability_zone,
}
allocation_strategy = self._compute_resource_config.get("AllocationStrategy")
if allocation_strategy:
Expand Down
35 changes: 33 additions & 2 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def client_error(error_code):
return ClientError({"Error": {"Code": error_code}}, "failed_operation")


SINGLE_SUBNET = {"SubnetIds": ["1234567"]}
MULTIPLE_SUBNETS = {"SubnetIds": ["1234567", "7654321"]}
SINGLE_SUBNET = {"SubnetIds": ["1234567"], "SingleAvailabilityZone": None}
MULTIPLE_SUBNETS = {"SubnetIds": ["1234567", "7654321"], "SingleAvailabilityZone": None}
MULTIPLE_SUBNET_ENABLE_SINGLE_AVAILABILITY_ZONE = {"SubnetIds": ["1234567", "7654321"], "SingleAvailabilityZone": True}

FLEET_CONFIG = {
"queue": {"c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]}},
Expand Down Expand Up @@ -110,6 +111,36 @@ def client_error(error_code):
"CapacityReservationId": "cr-234567",
},
},
"queue-single-az": {
"c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]},
"fleet1": {
"Api": "create-fleet",
"Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}],
"AllocationStrategy": "prioritized",
"CapacityType": "on-demand",
"Networking": MULTIPLE_SUBNET_ENABLE_SINGLE_AVAILABILITY_ZONE,
},
},
"queue-prioritized": {
"c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]},
"fleet1": {
"Api": "create-fleet",
"Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}],
"AllocationStrategy": "prioritized",
"CapacityType": "on-demand",
"Networking": MULTIPLE_SUBNET_ENABLE_SINGLE_AVAILABILITY_ZONE,
},
},
"queue-capacity-optimized-prioritized": {
"c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]},
"fleet1": {
"Api": "create-fleet",
"Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}],
"AllocationStrategy": "capacity-optimized-prioritized",
"CapacityType": "on-demand",
"Networking": MULTIPLE_SUBNET_ENABLE_SINGLE_AVAILABILITY_ZONE,
},
},
}

LAUNCH_OVERRIDES = {}
9 changes: 9 additions & 0 deletions tests/slurm_plugin/test_fleet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,12 @@ class TestEc2CreateFleetManager:
{},
"All-or-Nothing is only available with single instance type compute resources or single subnet queues",
),
# Enable Single Availability Zone
(5, "queue-single-az", "fleet1", False, {}, None),
# Use "prioritized" Allocation Strategy AND Launch Override with Priority
(5, "queue-prioritized", "fleet1", False, {}, None),
# Use "capacity-optimized-prioritized" Allocation Strategy AND Launch Override with Priority
(5, "queue-capacity-optimized-prioritized", "fleet1", False, {}, None),
],
ids=[
"fleet_spot",
Expand All @@ -402,6 +408,9 @@ class TestEc2CreateFleetManager:
"fleet-multi-az-single-it-all_or_nothing",
"fleet-multi-az-multi-it",
"fleet-multi-az-multi-it-all_or_nothing",
"single_az",
"prioritized",
"capacity_optimized_prioritized",
],
)
def test_evaluate_launch_params(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"LaunchTemplateConfigs": [
{
"LaunchTemplateSpecification": {
"LaunchTemplateName": "hit-queue-capacity-optimized-prioritized-fleet1",
"Version": "$Latest"
},
"Overrides": [
{
"InstanceType": "t2.medium",
"SubnetId": "1234567",
"Priority": 0.0
},
{
"InstanceType": "t2.medium",
"SubnetId": "7654321",
"Priority": 1.0
},
{
"InstanceType": "t2.large",
"SubnetId": "1234567",
"Priority": 2.0
},
{
"InstanceType": "t2.large",
"SubnetId": "7654321",
"Priority": 3.0
}
]
}
],
"OnDemandOptions": {
"AllocationStrategy": "capacity-optimized-prioritized",
"SingleInstanceType": false,
"SingleAvailabilityZone": true,
"CapacityReservationOptions": {
"UsageStrategy": "use-capacity-reservations-first"
}
},
"TargetCapacitySpecification": {
"TotalTargetCapacity": 5,
"DefaultTargetCapacityType": "on-demand"
},
"Type": "instant"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"LaunchTemplateConfigs": [
{
"LaunchTemplateSpecification": {
"LaunchTemplateName": "hit-queue-prioritized-fleet1",
"Version": "$Latest"
},
"Overrides": [
{
"InstanceType": "t2.medium",
"SubnetId": "1234567",
"Priority": 0.0
},
{
"InstanceType": "t2.medium",
"SubnetId": "7654321",
"Priority": 1.0
},
{
"InstanceType": "t2.large",
"SubnetId": "1234567",
"Priority": 2.0
},
{
"InstanceType": "t2.large",
"SubnetId": "7654321",
"Priority": 3.0
}
]
}
],
"OnDemandOptions": {
"AllocationStrategy": "prioritized",
"SingleInstanceType": false,
"SingleAvailabilityZone": true,
"CapacityReservationOptions": {
"UsageStrategy": "use-capacity-reservations-first"
}
},
"TargetCapacitySpecification": {
"TotalTargetCapacity": 5,
"DefaultTargetCapacityType": "on-demand"
},
"Type": "instant"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"LaunchTemplateConfigs": [
{
"LaunchTemplateSpecification": {
"LaunchTemplateName": "hit-queue-single-az-fleet1",
"Version": "$Latest"
},
"Overrides": [
{
"InstanceType": "t2.medium",
"SubnetId": "1234567",
"Priority": 0.0
},
{
"InstanceType": "t2.medium",
"SubnetId": "7654321",
"Priority": 1.0
},
{
"InstanceType": "t2.large",
"SubnetId": "1234567",
"Priority": 2.0
},
{
"InstanceType": "t2.large",
"SubnetId": "7654321",
"Priority": 3.0
}
]
}
],
"OnDemandOptions": {
"AllocationStrategy": "prioritized",
"SingleInstanceType": false,
"SingleAvailabilityZone": true,
"CapacityReservationOptions": {
"UsageStrategy": "use-capacity-reservations-first"
}
},
"TargetCapacitySpecification": {
"TotalTargetCapacity": 5,
"DefaultTargetCapacityType": "on-demand"
},
"Type": "instant"
}