Skip to content

Commit ec47475

Browse files
feat: split head resources for limits and requests
Signed-off-by: Bobbins228 <[email protected]>
1 parent 9794a0f commit ec47475

File tree

5 files changed

+77
-35
lines changed

5 files changed

+77
-35
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -462,6 +462,18 @@ def from_k8_cluster_object(
462462
name=rc["metadata"]["name"],
463463
namespace=rc["metadata"]["namespace"],
464464
machine_types=machine_types,
465+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
466+
"containers"
467+
][0]["resources"]["requests"]["cpu"],
468+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
469+
"containers"
470+
][0]["resources"]["limits"]["cpu"],
471+
head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
472+
"containers"
473+
][0]["resources"]["requests"]["memory"],
474+
head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
475+
"containers"
476+
][0]["resources"]["limits"]["memory"],
465477
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
466478
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
467479
"containers"
@@ -851,23 +863,29 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
851863
status=status,
852864
# for now we are not using autoscaling so same replicas is fine
853865
workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
854-
worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
866+
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
855867
"containers"
856868
][0]["resources"]["limits"]["memory"],
857-
worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
869+
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
858870
"containers"
859871
][0]["resources"]["requests"]["memory"],
860872
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
861873
0
862874
]["resources"]["limits"]["cpu"],
863875
worker_extended_resources=worker_extended_resources,
864876
namespace=rc["metadata"]["namespace"],
865-
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
866-
"resources"
867-
]["limits"]["cpu"],
868-
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
869-
"resources"
870-
]["limits"]["memory"],
877+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
878+
0
879+
]["resources"]["requests"]["cpu"],
880+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
881+
0
882+
]["resources"]["limits"]["cpu"],
883+
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
884+
0
885+
]["resources"]["requests"]["memory"],
886+
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
887+
0
888+
]["resources"]["limits"]["memory"],
871889
head_extended_resources=head_extended_resources,
872890
dashboard=dashboard_url,
873891
)
@@ -890,14 +908,16 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
890908
name=cluster.config.name,
891909
status=cluster.status(print_to_console=False)[0],
892910
workers=cluster.config.num_workers,
893-
worker_mem_min=cluster.config.worker_memory_requests,
894-
worker_mem_max=cluster.config.worker_memory_limits,
911+
worker_mem_requests=cluster.config.worker_memory_requests,
912+
worker_mem_limits=cluster.config.worker_memory_limits,
895913
worker_cpu=cluster.config.worker_cpu_requests,
896914
worker_extended_resources=cluster.config.worker_extended_resource_requests,
897915
namespace=cluster.config.namespace,
898916
dashboard=cluster.cluster_dashboard_uri(),
899-
head_cpus=cluster.config.head_cpus,
900-
head_mem=cluster.config.head_memory,
917+
head_mem_requests=cluster.config.head_memory_requests,
918+
head_mem_limits=cluster.config.head_memory_limits,
919+
head_cpu_requests=cluster.config.head_cpu_requests,
920+
head_cpu_limits=cluster.config.head_cpu_limits,
901921
head_extended_resources=cluster.config.head_extended_resource_requests,
902922
)
903923
if ray.status == CodeFlareClusterStatus.READY:

src/codeflare_sdk/cluster/config.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,16 @@ class ClusterConfiguration:
7575
name: str
7676
namespace: Optional[str] = None
7777
head_info: List[str] = field(default_factory=list)
78-
head_cpus: Union[int, str] = 2
79-
head_memory: Union[int, str] = 8
78+
head_cpu_requests: Union[int, str] = 2
79+
head_cpu_limits: Union[int, str] = 2
80+
head_cpus: Optional[Union[int, str]] = None # Deprecating
81+
head_memory_requests: Union[int, str] = 8
82+
head_memory_limits: Union[int, str] = 8
83+
head_memory: Optional[Union[int, str]] = None # Deprecating
8084
head_gpus: Optional[int] = None # Deprecating
81-
head_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
85+
head_extended_resource_requests: Dict[str, Union[str, int]] = field(
86+
default_factory=dict
87+
)
8288
machine_types: List[str] = field(
8389
default_factory=list
8490
) # ["m4.xlarge", "g4dn.xlarge"]
@@ -100,7 +106,9 @@ class ClusterConfiguration:
100106
write_to_file: bool = False
101107
verify_tls: bool = True
102108
labels: Dict[str, str] = field(default_factory=dict)
103-
worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
109+
worker_extended_resource_requests: Dict[str, Union[str, int]] = field(
110+
default_factory=dict
111+
)
104112
extended_resource_mapping: Dict[str, str] = field(default_factory=dict)
105113
overwrite_default_resource_mapping: bool = False
106114
local_queue: Optional[str] = None
@@ -183,14 +191,21 @@ def _str_mem_no_unit_add_GB(self):
183191
self.worker_memory_limits = f"{self.worker_memory_limits}G"
184192

185193
def _memory_to_string(self):
186-
if isinstance(self.head_memory, int):
187-
self.head_memory = f"{self.head_memory}G"
194+
if isinstance(self.head_memory_requests, int):
195+
self.head_memory_requests = f"{self.head_memory_requests}G"
196+
if isinstance(self.head_memory_limits, int):
197+
self.head_memory_limits = f"{self.head_memory_limits}G"
188198
if isinstance(self.worker_memory_requests, int):
189199
self.worker_memory_requests = f"{self.worker_memory_requests}G"
190200
if isinstance(self.worker_memory_limits, int):
191201
self.worker_memory_limits = f"{self.worker_memory_limits}G"
192202

193203
def _cpu_to_resource(self):
204+
if self.head_cpus:
205+
warnings.warn(
206+
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
207+
)
208+
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
194209
if self.min_cpus:
195210
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
196211
self.worker_cpu_requests = self.min_cpus
@@ -199,6 +214,11 @@ def _cpu_to_resource(self):
199214
self.worker_cpu_limits = self.max_cpus
200215

201216
def _memory_to_resource(self):
217+
if self.head_memory:
218+
warnings.warn(
219+
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
220+
)
221+
self.head_memory_requests = self.head_memory_limits = self.head_memory
202222
if self.min_memory:
203223
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
204224
self.worker_memory_requests = f"{self.min_memory}G"

src/codeflare_sdk/cluster/model.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,13 @@ class RayCluster:
7373

7474
name: str
7575
status: RayClusterStatus
76-
head_cpus: int
77-
head_mem: str
76+
head_cpu_requests: int
77+
head_cpu_limits: int
78+
head_mem_requests: str
79+
head_mem_limits: str
7880
workers: int
79-
worker_mem_min: str
80-
worker_mem_max: str
81+
worker_mem_requests: str
82+
worker_mem_limits: str
8183
worker_cpu: int
8284
namespace: str
8385
dashboard: str

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -115,22 +115,22 @@ def update_env(spec, env):
115115

116116
def update_resources(
117117
spec,
118-
worker_cpu_requests,
119-
worker_cpu_limits,
120-
worker_memory_requests,
121-
worker_memory_limits,
118+
cpu_requests,
119+
cpu_limits,
120+
memory_requests,
121+
memory_limits,
122122
custom_resources,
123123
):
124124
container = spec.get("containers")
125125
for resource in container:
126126
requests = resource.get("resources").get("requests")
127127
if requests is not None:
128-
requests["cpu"] = worker_cpu_requests
129-
requests["memory"] = worker_memory_requests
128+
requests["cpu"] = cpu_requests
129+
requests["memory"] = memory_requests
130130
limits = resource.get("resources").get("limits")
131131
if limits is not None:
132-
limits["cpu"] = worker_cpu_limits
133-
limits["memory"] = worker_memory_limits
132+
limits["cpu"] = cpu_limits
133+
limits["memory"] = memory_limits
134134
for k in custom_resources.keys():
135135
limits[k] = custom_resources[k]
136136
requests[k] = custom_resources[k]
@@ -210,10 +210,10 @@ def update_nodes(
210210
# TODO: Eventually add head node configuration outside of template
211211
update_resources(
212212
spec,
213-
cluster.config.head_cpus,
214-
cluster.config.head_cpus,
215-
cluster.config.head_memory,
216-
cluster.config.head_memory,
213+
cluster.config.head_cpu_requests,
214+
cluster.config.head_cpu_limits,
215+
cluster.config.head_memory_requests,
216+
cluster.config.head_memory_limits,
217217
cluster.config.head_extended_resource_requests,
218218
)
219219
else:

src/codeflare_sdk/utils/pretty_print.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
136136
name = cluster.name
137137
dashboard = cluster.dashboard
138138
workers = str(cluster.workers)
139-
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
139+
memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}"
140140
cpu = str(cluster.worker_cpu)
141141
gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))
142142

0 commit comments

Comments
 (0)