Skip to content

Commit 96e3a8e

Browse files
committed
Split head memory and cpu requests/limits
1 parent a36ebdb commit 96e3a8e

11 files changed

+119
-54
lines changed

src/codeflare_sdk/cluster/cluster.py

+24-12
Original file line number | Diff line number | Diff line change
@@ -135,8 +135,10 @@ def create_app_wrapper(self):
135135

136136
name = self.config.name
137137
namespace = self.config.namespace
138-
head_cpus = self.config.head_cpus
139-
head_memory = self.config.head_memory
138+
head_cpu_requests = self.config.head_cpu_requests
139+
head_cpu_limits = self.config.head_cpu_limits
140+
head_memory_requests = self.config.head_memory_requests
141+
head_memory_limits = self.config.head_memory_limits
140142
num_head_gpus = self.config.num_head_gpus
141143
worker_cpu_requests = self.config.worker_cpu_requests
142144
worker_cpu_limits = self.config.worker_cpu_limits
@@ -155,8 +157,10 @@ def create_app_wrapper(self):
155157
return generate_appwrapper(
156158
name=name,
157159
namespace=namespace,
158-
head_cpus=head_cpus,
159-
head_memory=head_memory,
160+
head_cpu_requests=head_cpu_requests,
161+
head_cpu_limits=head_cpu_limits,
162+
head_memory_requests=head_memory_requests,
163+
head_memory_limits=head_memory_limits,
160164
num_head_gpus=num_head_gpus,
161165
worker_cpu_requests=worker_cpu_requests,
162166
worker_cpu_limits=worker_cpu_limits,
@@ -887,12 +891,18 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
887891
]["resources"]["limits"]["cpu"],
888892
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
889893
namespace=rc["metadata"]["namespace"],
890-
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
891-
"resources"
892-
]["limits"]["cpu"],
893-
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
894-
"resources"
895-
]["limits"]["memory"],
894+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
895+
0
896+
]["resources"]["requests"]["cpu"],
897+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
898+
0
899+
]["resources"]["limits"]["cpu"],
900+
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
901+
0
902+
]["resources"]["requests"]["memory"],
903+
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
904+
0
905+
]["resources"]["limits"]["memory"],
896906
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
897907
"resources"
898908
]["limits"]["nvidia.com/gpu"],
@@ -923,8 +933,10 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
923933
worker_gpu=cluster.config.num_worker_gpus,
924934
namespace=cluster.config.namespace,
925935
dashboard=cluster.cluster_dashboard_uri(),
926-
head_cpus=cluster.config.head_cpus,
927-
head_mem=cluster.config.head_memory,
936+
head_mem_requests=cluster.config.head_memory_requests,
937+
head_mem_limits=cluster.config.head_memory_limits,
938+
head_cpu_requests=cluster.config.head_cpu_requests,
939+
head_cpu_limits=cluster.config.head_cpu_limits,
928940
head_gpu=cluster.config.num_head_gpus,
929941
)
930942
if ray.status == CodeFlareClusterStatus.READY:

src/codeflare_sdk/cluster/config.py

+30-6
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,12 @@ class ClusterConfiguration:
3636
name: str
3737
namespace: str = None
3838
head_info: list = field(default_factory=list)
39-
head_cpus: typing.Union[int, str] = 2
40-
head_memory: typing.Union[int, str] = 8
39+
head_cpu_requests: typing.Union[int, str] = 2
40+
head_cpu_limits: typing.Union[int, str] = 2
41+
head_cpus: typing.Union[int, str] = None # Deprecating
42+
head_memory_requests: typing.Union[int, str] = 8
43+
head_memory_limits: typing.Union[int, str] = 8
44+
head_memory: typing.Union[int, str] = None # Deprecating
4145
head_gpus: int = None # Deprecating
4246
num_head_gpus: int = 0
4347
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
@@ -74,8 +78,16 @@ def __post_init__(self):
7478
self._cpu_to_resource()
7579

7680
def _str_mem_no_unit_add_GB(self):
77-
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
78-
self.head_memory = f"{self.head_memory}G"
81+
if (
82+
isinstance(self.head_memory_requests, str)
83+
and self.head_memory_requests.isdecimal()
84+
):
85+
self.head_memory_requests = f"{self.head_memory_requests}G"
86+
if (
87+
isinstance(self.head_memory_limits, str)
88+
and self.head_memory_limits.isdecimal()
89+
):
90+
self.head_memory_limits = f"{self.head_memory_limits}G"
7991
if (
8092
isinstance(self.worker_memory_requests, str)
8193
and self.worker_memory_requests.isdecimal()
@@ -88,8 +100,10 @@ def _str_mem_no_unit_add_GB(self):
88100
self.worker_memory_limits = f"{self.worker_memory_limits}G"
89101

90102
def _memory_to_string(self):
91-
if isinstance(self.head_memory, int):
92-
self.head_memory = f"{self.head_memory}G"
103+
if isinstance(self.head_memory_requests, int):
104+
self.head_memory_requests = f"{self.head_memory_requests}G"
105+
if isinstance(self.head_memory_limits, int):
106+
self.head_memory_limits = f"{self.head_memory_limits}G"
93107
if isinstance(self.worker_memory_requests, int):
94108
self.worker_memory_requests = f"{self.worker_memory_requests}G"
95109
if isinstance(self.worker_memory_limits, int):
@@ -104,6 +118,11 @@ def _gpu_to_resource(self):
104118
self.num_worker_gpus = self.num_gpus
105119

106120
def _cpu_to_resource(self):
121+
if self.head_cpus:
122+
warnings.warn(
123+
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
124+
)
125+
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
107126
if self.min_cpus:
108127
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
109128
self.worker_cpu_requests = self.min_cpus
@@ -112,6 +131,11 @@ def _cpu_to_resource(self):
112131
self.worker_cpu_limits = self.max_cpus
113132

114133
def _memory_to_resource(self):
134+
if self.head_memory:
135+
warnings.warn(
136+
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
137+
)
138+
self.head_memory_requests = self.head_memory_limits = self.head_memory
115139
if self.min_memory:
116140
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
117141
self.worker_memory_requests = f"{self.min_memory}G"

src/codeflare_sdk/cluster/model.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -72,8 +72,10 @@ class RayCluster:
7272

7373
name: str
7474
status: RayClusterStatus
75-
head_cpus: int
76-
head_mem: str
75+
head_cpu_requests: int
76+
head_cpu_limits: int
77+
head_mem_requests: str
78+
head_mem_limits: str
7779
head_gpu: int
7880
workers: int
7981
worker_mem_min: str

src/codeflare_sdk/utils/generate_yaml.py

+29-18
Original file line number | Diff line number | Diff line change
@@ -108,24 +108,24 @@ def update_env(spec, env):
108108

109109
def update_resources(
110110
spec,
111-
worker_cpu_requests,
112-
worker_cpu_limits,
113-
worker_memory_requests,
114-
worker_memory_limits,
115-
num_worker_gpus,
111+
cpu_requests,
112+
cpu_limits,
113+
memory_requests,
114+
memory_limits,
115+
num_gpus,
116116
):
117117
container = spec.get("containers")
118118
for resource in container:
119119
requests = resource.get("resources").get("requests")
120120
if requests is not None:
121-
requests["cpu"] = worker_cpu_requests
122-
requests["memory"] = worker_memory_requests
123-
requests["nvidia.com/gpu"] = num_worker_gpus
121+
requests["cpu"] = cpu_requests
122+
requests["memory"] = memory_requests
123+
requests["nvidia.com/gpu"] = num_gpus
124124
limits = resource.get("resources").get("limits")
125125
if limits is not None:
126-
limits["cpu"] = worker_cpu_limits
127-
limits["memory"] = worker_memory_limits
128-
limits["nvidia.com/gpu"] = num_worker_gpus
126+
limits["cpu"] = cpu_limits
127+
limits["memory"] = memory_limits
128+
limits["nvidia.com/gpu"] = num_gpus
129129

130130

131131
def update_nodes(
@@ -140,8 +140,10 @@ def update_nodes(
140140
image,
141141
env,
142142
image_pull_secrets,
143-
head_cpus,
144-
head_memory,
143+
head_cpu_requests,
144+
head_cpu_limits,
145+
head_memory_requests,
146+
head_memory_limits,
145147
num_head_gpus,
146148
):
147149
head = cluster_yaml.get("spec").get("headGroupSpec")
@@ -163,7 +165,12 @@ def update_nodes(
163165
if comp == head:
164166
# TODO: Eventually add head node configuration outside of template
165167
update_resources(
166-
spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
168+
spec,
169+
head_cpu_requests,
170+
head_cpu_limits,
171+
head_memory_requests,
172+
head_memory_limits,
173+
num_head_gpus,
167174
)
168175
else:
169176
update_resources(
@@ -277,8 +284,10 @@ def write_user_yaml(user_yaml, output_file_name):
277284
def generate_appwrapper(
278285
name: str,
279286
namespace: str,
280-
head_cpus: int,
281-
head_memory: int,
287+
head_cpu_requests: int,
288+
head_cpu_limits: int,
289+
head_memory_requests: int,
290+
head_memory_limits: int,
282291
num_head_gpus: int,
283292
worker_cpu_requests: int,
284293
worker_cpu_limits: int,
@@ -310,8 +319,10 @@ def generate_appwrapper(
310319
image,
311320
env,
312321
image_pull_secrets,
313-
head_cpus,
314-
head_memory,
322+
head_cpu_requests,
323+
head_cpu_limits,
324+
head_memory_requests,
325+
head_memory_limits,
315326
num_head_gpus,
316327
)
317328
augment_labels(cluster_yaml, labels)

tests/e2e/local_interactive_sdk_kind_test.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,10 @@ def run_local_interactives(self):
3737
name=cluster_name,
3838
namespace=self.namespace,
3939
num_workers=1,
40-
head_cpus="500m",
41-
head_memory=2,
40+
head_cpu_requests="500m",
41+
head_cpu_limits="500m",
42+
head_memory_requests=2,
43+
head_memory_limits=2,
4244
worker_cpu_requests="500m",
4345
worker_cpu_limits=1,
4446
worker_memory_requests=1,

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,10 @@ def run_mnist_raycluster_sdk_kind(self):
3434
name="mnist",
3535
namespace=self.namespace,
3636
num_workers=1,
37-
head_cpus="500m",
38-
head_memory=2,
37+
head_cpu_requests="500m",
38+
head_cpu_limits="500m",
39+
head_memory_requests=2,
40+
head_memory_limits=2,
3941
min_cpus="500m",
4042
max_cpus=1,
4143
min_memory=1,

tests/e2e/mnist_raycluster_sdk_kind_test.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,10 @@ def run_mnist_raycluster_sdk_kind(self):
3535
name="mnist",
3636
namespace=self.namespace,
3737
num_workers=1,
38-
head_cpus="500m",
39-
head_memory=2,
38+
head_cpu_requests="500m",
39+
head_cpu_limits="500m",
40+
head_memory_requests=2,
41+
head_memory_limits=2,
4042
worker_cpu_requests="500m",
4143
worker_cpu_limits=1,
4244
worker_memory_requests=1,

tests/e2e/mnist_raycluster_sdk_oauth_test.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_oauth(self):
4242
name="mnist",
4343
namespace=self.namespace,
4444
num_workers=1,
45-
head_cpus="500m",
46-
head_memory=2,
45+
head_cpu_requests="500m",
46+
head_cpu_limits="500m",
47+
head_memory_requests=2,
48+
head_memory_limits=2,
4749
worker_cpu_requests="500m",
4850
worker_cpu_limits=1,
4951
worker_memory_requests=1,

tests/e2e/start_ray_cluster.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,10 @@
1313
name="mnist",
1414
namespace=namespace,
1515
num_workers=1,
16-
head_cpus="500m",
17-
head_memory=2,
16+
head_cpu_requests="500m",
17+
head_cpu_limits="500m",
18+
head_memory_requests=2,
19+
head_memory_limits=2,
1820
worker_cpu_requests="500m",
1921
worker_cpu_limits=1,
2022
worker_memory_requests=1,

tests/unit_test.py

+8-4
Original file line number | Diff line number | Diff line change
@@ -877,8 +877,10 @@ def test_ray_details(mocker, capsys):
877877
worker_gpu=0,
878878
namespace="ns",
879879
dashboard="fake-uri",
880-
head_cpus=2,
881-
head_mem=8,
880+
head_cpu_requests=2,
881+
head_cpu_limits=2,
882+
head_mem_limits=8,
883+
head_mem_requests=8,
882884
head_gpu=0,
883885
)
884886
mocker.patch(
@@ -2304,8 +2306,10 @@ def test_cluster_status(mocker):
23042306
worker_gpu=0,
23052307
namespace="ns",
23062308
dashboard="fake-uri",
2307-
head_cpus=2,
2308-
head_mem=8,
2309+
head_cpu_requests=2,
2310+
head_cpu_limits=2,
2311+
head_mem_limits=8,
2312+
head_mem_requests=8,
23092313
head_gpu=0,
23102314
)
23112315
cf = Cluster(

tests/upgrade/raycluster_sdk_upgrade_test.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,10 @@ def run_mnist_raycluster_sdk_oauth(self):
4848
name="mnist",
4949
namespace=self.namespace,
5050
num_workers=1,
51-
head_cpus=1,
52-
head_memory=2,
51+
head_cpu_requests=1,
52+
head_cpu_limits=1,
53+
head_memory_requests=2,
54+
head_memory_limits=2,
5355
worker_cpu_requests=1,
5456
worker_cpu_limits=1,
5557
worker_memory_requests=1,

0 commit comments

Comments
 (0)