Skip to content

Commit a8e9156

Browse files
authored
Get cluster (#189)
* Add: get_cluster function to get cluster with specified name and namespace * Test: make unit tests for get_cluster function
1 parent 18ae25e commit a8e9156

File tree

2 files changed

+93
-3
lines changed

2 files changed

+93
-3
lines changed

src/codeflare_sdk/cluster/cluster.py

+64
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,49 @@ def torchx_config(
301301
to_return["requirements"] = requirements
302302
return to_return
303303

304+
def from_k8_cluster_object(rc):
    """
    Build a ``Cluster`` from a RayCluster custom-resource dictionary.

    Only the first entry of ``spec.workerGroupSpecs`` and the first container
    of its pod template are inspected; any further worker groups or
    containers are ignored.

    Args:
        rc: The RayCluster object as a nested dict (for example one element
            of the ``items`` list returned by the Kubernetes custom objects
            API).

    Returns:
        A ``Cluster`` wrapping a ``ClusterConfiguration`` populated from
        ``rc``.

    Raises:
        KeyError / IndexError: If ``rc`` lacks the name, namespace, worker
            group, container, image or cpu/memory requests and limits.
        ValueError: If a memory quantity has no leading integer part.
    """

    def _memory_to_int(quantity):
        # Strip the complete trailing unit suffix rather than a single
        # character, so "2G" and "2Gi" both yield 2 and a suffix-less
        # quantity keeps all of its digits. The unit itself is discarded.
        # NOTE(review): presumably quantities are expressed in gigabytes,
        # as in the unit-test fixture -- confirm against the cluster template.
        digits = str(quantity)
        while digits and digits[-1].isalpha():
            digits = digits[:-1]
        return int(digits)

    metadata = rc["metadata"]
    # Labels are optional on a Kubernetes object; treat a missing/None map
    # as "no labels" instead of failing with KeyError.
    labels = metadata.get("labels") or {}
    machine_types = (
        labels["orderedinstance"].split("_") if "orderedinstance" in labels else []
    )

    # Resolve the shared lookup chain once instead of repeating it per field.
    worker_group = rc["spec"]["workerGroupSpecs"][0]
    container = worker_group["template"]["spec"]["containers"][0]
    requests = container["resources"]["requests"]
    limits = container["resources"]["limits"]

    cluster_config = ClusterConfiguration(
        name=metadata["name"],
        namespace=metadata["namespace"],
        machine_types=machine_types,
        min_worker=worker_group["minReplicas"],
        max_worker=worker_group["maxReplicas"],
        min_cpus=requests["cpu"],
        max_cpus=limits["cpu"],
        min_memory=_memory_to_int(requests["memory"]),
        max_memory=_memory_to_int(limits["memory"]),
        # Workers without a GPU limit are reported as having 0 GPUs rather
        # than raising KeyError.
        gpu=limits.get("nvidia.com/gpu", 0),
        # Machine types are only present when the "orderedinstance" label is.
        instascale=bool(machine_types),
        image=container["image"],
        # A "volumeMounts" entry on the worker container is used as the
        # marker for local-interactive mode.
        local_interactive="volumeMounts" in container,
    )
    return Cluster(cluster_config)
346+
304347

305348
def list_all_clusters(namespace: str, print_to_console: bool = True):
306349
"""
@@ -337,6 +380,27 @@ def get_current_namespace(): # pragma: no cover
337380
return "default"
338381

339382

383+
def get_cluster(cluster_name: str, namespace: str = "default"):
    """
    Look up a RayCluster by name and return it as a ``Cluster`` object.

    All ``rayclusters`` (group ``ray.io``, version ``v1alpha1``) in
    ``namespace`` are listed and the first one whose ``metadata.name``
    equals ``cluster_name`` is converted with
    ``Cluster.from_k8_cluster_object``.

    Args:
        cluster_name: Name of the RayCluster to find.
        namespace: Namespace to search; defaults to ``"default"``.

    Returns:
        The matching ``Cluster``. If loading the kube config or listing the
        resources raises, the result of ``_kube_api_error_handling`` is
        returned instead.

    Raises:
        FileNotFoundError: If no RayCluster in ``namespace`` has that name.
    """
    try:
        config.load_kube_config()
        custom_objects_api = client.CustomObjectsApi()
        ray_clusters = custom_objects_api.list_namespaced_custom_object(
            group="ray.io",
            version="v1alpha1",
            namespace=namespace,
            plural="rayclusters",
        )
    except Exception as e:
        return _kube_api_error_handling(e)

    # Pick the first resource with the requested name, if any.
    matching = next(
        (
            candidate
            for candidate in ray_clusters["items"]
            if candidate["metadata"]["name"] == cluster_name
        ),
        None,
    )
    if matching is None:
        raise FileNotFoundError(
            f"Cluster {cluster_name} is not found in {namespace} namespace"
        )
    return Cluster.from_k8_cluster_object(matching)
402+
403+
340404
# private methods
341405

342406

tests/unit_test.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
list_all_clusters,
3030
list_all_queued,
3131
_copy_to_ray,
32+
get_cluster,
3233
_app_wrapper_status,
3334
_ray_cluster_status,
3435
)
@@ -615,6 +616,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
615616
"appwrapper.mcad.ibm.com": "quicktest",
616617
"controller-tools.k8s.io": "1.0",
617618
"resourceName": "quicktest",
619+
"orderedinstance": "m4.xlarge_g4dn.xlarge",
618620
},
619621
"managedFields": [
620622
{
@@ -792,10 +794,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
792794
"workerGroupSpecs": [
793795
{
794796
"groupName": "small-group-quicktest",
795-
"maxReplicas": 1,
796-
"minReplicas": 1,
797+
"maxReplicas": 2,
798+
"minReplicas": 2,
797799
"rayStartParams": {"block": "true", "num-gpus": "0"},
798-
"replicas": 1,
800+
"replicas": 2,
799801
"template": {
800802
"metadata": {
801803
"annotations": {"key": "value"},
@@ -1530,6 +1532,30 @@ def get_aw_obj(group, version, namespace, plural):
15301532
return api_obj1
15311533

15321534

1535+
def test_get_cluster(mocker):
    """get_cluster rebuilds the expected configuration from the mocked RayCluster."""
    # Avoid touching a real kube config or API server: the listing call is
    # served by the get_ray_obj fixture.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        side_effect=get_ray_obj,
    )

    cfg = get_cluster("quicktest").config

    # Identity
    assert cfg.name == "quicktest"
    assert cfg.namespace == "ns"

    # Machine types come from the "orderedinstance" label.
    for expected_type in ("m4.xlarge", "g4dn.xlarge"):
        assert expected_type in cfg.machine_types
    assert cfg.instascale

    # Worker resources
    assert cfg.min_cpus == 1
    assert cfg.max_cpus == 1
    assert cfg.min_memory == 2
    assert cfg.max_memory == 2
    assert cfg.gpu == 0
    assert (
        cfg.image
        == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
    )

    # Replica bounds
    assert cfg.min_worker == 2
    assert cfg.max_worker == 2
1557+
1558+
15331559
def test_list_clusters(mocker, capsys):
15341560
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
15351561
mocker.patch(

0 commit comments

Comments
 (0)