Get cluster (#189)

carsonmh · web-flow · commit a8e9156964cc · 2023-07-06T15:21:11.000-04:00
* Add: get_cluster function to get cluster with specified name and namespace

* Test: make unit tests for get_cluster function
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
@@ -301,6 +301,49 @@ def torchx_config(
             to_return["requirements"] = requirements
         return to_return
 
+    def from_k8_cluster_object(rc):
+        machine_types = (
+            rc["metadata"]["labels"]["orderedinstance"].split("_")
+            if "orderedinstance" in rc["metadata"]["labels"]
+            else []
+        )
+        local_interactive = (
+            "volumeMounts"
+            in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0]
+        )
+        cluster_config = ClusterConfiguration(
+            name=rc["metadata"]["name"],
+            namespace=rc["metadata"]["namespace"],
+            machine_types=machine_types,
+            min_worker=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
+            max_worker=rc["spec"]["workerGroupSpecs"][0]["maxReplicas"],
+            min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["requests"]["cpu"],
+            max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+                "containers"
+            ][0]["resources"]["limits"]["cpu"],
+            min_memory=int(
+                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
+                    "resources"
+                ]["requests"]["memory"][:-1]
+            ),
+            max_memory=int(
+                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
+                    "resources"
+                ]["limits"]["memory"][:-1]
+            ),
+            gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
+                "resources"
+            ]["limits"]["nvidia.com/gpu"],
+            instascale=True if machine_types else False,
+            image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
+                0
+            ]["image"],
+            local_interactive=local_interactive,
+        )
+        return Cluster(cluster_config)
+
 
 def list_all_clusters(namespace: str, print_to_console: bool = True):
     """
@@ -337,6 +380,27 @@ def get_current_namespace():  # pragma: no cover
         return "default"
 
 
+def get_cluster(cluster_name: str, namespace: str = "default"):
+    try:
+        config.load_kube_config()
+        api_instance = client.CustomObjectsApi()
+        rcs = api_instance.list_namespaced_custom_object(
+            group="ray.io",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="rayclusters",
+        )
+    except Exception as e:
+        return _kube_api_error_handling(e)
+
+    for rc in rcs["items"]:
+        if rc["metadata"]["name"] == cluster_name:
+            return Cluster.from_k8_cluster_object(rc)
+    raise FileNotFoundError(
+        f"Cluster {cluster_name} is not found in {namespace} namespace"
+    )
+
+
 # private methods
 
 
diff --git a/tests/unit_test.py b/tests/unit_test.py
@@ -29,6 +29,7 @@
     list_all_clusters,
     list_all_queued,
     _copy_to_ray,
+    get_cluster,
     _app_wrapper_status,
     _ray_cluster_status,
 )
@@ -615,6 +616,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
                         "appwrapper.mcad.ibm.com": "quicktest",
                         "controller-tools.k8s.io": "1.0",
                         "resourceName": "quicktest",
+                        "orderedinstance": "m4.xlarge_g4dn.xlarge",
                     },
                     "managedFields": [
                         {
@@ -792,10 +794,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
                     "workerGroupSpecs": [
                         {
                             "groupName": "small-group-quicktest",
-                            "maxReplicas": 1,
-                            "minReplicas": 1,
+                            "maxReplicas": 2,
+                            "minReplicas": 2,
                             "rayStartParams": {"block": "true", "num-gpus": "0"},
-                            "replicas": 1,
+                            "replicas": 2,
                             "template": {
                                 "metadata": {
                                     "annotations": {"key": "value"},
@@ -1530,6 +1532,30 @@ def get_aw_obj(group, version, namespace, plural):
     return api_obj1
 
 
+def test_get_cluster(mocker):
+    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+    mocker.patch(
+        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+        side_effect=get_ray_obj,
+    )
+    cluster = get_cluster("quicktest")
+    cluster_config = cluster.config
+    assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns"
+    assert (
+        "m4.xlarge" in cluster_config.machine_types
+        and "g4dn.xlarge" in cluster_config.machine_types
+    )
+    assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
+    assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
+    assert cluster_config.gpu == 0
+    assert cluster_config.instascale
+    assert (
+        cluster_config.image
+        == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
+    )
+    assert cluster_config.min_worker == 2 and cluster_config.max_worker == 2
+
+
 def test_list_clusters(mocker, capsys):
     mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
     mocker.patch(