diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index d698331e6..b0bff0883 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -61,6 +61,39 @@ def __init__(self, config: ClusterConfiguration):
         self.app_wrapper_yaml = self.create_app_wrapper()
         self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]

+    def evaluate_config(self):
+        if not self.evaluate_dispatch_priority():
+            return False
+        else:
+            return True
+
+    def evaluate_dispatch_priority(self):
+        priority_class = self.config.dispatch_priority
+        if priority_class is None:
+            return True
+        else:
+            try:
+                config_check()
+                api_instance = client.CustomObjectsApi(api_config_handler())
+                priority_classes = api_instance.list_cluster_custom_object(
+                    group="scheduling.k8s.io",
+                    version="v1",
+                    plural="priorityclasses",
+                )
+                available_priority_classes = [
+                    i["metadata"]["name"] for i in priority_classes["items"]
+                ]
+            except Exception as e:  # pragma: no cover
+                return _kube_api_error_handling(e)
+
+            if priority_class in available_priority_classes:
+                return True
+            else:
+                print(
+                    f"Priority class {priority_class} is not available in the cluster"
+                )
+                return False
+
     def create_app_wrapper(self):
         """
         Called upon cluster object creation, creates an AppWrapper yaml based on
@@ -91,6 +124,7 @@ def create_app_wrapper(self):
         env = self.config.envs
         local_interactive = self.config.local_interactive
         image_pull_secrets = self.config.image_pull_secrets
+        dispatch_priority = self.config.dispatch_priority
         return generate_appwrapper(
             name=name,
             namespace=namespace,
@@ -107,6 +141,7 @@ def create_app_wrapper(self):
             env=env,
             local_interactive=local_interactive,
             image_pull_secrets=image_pull_secrets,
+            dispatch_priority=dispatch_priority,
         )

     # creates a new cluster with the provided or default spec
@@ -115,6 +150,12 @@ def up(self):
         """
         Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue.
         """
+
+        # Before attempting to bring up the cluster let's evaluate the ClusterConfig
+        if not self.evaluate_config():
+            print("Invalid Cluster Configuration")
+            return False
+
         namespace = self.config.namespace
         try:
             config_check()
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index aed3674eb..cb935e79d 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -47,3 +47,4 @@ class ClusterConfiguration:
     image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116"
     local_interactive: bool = False
     image_pull_secrets: list = field(default_factory=list)
+    dispatch_priority: str = None
diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml
index 56014a068..386dd86bf 100644
--- a/src/codeflare_sdk/templates/base-template.yaml
+++ b/src/codeflare_sdk/templates/base-template.yaml
@@ -8,6 +8,8 @@ metadata:
     orderedinstance: "m4.xlarge_g4dn.xlarge"
 spec:
   priority: 9
+  schedulingSpec:
+    minAvailable: 3
   resources:
     Items: []
     GenericItems:
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 9c5804cd5..697122a2c 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -89,6 +89,14 @@ def update_labels(yaml, instascale, instance_types):
         metadata.pop("labels")


+def update_priority(item, dispatch_priority):
+    if dispatch_priority is not None:
+        head = item.get("generictemplate").get("spec").get("headGroupSpec")
+        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
+        head["template"]["spec"]["priorityClassName"] = dispatch_priority
+        worker["template"]["spec"]["priorityClassName"] = dispatch_priority
+
+
 def update_custompodresources(
     item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
 ):
@@ -175,6 +183,11 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
                 limits["nvidia.com/gpu"] = gpu


+def update_scheduling_spec(yaml, workers):
+    spec = yaml.get("spec")
+    spec["schedulingSpec"]["minAvailable"] = workers + 1
+
+
 def update_nodes(
     item,
     appwrapper_name,
@@ -346,6 +359,7 @@ def generate_appwrapper(
     env,
     local_interactive: bool,
     image_pull_secrets: list,
+    dispatch_priority: str,
 ):
     user_yaml = read_template(template)
     appwrapper_name, cluster_name = gen_names(name)
@@ -354,6 +368,8 @@ def generate_appwrapper(
     route_item = resources["resources"].get("GenericItems")[1]
     update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
     update_labels(user_yaml, instascale, instance_types)
+    update_priority(item, dispatch_priority)
+    update_scheduling_spec(user_yaml, workers)
     update_custompodresources(
         item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
     )
diff --git a/tests/test-case.yaml b/tests/test-case.yaml
index da0845ed9..df88e7bb0 100644
--- a/tests/test-case.yaml
+++ b/tests/test-case.yaml
@@ -109,6 +109,7 @@ spec:
                       nvidia.com/gpu: 0
                 imagePullSecrets:
                 - name: unit-test-pull-secret
+                priorityClassName: default
           rayVersion: 2.5.0
           workerGroupSpecs:
           - groupName: small-group-unit-test-cluster
@@ -176,6 +177,7 @@ spec:
                     do echo waiting for myservice; sleep 2; done
                   image: busybox:1.28
                   name: init-myservice
+                priorityClassName: default
       replicas: 1
     - generictemplate:
         apiVersion: route.openshift.io/v1
@@ -193,3 +195,5 @@ spec:
             name: unit-test-cluster-head-svc
       replicas: 1
     Items: []
+  schedulingSpec:
+    minAvailable: 3
diff --git a/tests/unit_test.py b/tests/unit_test.py
index 58f55bfa1..c518fa310 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -228,6 +228,7 @@ def test_config_creation():
         instascale=True,
         machine_types=["cpu.small", "gpu.large"],
         image_pull_secrets=["unit-test-pull-secret"],
+        dispatch_priority="default",
     )

     assert config.name == "unit-test-cluster" and config.namespace == "ns"
@@ -240,6 +241,7 @@ def test_config_creation():
     assert config.instascale
     assert config.machine_types == ["cpu.small", "gpu.large"]
     assert config.image_pull_secrets == ["unit-test-pull-secret"]
+    assert config.dispatch_priority == "default"
     return config


@@ -300,6 +302,10 @@ def test_cluster_up_down(mocker):
         "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
         side_effect=arg_check_del_effect,
     )
+    mocker.patch(
+        "kubernetes.client.CustomObjectsApi.list_cluster_custom_object",
+        return_value={"items": []},
+    )
     cluster = test_cluster_creation()
     cluster.up()
     cluster.down()
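
For context, a minimal sketch of how the new `dispatch_priority` option would be exercised from user code, based on the `ClusterConfiguration` and `Cluster.up()` changes above. The cluster name, namespace, and `num_workers` value are illustrative assumptions, not values from this patch:

```python
# Hypothetical usage sketch of dispatch_priority; names and sizes below
# are placeholders, not taken from this patch.
from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration

config = ClusterConfiguration(
    name="prio-test-cluster",      # placeholder cluster name
    namespace="default",           # placeholder namespace
    num_workers=2,
    dispatch_priority="default",   # must name an existing PriorityClass
)

cluster = Cluster(config)

# up() now calls evaluate_config() first: if the named PriorityClass is
# not found among the cluster's priorityclasses, it prints
# "Invalid Cluster Configuration" and returns False instead of pushing
# the AppWrapper onto the MCAD queue.
cluster.up()
```

With `num_workers=2`, `update_scheduling_spec` sets `schedulingSpec.minAvailable` to 3 (all workers plus the head), and `update_priority` stamps `priorityClassName` onto both the head and worker pod templates of the generated AppWrapper.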