diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 88081a455..99b11582d 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -109,6 +109,9 @@ def create_app_wrapper(self):
 
         name = self.config.name
         namespace = self.config.namespace
+        head_cpus = self.config.head_cpus
+        head_memory = self.config.head_memory
+        head_gpus = self.config.head_gpus
         min_cpu = self.config.min_cpus
         max_cpu = self.config.max_cpus
         min_memory = self.config.min_memory
@@ -126,6 +129,9 @@ def create_app_wrapper(self):
         return generate_appwrapper(
             name=name,
             namespace=namespace,
+            head_cpus=head_cpus,
+            head_memory=head_memory,
+            head_gpus=head_gpus,
             min_cpu=min_cpu,
             max_cpu=max_cpu,
             min_memory=min_memory,
@@ -608,6 +614,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
         worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
         namespace=rc["metadata"]["namespace"],
         dashboard=ray_route,
+        head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["cpu"],
+        head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["memory"],
+        head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["nvidia.com/gpu"],
     )
 
 
@@ -638,6 +653,9 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
         worker_gpu=cluster.config.num_gpus,
         namespace=cluster.config.namespace,
         dashboard=cluster.cluster_dashboard_uri(),
+        head_cpus=cluster.config.head_cpus,
+        head_mem=cluster.config.head_memory,
+        head_gpu=cluster.config.head_gpus,
     )
     if ray.status == CodeFlareClusterStatus.READY:
         ray.status = RayClusterStatus.READY
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index cb935e79d..bde3f4ca0 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -34,6 +34,9 @@ class ClusterConfiguration:
     name: str
     namespace: str = None
     head_info: list = field(default_factory=list)
+    head_cpus: int = 2
+    head_memory: int = 8
+    head_gpus: int = 0
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
     min_cpus: int = 1
     max_cpus: int = 1
diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py
index 639cc7340..fb8873e46 100644
--- a/src/codeflare_sdk/cluster/model.py
+++ b/src/codeflare_sdk/cluster/model.py
@@ -69,6 +69,9 @@ class RayCluster:
 
     name: str
     status: RayClusterStatus
+    head_cpus: int
+    head_mem: str
+    head_gpu: int
     workers: int
     worker_mem_min: str
     worker_mem_max: str
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 54196fce8..95e1c5ecb 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -107,35 +107,51 @@ def update_priority(yaml, item, dispatch_priority, priority_val):
 
 
 def update_custompodresources(
-    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+    item,
+    min_cpu,
+    max_cpu,
+    min_memory,
+    max_memory,
+    gpu,
+    workers,
+    head_cpus,
+    head_memory,
+    head_gpus,
 ):
     if "custompodresources" in item.keys():
         custompodresources = item.get("custompodresources")
         for i in range(len(custompodresources)):
+            resource = custompodresources[i]
             if i == 0:
                 # Leave head node resources as template default
-                continue
-            resource = custompodresources[i]
-            for k, v in resource.items():
-                if k == "replicas" and i == 1:
-                    resource[k] = workers
-                if k == "requests" or k == "limits":
-                    for spec, _ in v.items():
-                        if spec == "cpu":
-                            if k == "limits":
-                                resource[k][spec] = max_cpu
-                            else:
-                                resource[k][spec] = min_cpu
-                        if spec == "memory":
-                            if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
-                            else:
-                                resource[k][spec] = str(min_memory) + "G"
-                        if spec == "nvidia.com/gpu":
-                            if i == 0:
-                                resource[k][spec] = 0
-                            else:
-                                resource[k][spec] = gpu
+                resource["requests"]["cpu"] = head_cpus
+                resource["limits"]["cpu"] = head_cpus
+                resource["requests"]["memory"] = str(head_memory) + "G"
+                resource["limits"]["memory"] = str(head_memory) + "G"
+                resource["requests"]["nvidia.com/gpu"] = head_gpus
+                resource["limits"]["nvidia.com/gpu"] = head_gpus
+
+            else:
+                for k, v in resource.items():
+                    if k == "replicas" and i == 1:
+                        resource[k] = workers
+                    if k == "requests" or k == "limits":
+                        for spec, _ in v.items():
+                            if spec == "cpu":
+                                if k == "limits":
+                                    resource[k][spec] = max_cpu
+                                else:
+                                    resource[k][spec] = min_cpu
+                            if spec == "memory":
+                                if k == "limits":
+                                    resource[k][spec] = str(max_memory) + "G"
+                                else:
+                                    resource[k][spec] = str(min_memory) + "G"
+                            if spec == "nvidia.com/gpu":
+                                if i == 0:
+                                    resource[k][spec] = 0
+                                else:
+                                    resource[k][spec] = gpu
     else:
         sys.exit("Error: malformed template")
 
@@ -205,11 +221,15 @@ def update_nodes(
     instascale,
     env,
     image_pull_secrets,
+    head_cpus,
+    head_memory,
+    head_gpus,
 ):
     if "generictemplate" in item.keys():
         head = item.get("generictemplate").get("spec").get("headGroupSpec")
-        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
+        head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
 
+        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
         # Head counts as first worker
         worker["replicas"] = workers
         worker["minReplicas"] = workers
@@ -225,7 +245,9 @@ def update_nodes(
             update_env(spec, env)
             if comp == head:
                 # TODO: Eventually add head node configuration outside of template
-                continue
+                update_resources(
+                    spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
+                )
             else:
                 update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
 
@@ -350,6 +372,9 @@ def write_user_appwrapper(user_yaml, output_file_name):
 def generate_appwrapper(
     name: str,
     namespace: str,
+    head_cpus: int,
+    head_memory: int,
+    head_gpus: int,
     min_cpu: int,
     max_cpu: int,
     min_memory: int,
@@ -375,7 +400,16 @@ def generate_appwrapper(
     update_labels(user_yaml, instascale, instance_types)
     update_priority(user_yaml, item, dispatch_priority, priority_val)
     update_custompodresources(
-        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+        item,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        head_cpus,
+        head_memory,
+        head_gpus,
     )
     update_nodes(
         item,
@@ -390,6 +424,9 @@ def generate_appwrapper(
         instascale,
         env,
         image_pull_secrets,
+        head_cpus,
+        head_memory,
+        head_gpus,
     )
     update_dashboard_route(route_item, cluster_name, namespace)
     if local_interactive:
diff --git a/tests/unit_test.py b/tests/unit_test.py
index b046b1f13..4a8e2f441 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -525,6 +525,9 @@ def test_ray_details(mocker, capsys):
         worker_gpu=0,
         namespace="ns",
         dashboard="fake-uri",
+        head_cpus=2,
+        head_mem=8,
+        head_gpu=0,
     )
     mocker.patch(
         "codeflare_sdk.cluster.cluster.Cluster.status",
@@ -1685,6 +1688,9 @@ def test_cluster_status(mocker):
         worker_gpu=0,
         namespace="ns",
         dashboard="fake-uri",
+        head_cpus=2,
+        head_mem=8,
+        head_gpu=0,
     )
     cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
     mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
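
Usage sketch (not part of the diff): the new head_cpus, head_memory, and head_gpus fields on ClusterConfiguration size the Ray head pod, while the existing min_cpus/max_cpus, min_memory/max_memory, and num_gpus fields continue to size the workers. The snippet below is a minimal, hedged example of how a user might exercise these fields; the import paths, num_workers, and the up() call come from the SDK's existing public API rather than this diff, and the resource values are illustrative only.

    # Hypothetical usage of the head-node fields introduced in this change.
    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration

    config = ClusterConfiguration(
        name="demo",
        namespace="default",
        # New in this change: explicit head node sizing (defaults: 2 CPU, 8G memory, 0 GPU).
        head_cpus=4,
        head_memory=16,  # rendered as "16G" in the generated AppWrapper requests/limits
        head_gpus=0,     # also written into rayStartParams as num-gpus for the head
        # Existing worker sizing fields, unchanged by this diff.
        num_workers=2,   # assumed field from the SDK's existing ClusterConfiguration
        min_cpus=1,
        max_cpus=2,
        min_memory=4,
        max_memory=8,
        num_gpus=0,
    )

    # Constructing the Cluster generates the AppWrapper via create_app_wrapper();
    # cluster.up() would then submit it to the target namespace.
    cluster = Cluster(config)
    # cluster.up()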