diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html index 59294e774..79916c47e 100644 --- a/docs/cluster/cluster.html +++ b/docs/cluster/cluster.html @@ -95,6 +95,14 @@

Module codeflare_sdk.cluster.cluster

Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration. """ + + if self.config.namespace is None: + self.config.namespace = oc.get_project_name() + if type(self.config.namespace) is not str: + raise TypeError( + f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." + ) + name = self.config.name namespace = self.config.namespace min_cpu = self.config.min_cpus @@ -317,26 +325,6 @@

Module codeflare_sdk.cluster.cluster

return to_return -def get_current_namespace() -> str: - """ - Returns the user's current working namespace. - """ - try: - namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - "do not have rights" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you run auth.login() or cluster.up()?" - ) - else: - raise osp - return namespace - - def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. @@ -537,35 +525,6 @@

Module codeflare_sdk.cluster.cluster

Functions

-
-def get_current_namespace() ‑> str -
-
-

Returns the user's current working namespace.

-
- -Expand source code - -
def get_current_namespace() -> str:
-    """
-    Returns the user's current working namespace.
-    """
-    try:
-        namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
-    except oc.OpenShiftPythonException as osp:  # pragma: no cover
-        error_msg = osp.result.err()
-        if (
-            "do not have rights" in error_msg
-            or "Missing or incomplete configuration" in error_msg
-        ):
-            raise PermissionError(
-                "Action not permitted, have you run auth.login() or cluster.up()?"
-            )
-        else:
-            raise osp
-    return namespace
-
-
def list_all_clusters(namespace: str, print_to_console: bool = True)
@@ -655,6 +614,14 @@

Classes

Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration. """ + + if self.config.namespace is None: + self.config.namespace = oc.get_project_name() + if type(self.config.namespace) is not str: + raise TypeError( + f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." + ) + name = self.config.name namespace = self.config.namespace min_cpu = self.config.min_cpus @@ -942,6 +909,14 @@

Methods

Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration. """ + + if self.config.namespace is None: + self.config.namespace = oc.get_project_name() + if type(self.config.namespace) is not str: + raise TypeError( + f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." + ) + name = self.config.name namespace = self.config.namespace min_cpu = self.config.min_cpus @@ -1253,7 +1228,6 @@

Index

  • Functions

    diff --git a/docs/cluster/config.html b/docs/cluster/config.html index a62eb0e6d..559876329 100644 --- a/docs/cluster/config.html +++ b/docs/cluster/config.html @@ -53,6 +53,7 @@

    Module codeflare_sdk.cluster.config

    from dataclasses import dataclass, field from .auth import Authentication import pathlib +import openshift dir = pathlib.Path(__file__).parent.parent.resolve() @@ -65,7 +66,7 @@

    Module codeflare_sdk.cluster.config

    """ name: str - namespace: str = "default" + namespace: str = None head_info: list = field(default_factory=list) machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] min_cpus: int = 1 @@ -92,7 +93,7 @@

    Classes

    class ClusterConfiguration -(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103') +(name: str, namespace: str = None, head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')

    This dataclass is used to specify resource requirements and other details, and @@ -108,7 +109,7 @@

    Classes

    """ name: str - namespace: str = "default" + namespace: str = None head_info: list = field(default_factory=list) machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] min_cpus: int = 1 diff --git a/docs/job/jobs.html b/docs/job/jobs.html index c851563ba..366ab862d 100644 --- a/docs/job/jobs.html +++ b/docs/job/jobs.html @@ -45,6 +45,7 @@

    Module codeflare_sdk.job.jobs

    from typing import TYPE_CHECKING, Optional, Dict, List from pathlib import Path +import openshift as oc from torchx.components.dist import ddp from torchx.runner import get_runner from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo @@ -88,8 +89,10 @@

    Module codeflare_sdk.job.jobs

    max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, + rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, + workspace: Optional[str] = f"file://{Path.cwd()}", ): if bool(script) == bool(m): # logical XOR raise ValueError( @@ -108,10 +111,12 @@

    Module codeflare_sdk.job.jobs

    self.max_retries = max_retries self.mounts: List[str] = mounts if mounts is not None else [] self.rdzv_port = rdzv_port + self.rdzv_backend = rdzv_backend self.scheduler_args: Dict[str, str] = ( scheduler_args if scheduler_args is not None else dict() ) self.image = image + self.workspace = workspace def _dry_run(self, cluster: "Cluster"): j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus @@ -131,17 +136,23 @@

    Module codeflare_sdk.job.jobs

    env=self.env, max_retries=self.max_retries, rdzv_port=self.rdzv_port, + rdzv_backend=self.rdzv_backend + if self.rdzv_backend is not None + else "static", mounts=self.mounts, ), scheduler=cluster.torchx_scheduler, cfg=cluster.torchx_config(**self.scheduler_args), - workspace=f"file://{Path.cwd()}", + workspace=self.workspace, ) def _missing_spec(self, spec: str): raise ValueError(f"Job definition missing arg: {spec}") def _dry_run_no_cluster(self): + if self.scheduler_args is not None: + if self.scheduler_args.get("namespace") is None: + self.scheduler_args["namespace"] = oc.get_project_name() return torchx_runner.dryrun( app=ddp( *self.script_args, @@ -166,13 +177,16 @@

    Module codeflare_sdk.job.jobs

    env=self.env, # should this still exist? max_retries=self.max_retries, rdzv_port=self.rdzv_port, # should this still exist? + rdzv_backend=self.rdzv_backend + if self.rdzv_backend is not None + else "c10d", mounts=self.mounts, image=self.image if self.image is not None else self._missing_spec("image"), ), scheduler="kubernetes_mcad", - cfg=self.scheduler_args if self.scheduler_args is not None else None, + cfg=self.scheduler_args, workspace="", ) @@ -291,7 +305,7 @@

    Methods

    class DDPJobDefinition -(script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None) +(script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, workspace: Optional[str] = 'file:///home/meyceoz/Documents/codeflare-sdk')
    @@ -315,8 +329,10 @@

    Methods

    max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, + rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, + workspace: Optional[str] = f"file://{Path.cwd()}", ): if bool(script) == bool(m): # logical XOR raise ValueError( @@ -335,10 +351,12 @@

    Methods

    self.max_retries = max_retries self.mounts: List[str] = mounts if mounts is not None else [] self.rdzv_port = rdzv_port + self.rdzv_backend = rdzv_backend self.scheduler_args: Dict[str, str] = ( scheduler_args if scheduler_args is not None else dict() ) self.image = image + self.workspace = workspace def _dry_run(self, cluster: "Cluster"): j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. = # of gpus @@ -358,17 +376,23 @@

    Methods

    env=self.env, max_retries=self.max_retries, rdzv_port=self.rdzv_port, + rdzv_backend=self.rdzv_backend + if self.rdzv_backend is not None + else "static", mounts=self.mounts, ), scheduler=cluster.torchx_scheduler, cfg=cluster.torchx_config(**self.scheduler_args), - workspace=f"file://{Path.cwd()}", + workspace=self.workspace, ) def _missing_spec(self, spec: str): raise ValueError(f"Job definition missing arg: {spec}") def _dry_run_no_cluster(self): + if self.scheduler_args is not None: + if self.scheduler_args.get("namespace") is None: + self.scheduler_args["namespace"] = oc.get_project_name() return torchx_runner.dryrun( app=ddp( *self.script_args, @@ -393,13 +417,16 @@

    Methods

    env=self.env, # should this still exist? max_retries=self.max_retries, rdzv_port=self.rdzv_port, # should this still exist? + rdzv_backend=self.rdzv_backend + if self.rdzv_backend is not None + else "c10d", mounts=self.mounts, image=self.image if self.image is not None else self._missing_spec("image"), ), scheduler="kubernetes_mcad", - cfg=self.scheduler_args if self.scheduler_args is not None else None, + cfg=self.scheduler_args, workspace="", ) diff --git a/pyproject.toml b/pyproject.toml index 72bef2f6e..223569db3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "codeflare-sdk" -version = "0.4.1" +version = "0.4.2" description = "Python SDK for codeflare client" license = "Apache-2.0" @@ -25,4 +25,4 @@ openshift-client = "1.0.18" rich = "^12.5" ray = {version = "2.1.0", extras = ["default"]} kubernetes = "26.1.0" -codeflare-torchx = "0.5.0.dev5" +codeflare-torchx = "0.6.0.dev0"