Skip to content

Update for 0.4.2 release #90

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 24 additions & 50 deletions docs/cluster/cluster.html
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
Called upon cluster object creation, creates an AppWrapper yaml based on
the specifications of the ClusterConfiguration.
&#34;&#34;&#34;

if self.config.namespace is None:
self.config.namespace = oc.get_project_name()
if type(self.config.namespace) is not str:
raise TypeError(
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
)

name = self.config.name
namespace = self.config.namespace
min_cpu = self.config.min_cpus
Expand Down Expand Up @@ -317,26 +325,6 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
return to_return


def get_current_namespace() -&gt; str:
&#34;&#34;&#34;
Returns the user&#39;s current working namespace.
&#34;&#34;&#34;
try:
namespace = oc.invoke(&#34;project&#34;, [&#34;-q&#34;]).actions()[0].out.strip()
except oc.OpenShiftPythonException as osp: # pragma: no cover
error_msg = osp.result.err()
if (
&#34;do not have rights&#34; in error_msg
or &#34;Missing or incomplete configuration&#34; in error_msg
):
raise PermissionError(
&#34;Action not permitted, have you run auth.login() or cluster.up()?&#34;
)
else:
raise osp
return namespace


def list_all_clusters(namespace: str, print_to_console: bool = True):
&#34;&#34;&#34;
Returns (and prints by default) a list of all clusters in a given namespace.
Expand Down Expand Up @@ -537,35 +525,6 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="codeflare_sdk.cluster.cluster.get_current_namespace"><code class="name flex">
<span>def <span class="ident">get_current_namespace</span></span>(<span>) ‑> str</span>
</code></dt>
<dd>
<div class="desc"><p>Returns the user's current working namespace.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_current_namespace() -&gt; str:
&#34;&#34;&#34;
Returns the user&#39;s current working namespace.
&#34;&#34;&#34;
try:
namespace = oc.invoke(&#34;project&#34;, [&#34;-q&#34;]).actions()[0].out.strip()
except oc.OpenShiftPythonException as osp: # pragma: no cover
error_msg = osp.result.err()
if (
&#34;do not have rights&#34; in error_msg
or &#34;Missing or incomplete configuration&#34; in error_msg
):
raise PermissionError(
&#34;Action not permitted, have you run auth.login() or cluster.up()?&#34;
)
else:
raise osp
return namespace</code></pre>
</details>
</dd>
<dt id="codeflare_sdk.cluster.cluster.list_all_clusters"><code class="name flex">
<span>def <span class="ident">list_all_clusters</span></span>(<span>namespace: str, print_to_console: bool = True)</span>
</code></dt>
Expand Down Expand Up @@ -655,6 +614,14 @@ <h2 class="section-title" id="header-classes">Classes</h2>
Called upon cluster object creation, creates an AppWrapper yaml based on
the specifications of the ClusterConfiguration.
&#34;&#34;&#34;

if self.config.namespace is None:
self.config.namespace = oc.get_project_name()
if type(self.config.namespace) is not str:
raise TypeError(
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
)

name = self.config.name
namespace = self.config.namespace
min_cpu = self.config.min_cpus
Expand Down Expand Up @@ -942,6 +909,14 @@ <h3>Methods</h3>
Called upon cluster object creation, creates an AppWrapper yaml based on
the specifications of the ClusterConfiguration.
&#34;&#34;&#34;

if self.config.namespace is None:
self.config.namespace = oc.get_project_name()
if type(self.config.namespace) is not str:
raise TypeError(
f&#34;Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication.&#34;
)

name = self.config.name
namespace = self.config.namespace
min_cpu = self.config.min_cpus
Expand Down Expand Up @@ -1253,7 +1228,6 @@ <h1>Index</h1>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="codeflare_sdk.cluster.cluster.get_current_namespace" href="#codeflare_sdk.cluster.cluster.get_current_namespace">get_current_namespace</a></code></li>
<li><code><a title="codeflare_sdk.cluster.cluster.list_all_clusters" href="#codeflare_sdk.cluster.cluster.list_all_clusters">list_all_clusters</a></code></li>
<li><code><a title="codeflare_sdk.cluster.cluster.list_all_queued" href="#codeflare_sdk.cluster.cluster.list_all_queued">list_all_queued</a></code></li>
</ul>
Expand Down
7 changes: 4 additions & 3 deletions docs/cluster/config.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.config</code></h1>
from dataclasses import dataclass, field
from .auth import Authentication
import pathlib
import openshift

dir = pathlib.Path(__file__).parent.parent.resolve()

Expand All @@ -65,7 +66,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.config</code></h1>
&#34;&#34;&#34;

name: str
namespace: str = &#34;default&#34;
namespace: str = None
head_info: list = field(default_factory=list)
machine_types: list = field(default_factory=list) # [&#34;m4.xlarge&#34;, &#34;g4dn.xlarge&#34;]
min_cpus: int = 1
Expand All @@ -92,7 +93,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="codeflare_sdk.cluster.config.ClusterConfiguration"><code class="flex name class">
<span>class <span class="ident">ClusterConfiguration</span></span>
<span>(</span><span>name: str, namespace: str = 'default', head_info: list = &lt;factory&gt;, machine_types: list = &lt;factory&gt;, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = &lt;factory&gt;, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')</span>
<span>(</span><span>name: str, namespace: str = None, head_info: list = &lt;factory&gt;, machine_types: list = &lt;factory&gt;, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = &lt;factory&gt;, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')</span>
</code></dt>
<dd>
<div class="desc"><p>This dataclass is used to specify resource requirements and other details, and
Expand All @@ -108,7 +109,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
&#34;&#34;&#34;

name: str
namespace: str = &#34;default&#34;
namespace: str = None
head_info: list = field(default_factory=list)
machine_types: list = field(default_factory=list) # [&#34;m4.xlarge&#34;, &#34;g4dn.xlarge&#34;]
min_cpus: int = 1
Expand Down
37 changes: 32 additions & 5 deletions docs/job/jobs.html
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
from typing import TYPE_CHECKING, Optional, Dict, List
from pathlib import Path

import openshift as oc
from torchx.components.dist import ddp
from torchx.runner import get_runner
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
Expand Down Expand Up @@ -88,8 +89,10 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
max_retries: int = 0,
mounts: Optional[List[str]] = None,
rdzv_port: int = 29500,
rdzv_backend: str = None,
scheduler_args: Optional[Dict[str, str]] = None,
image: Optional[str] = None,
workspace: Optional[str] = f&#34;file://{Path.cwd()}&#34;,
):
if bool(script) == bool(m): # logical XOR
raise ValueError(
Expand All @@ -108,10 +111,12 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
self.max_retries = max_retries
self.mounts: List[str] = mounts if mounts is not None else []
self.rdzv_port = rdzv_port
self.rdzv_backend = rdzv_backend
self.scheduler_args: Dict[str, str] = (
scheduler_args if scheduler_args is not None else dict()
)
self.image = image
self.workspace = workspace

def _dry_run(self, cluster: &#34;Cluster&#34;):
j = f&#34;{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}&#34; # # of proc. = # of gpus
Expand All @@ -131,17 +136,23 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
env=self.env,
max_retries=self.max_retries,
rdzv_port=self.rdzv_port,
rdzv_backend=self.rdzv_backend
if self.rdzv_backend is not None
else &#34;static&#34;,
mounts=self.mounts,
),
scheduler=cluster.torchx_scheduler,
cfg=cluster.torchx_config(**self.scheduler_args),
workspace=f&#34;file://{Path.cwd()}&#34;,
workspace=self.workspace,
)

def _missing_spec(self, spec: str):
raise ValueError(f&#34;Job definition missing arg: {spec}&#34;)

def _dry_run_no_cluster(self):
if self.scheduler_args is not None:
if self.scheduler_args.get(&#34;namespace&#34;) is None:
self.scheduler_args[&#34;namespace&#34;] = oc.get_project_name()
return torchx_runner.dryrun(
app=ddp(
*self.script_args,
Expand All @@ -166,13 +177,16 @@ <h1 class="title">Module <code>codeflare_sdk.job.jobs</code></h1>
env=self.env, # should this still exist?
max_retries=self.max_retries,
rdzv_port=self.rdzv_port, # should this still exist?
rdzv_backend=self.rdzv_backend
if self.rdzv_backend is not None
else &#34;c10d&#34;,
mounts=self.mounts,
image=self.image
if self.image is not None
else self._missing_spec(&#34;image&#34;),
),
scheduler=&#34;kubernetes_mcad&#34;,
cfg=self.scheduler_args if self.scheduler_args is not None else None,
cfg=self.scheduler_args,
workspace=&#34;&#34;,
)

Expand Down Expand Up @@ -291,7 +305,7 @@ <h3>Methods</h3>
</dd>
<dt id="codeflare_sdk.job.jobs.DDPJobDefinition"><code class="flex name class">
<span>class <span class="ident">DDPJobDefinition</span></span>
<span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None)</span>
<span>(</span><span>script: Optional[str] = None, m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, gpu: Optional[int] = None, memMB: Optional[int] = None, h: Optional[str] = None, j: Optional[str] = None, env: Optional[Dict[str, str]] = None, max_retries: int = 0, mounts: Optional[List[str]] = None, rdzv_port: int = 29500, rdzv_backend: str = None, scheduler_args: Optional[Dict[str, str]] = None, image: Optional[str] = None, workspace: Optional[str] = 'file:///home/meyceoz/Documents/codeflare-sdk')</span>
</code></dt>
<dd>
<div class="desc"></div>
Expand All @@ -315,8 +329,10 @@ <h3>Methods</h3>
max_retries: int = 0,
mounts: Optional[List[str]] = None,
rdzv_port: int = 29500,
rdzv_backend: str = None,
scheduler_args: Optional[Dict[str, str]] = None,
image: Optional[str] = None,
workspace: Optional[str] = f&#34;file://{Path.cwd()}&#34;,
):
if bool(script) == bool(m): # logical XOR
raise ValueError(
Expand All @@ -335,10 +351,12 @@ <h3>Methods</h3>
self.max_retries = max_retries
self.mounts: List[str] = mounts if mounts is not None else []
self.rdzv_port = rdzv_port
self.rdzv_backend = rdzv_backend
self.scheduler_args: Dict[str, str] = (
scheduler_args if scheduler_args is not None else dict()
)
self.image = image
self.workspace = workspace

def _dry_run(self, cluster: &#34;Cluster&#34;):
j = f&#34;{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}&#34; # # of proc. = # of gpus
Expand All @@ -358,17 +376,23 @@ <h3>Methods</h3>
env=self.env,
max_retries=self.max_retries,
rdzv_port=self.rdzv_port,
rdzv_backend=self.rdzv_backend
if self.rdzv_backend is not None
else &#34;static&#34;,
mounts=self.mounts,
),
scheduler=cluster.torchx_scheduler,
cfg=cluster.torchx_config(**self.scheduler_args),
workspace=f&#34;file://{Path.cwd()}&#34;,
workspace=self.workspace,
)

def _missing_spec(self, spec: str):
raise ValueError(f&#34;Job definition missing arg: {spec}&#34;)

def _dry_run_no_cluster(self):
if self.scheduler_args is not None:
if self.scheduler_args.get(&#34;namespace&#34;) is None:
self.scheduler_args[&#34;namespace&#34;] = oc.get_project_name()
return torchx_runner.dryrun(
app=ddp(
*self.script_args,
Expand All @@ -393,13 +417,16 @@ <h3>Methods</h3>
env=self.env, # should this still exist?
max_retries=self.max_retries,
rdzv_port=self.rdzv_port, # should this still exist?
rdzv_backend=self.rdzv_backend
if self.rdzv_backend is not None
else &#34;c10d&#34;,
mounts=self.mounts,
image=self.image
if self.image is not None
else self._missing_spec(&#34;image&#34;),
),
scheduler=&#34;kubernetes_mcad&#34;,
cfg=self.scheduler_args if self.scheduler_args is not None else None,
cfg=self.scheduler_args,
workspace=&#34;&#34;,
)

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "codeflare-sdk"
version = "0.4.1"
version = "0.4.2"
description = "Python SDK for codeflare client"

license = "Apache-2.0"
Expand All @@ -25,4 +25,4 @@ openshift-client = "1.0.18"
rich = "^12.5"
ray = {version = "2.1.0", extras = ["default"]}
kubernetes = "26.1.0"
codeflare-torchx = "0.5.0.dev5"
codeflare-torchx = "0.6.0.dev0"