Skip to content

Commit 4808aca

Browse files
make rdzv backend defaults reasonable
1 parent af6fe02 commit 4808aca

File tree

2 files changed: +8 -4 lines changed

2 files changed

+8
-4
lines changed

requirements.txt

+1 -1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ openshift-client==1.0.18
22
rich==12.5.1
33
ray[default]==2.1.0
44
kubernetes==26.1.0
5-
codeflare-torchx==0.5.0.dev5
5+
codeflare-torchx==0.6.0.dev0

src/codeflare_sdk/job/jobs.py

+7 -3
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def __init__(
6161
max_retries: int = 0,
6262
mounts: Optional[List[str]] = None,
6363
rdzv_port: int = 29500,
64-
rdzv_backend: str = "c10d",
64+
rdzv_backend: str = None,
6565
scheduler_args: Optional[Dict[str, str]] = None,
6666
image: Optional[str] = None,
6767
):
@@ -106,7 +106,9 @@ def _dry_run(self, cluster: "Cluster"):
106106
env=self.env,
107107
max_retries=self.max_retries,
108108
rdzv_port=self.rdzv_port,
109-
rdzv_backend=self.rdzv_backend,
109+
rdzv_backend=self.rdzv_backend
110+
if self.rdzv_backend is not None
111+
else "static",
110112
mounts=self.mounts,
111113
),
112114
scheduler=cluster.torchx_scheduler,
@@ -145,7 +147,9 @@ def _dry_run_no_cluster(self):
145147
env=self.env, # should this still exist?
146148
max_retries=self.max_retries,
147149
rdzv_port=self.rdzv_port, # should this still exist?
148-
rdzv_backend=self.rdzv_backend,
150+
rdzv_backend=self.rdzv_backend
151+
if self.rdzv_backend is not None
152+
else "c10d",
149153
mounts=self.mounts,
150154
image=self.image
151155
if self.image is not None

0 commit comments

Comments
 (0)