Skip to content

Commit 035e935

Browse files
no-jira: don't allow ttl_seconds_after_finished for longlived clusters
1 parent dff72be commit 035e935

File tree

2 files changed: 52 additions (+52) and 5 deletions (-5)

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,15 @@ def __init__(
118118
"to specify which existing cluster to use."
119119
)
120120

121+
if cluster_name is not None and ttl_seconds_after_finished != 0:
122+
raise ValueError(
123+
"❌ Configuration Error: 'ttl_seconds_after_finished' cannot be set when targeting "
124+
"an existing cluster (via 'cluster_name').\n"
125+
"TTL controls automatic cleanup of RayJob-managed clusters, which only applies "
126+
"when creating a new cluster via 'cluster_config'.\n"
127+
"For existing clusters, the RayJob CR will remain after completion for inspection."
128+
)
129+
121130
self.name = job_name
122131
self.entrypoint = entrypoint
123132

src/codeflare_sdk/ray/rayjobs/test/test_rayjob.py

Lines changed: 43 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,42 @@ def test_rayjob_init_validation_neither_provided(auto_mock_setup):
105105
RayJob(job_name="test-job", entrypoint="python test.py")
106106

107107

108+
def test_rayjob_init_validation_ttl_with_existing_cluster(auto_mock_setup):
109+
"""
110+
Test that providing ttl_seconds_after_finished with cluster_name raises error.
111+
TTL can only be set when creating a new cluster via cluster_config.
112+
"""
113+
with pytest.raises(
114+
ValueError,
115+
match="❌ Configuration Error: 'ttl_seconds_after_finished' cannot be set when targeting "
116+
"an existing cluster \\(via 'cluster_name'\\)",
117+
):
118+
RayJob(
119+
job_name="test-job",
120+
cluster_name="existing-cluster",
121+
entrypoint="python test.py",
122+
ttl_seconds_after_finished=300,
123+
)
124+
125+
126+
def test_rayjob_init_ttl_zero_with_existing_cluster_allowed(auto_mock_setup):
127+
"""
128+
Test that ttl_seconds_after_finished=0 is allowed with cluster_name.
129+
The validation only checks for non-zero TTL values.
130+
"""
131+
rayjob = RayJob(
132+
job_name="test-job",
133+
cluster_name="existing-cluster",
134+
entrypoint="python test.py",
135+
ttl_seconds_after_finished=0,
136+
namespace="test-namespace",
137+
)
138+
139+
assert rayjob.name == "test-job"
140+
assert rayjob.cluster_name == "existing-cluster"
141+
assert rayjob.ttl_seconds_after_finished == 0
142+
143+
108144
def test_rayjob_init_with_cluster_config(auto_mock_setup):
109145
"""
110146
Test RayJob initialization with cluster configuration for auto-creation.
@@ -245,7 +281,6 @@ def test_build_rayjob_cr_with_existing_cluster(auto_mock_setup):
245281
cluster_name="existing-cluster",
246282
namespace="test-namespace",
247283
entrypoint="python main.py",
248-
ttl_seconds_after_finished=300,
249284
)
250285

251286
rayjob_cr = rayjob._build_rayjob_cr()
@@ -256,7 +291,6 @@ def test_build_rayjob_cr_with_existing_cluster(auto_mock_setup):
256291
spec = rayjob_cr["spec"]
257292
assert spec["entrypoint"] == "python main.py"
258293
assert spec["shutdownAfterJobFinishes"] is False
259-
assert spec["ttlSecondsAfterFinished"] == 300
260294

261295
assert spec["clusterSelector"]["ray.io/cluster"] == "existing-cluster"
262296
assert "rayClusterSpec" not in spec
@@ -526,12 +560,14 @@ def test_rayjob_with_runtime_env_dict(auto_mock_setup):
526560
def test_rayjob_with_active_deadline_and_ttl(auto_mock_setup):
527561
"""
528562
Test RayJob with both active deadline and TTL settings.
563+
Note: TTL can only be set when creating a new cluster (via cluster_config).
529564
"""
530565

566+
cluster_config = ManagedClusterConfig()
531567
rayjob = RayJob(
532568
job_name="test-job",
533569
entrypoint="python -c 'print()'",
534-
cluster_name="test-cluster",
570+
cluster_config=cluster_config,
535571
active_deadline_seconds=300,
536572
ttl_seconds_after_finished=600,
537573
namespace="test-namespace",
@@ -594,11 +630,13 @@ def test_rayjob_error_handling_invalid_cluster_config(auto_mock_setup):
594630
def test_rayjob_constructor_parameter_validation(auto_mock_setup):
595631
"""
596632
Test constructor parameter validation.
633+
Note: TTL can only be set when creating a new cluster (via cluster_config).
597634
"""
635+
cluster_config = ManagedClusterConfig()
598636
rayjob = RayJob(
599637
job_name="test-job",
600638
entrypoint="python -c 'print()'",
601-
cluster_name="test-cluster",
639+
cluster_config=cluster_config,
602640
namespace="test-ns",
603641
runtime_env=RuntimeEnv(pip=["numpy"]),
604642
ttl_seconds_after_finished=300,
@@ -607,7 +645,7 @@ def test_rayjob_constructor_parameter_validation(auto_mock_setup):
607645

608646
assert rayjob.name == "test-job"
609647
assert rayjob.entrypoint == "python -c 'print()'"
610-
assert rayjob.cluster_name == "test-cluster"
648+
assert rayjob.cluster_name == "test-job-cluster" # Generated from job name
611649
assert rayjob.namespace == "test-ns"
612650
# Check that runtime_env is a RuntimeEnv object and contains pip dependencies
613651
assert isinstance(rayjob.runtime_env, RuntimeEnv)

0 commit comments

Comments
 (0)