Skip to content

Commit 98767c7

Browse files
add priorities and schedulingSpec to SDK
1 parent 9c9e833 commit 98767c7

File tree

7 files changed

+64
-9
lines changed

7 files changed

+64
-9
lines changed

src/codeflare_sdk/cluster/cluster.py

+2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def create_app_wrapper(self):
8484
instascale = self.config.instascale
8585
instance_types = self.config.machine_types
8686
env = self.config.envs
87+
priority = self.config.priority
8788
return generate_appwrapper(
8889
name=name,
8990
namespace=namespace,
@@ -98,6 +99,7 @@ def create_app_wrapper(self):
9899
instascale=instascale,
99100
instance_types=instance_types,
100101
env=env,
102+
priority=priority,
101103
)
102104

103105
# creates a new cluster with the provided or default spec

src/codeflare_sdk/cluster/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,4 @@ class ClusterConfiguration:
4848
instascale: bool = False
4949
envs: dict = field(default_factory=dict)
5050
image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
51+
priority: str = "low"

src/codeflare_sdk/templates/new-template.yaml

+9-5
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ metadata:
55
namespace: default
66
#new addition
77
labels:
8-
orderedinstance: "m4.xlarge_g4dn.xlarge"
8+
orderedinstance: "m5.4xlarge_g4dn.xlarge"
99
spec:
10-
priority: 9
10+
priority: 1
11+
schedulingSpec:
12+
minAvailable: 2
1113
resources:
1214
Items: []
1315
GenericItems:
@@ -112,6 +114,7 @@ spec:
112114
operator: In
113115
values:
114116
- "aw-kuberay"
117+
priorityClassName: "low-priority"
115118
containers:
116119
# The Ray head pod
117120
- env:
@@ -182,6 +185,7 @@ spec:
182185
operator: In
183186
values:
184187
- "aw-kuberay"
188+
priorityClassName: "low-priority"
185189
initContainers:
186190
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
187191
- name: init-myservice
@@ -205,12 +209,12 @@ spec:
205209
limits:
206210
cpu: "2"
207211
memory: "12G"
208-
nvidia.com/gpu: "1"
212+
nvidia.com/gpu: "0"
209213
requests:
210214
cpu: "2"
211215
memory: "12G"
212-
nvidia.com/gpu: "1"
213-
- replica: 1
216+
nvidia.com/gpu: "0"
217+
- replicas: 1
214218
generictemplate:
215219
kind: Route
216220
apiVersion: route.openshift.io/v1

src/codeflare_sdk/utils/generate_yaml.py

+37
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,27 @@ def update_labels(yaml, instascale, instance_types):
7676
metadata.pop("labels")
7777

7878

79+
def update_priority(yaml, item, priority):
    """Stamp the requested priority onto the AppWrapper and its Ray pod templates.

    Args:
        yaml: Parsed AppWrapper document (a mapping). Its ``spec["priority"]``
            is set to the numeric level that corresponds to ``priority``.
        item: One generic item of the AppWrapper. When it carries a
            ``generictemplate``, the head group and every worker group pod
            template receive the matching ``priorityClassName``.
        priority: One of ``"low"``, ``"default"`` or ``"high"``.

    Exits the process through ``sys.exit`` with an explanatory message when
    ``priority`` is not one of the supported names; nothing is modified in
    that case.
    """
    # Single source of truth: name -> (AppWrapper priority, pod priorityClassName).
    priority_levels = {
        "low": (1, "low-priority"),
        "default": (5, "default-priority"),
        "high": (10, "high-priority"),
    }

    # Validate before touching either document so a bad value leaves them intact.
    if priority not in tuple(priority_levels):
        sys.exit("Priority must be 'low', 'default', or 'high'")

    appwrapper_priority, priority_class = priority_levels[priority]
    yaml.get("spec")["priority"] = appwrapper_priority

    if "generictemplate" not in item:
        return

    ray_spec = item["generictemplate"]["spec"]
    # Cover the head group plus *all* worker groups, not just the first one,
    # and tolerate a template that declares no worker groups at all.
    pod_groups = [ray_spec["headGroupSpec"]]
    pod_groups.extend(ray_spec.get("workerGroupSpecs") or [])
    for group in pod_groups:
        group["template"]["spec"]["priorityClassName"] = priority_class
98+
99+
79100
def update_custompodresources(
80101
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
81102
):
@@ -155,6 +176,11 @@ def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
155176
limits["nvidia.com/gpu"] = gpu
156177

157178

179+
def update_scheduling_spec(yaml, workers):
    """Set the AppWrapper's ``schedulingSpec.minAvailable`` to ``workers + 1``.

    Args:
        yaml: Parsed AppWrapper document (a mapping) whose ``spec`` is updated
            in place.
        workers: Number of worker pods requested; one is added to it to obtain
            ``minAvailable``.

    A template whose ``spec`` has no ``schedulingSpec`` section (or an empty
    one) gets the section created instead of failing with ``KeyError``.
    """
    spec = yaml.get("spec")
    # `or {}` also covers a bare `schedulingSpec:` key, which parses as None.
    scheduling_spec = spec.get("schedulingSpec") or {}
    scheduling_spec["minAvailable"] = workers + 1
    spec["schedulingSpec"] = scheduling_spec
182+
183+
158184
def update_nodes(
159185
item,
160186
appwrapper_name,
@@ -210,6 +236,7 @@ def generate_appwrapper(
210236
instascale: bool,
211237
instance_types: list,
212238
env,
239+
priority: str,
213240
):
214241
user_yaml = read_template(template)
215242
appwrapper_name, cluster_name = gen_names(name)
@@ -218,6 +245,8 @@ def generate_appwrapper(
218245
route_item = resources["resources"].get("GenericItems")[1]
219246
update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
220247
update_labels(user_yaml, instascale, instance_types)
248+
update_priority(user_yaml, item, priority)
249+
update_scheduling_spec(user_yaml, workers)
221250
update_custompodresources(
222251
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
223252
)
@@ -314,6 +343,12 @@ def main(): # pragma: no cover
314343
default="default",
315344
help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
316345
)
346+
parser.add_argument(
347+
"--priority",
348+
required=False,
349+
default="low",
350+
help="Set the priority of the cluster. Default is 'low'. Options are 'low', 'default', 'high'",
351+
)
317352

318353
args = parser.parse_args()
319354
name = args.name
@@ -328,6 +363,7 @@ def main(): # pragma: no cover
328363
instascale = args.instascale
329364
instance_types = args.instance_types
330365
namespace = args.namespace
366+
priority = args.priority
331367
env = {}
332368

333369
outfile = generate_appwrapper(
@@ -344,6 +380,7 @@ def main(): # pragma: no cover
344380
instascale,
345381
instance_types,
346382
env,
383+
priority,
347384
)
348385
return outfile
349386

tests/test-case-cmd.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ metadata:
44
name: unit-cmd-cluster
55
namespace: default
66
spec:
7-
priority: 9
7+
priority: 1
88
resources:
99
GenericItems:
1010
- custompodresources:
@@ -88,6 +88,7 @@ spec:
8888
cpu: 2
8989
memory: 8G
9090
nvidia.com/gpu: 0
91+
priorityClassName: low-priority
9192
rayVersion: 1.12.0
9293
workerGroupSpecs:
9394
- groupName: small-group-unit-cmd-cluster
@@ -136,6 +137,7 @@ spec:
136137
do echo waiting for myservice; sleep 2; done
137138
image: busybox:1.28
138139
name: init-myservice
140+
priorityClassName: low-priority
139141
replicas: 1
140142
- generictemplate:
141143
apiVersion: route.openshift.io/v1
@@ -151,5 +153,7 @@ spec:
151153
to:
152154
kind: Service
153155
name: unit-cmd-cluster-head-svc
154-
replica: 1
156+
replicas: 1
155157
Items: []
158+
schedulingSpec:
159+
minAvailable: 3

tests/test-case.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
name: unit-test-cluster
77
namespace: ns
88
spec:
9-
priority: 9
9+
priority: 1
1010
resources:
1111
GenericItems:
1212
- custompodresources:
@@ -99,6 +99,7 @@ spec:
9999
cpu: 2
100100
memory: 8G
101101
nvidia.com/gpu: 0
102+
priorityClassName: low-priority
102103
rayVersion: 1.12.0
103104
workerGroupSpecs:
104105
- groupName: small-group-unit-test-cluster
@@ -156,6 +157,7 @@ spec:
156157
do echo waiting for myservice; sleep 2; done
157158
image: busybox:1.28
158159
name: init-myservice
160+
priorityClassName: low-priority
159161
replicas: 1
160162
- generictemplate:
161163
apiVersion: route.openshift.io/v1
@@ -171,5 +173,7 @@ spec:
171173
to:
172174
kind: Service
173175
name: unit-test-cluster-head-svc
174-
replica: 1
176+
replicas: 1
175177
Items: []
178+
schedulingSpec:
179+
minAvailable: 3

tests/unit_test.py

+3
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ def test_config_creation():
212212
gpu=7,
213213
instascale=True,
214214
machine_types=["cpu.small", "gpu.large"],
215+
priority="low",
215216
)
216217

217218
assert config.name == "unit-test-cluster" and config.namespace == "ns"
@@ -226,11 +227,13 @@ def test_config_creation():
226227
assert config.template == f"{parent}/src/codeflare_sdk/templates/new-template.yaml"
227228
assert config.instascale
228229
assert config.machine_types == ["cpu.small", "gpu.large"]
230+
assert config.priority == "low"
229231
return config
230232

231233

232234
def test_cluster_creation():
233235
cluster = Cluster(test_config_creation())
236+
print(cluster.app_wrapper_yaml)
234237
assert cluster.app_wrapper_yaml == "unit-test-cluster.yaml"
235238
assert cluster.app_wrapper_name == "unit-test-cluster"
236239
assert filecmp.cmp(

0 commit comments

Comments
 (0)