Skip to content

Commit 1fe04c3

Browse files
authored
Adding AppWrapper from File System Management (#120)
* Adding AppWrapper from FS Management * Added remaining docstrings * Added unit tests for full coverage * notebook bugfix
1 parent 3527061 commit 1fe04c3

File tree

6 files changed

+333
-4
lines changed

6 files changed

+333
-4
lines changed

demo-notebooks/guided-demos/3_basic_interactive.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@
139139
"# establish connection to ray cluster\n",
140140
"\n",
141141
"#install additionall libraries that will be required for model training\n",
142-
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n",
142+
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
143143
"\n",
144144
"ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n",
145145
"\n",

demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@
231231
"# establish connection to ray cluster\n",
232232
"\n",
233233
"#install additionall libraries that will be required for model training\n",
234-
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n",
234+
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
235235
"\n",
236236
"ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n",
237237
"\n",

src/codeflare_sdk/cluster/awload.py

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright 2022 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
The awload sub-module contains the definition of the AWManager object, which handles
17+
submission and deletion of existing AppWrappers from a user's file system.
18+
"""
19+
20+
from os.path import isfile
21+
import errno
22+
import os
23+
import openshift as oc
24+
import yaml
25+
26+
27+
class AWManager:
28+
"""
29+
An object for submitting and removing existing AppWrapper yamls
30+
to be added to the MCAD queue.
31+
"""
32+
33+
def __init__(self, filename: str) -> None:
34+
"""
35+
Create the AppWrapper Manager object by passing in an
36+
AppWrapper yaml file
37+
"""
38+
if not isfile(filename):
39+
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
40+
self.filename = filename
41+
try:
42+
with open(self.filename) as f:
43+
awyaml = yaml.load(f, Loader=yaml.FullLoader)
44+
assert awyaml["kind"] == "AppWrapper"
45+
self.name = awyaml["metadata"]["name"]
46+
self.namespace = awyaml["metadata"]["namespace"]
47+
except:
48+
raise ValueError(
49+
f"{filename } is not a correctly formatted AppWrapper yaml"
50+
)
51+
self.submitted = False
52+
53+
def submit(self) -> None:
54+
"""
55+
Attempts to create the AppWrapper custom resource using the yaml file
56+
"""
57+
try:
58+
with oc.project(self.namespace):
59+
oc.invoke("create", ["-f", self.filename])
60+
except oc.OpenShiftPythonException as osp: # pragma: no cover
61+
error_msg = osp.result.err()
62+
if "Unauthorized" in error_msg or "Forbidden" in error_msg:
63+
raise PermissionError(
64+
"Action not permitted, have you put in correct/up-to-date auth credentials?"
65+
)
66+
elif "AlreadyExists" in error_msg:
67+
raise FileExistsError(
68+
f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}"
69+
)
70+
raise osp
71+
72+
self.submitted = True
73+
print(f"AppWrapper {self.filename} submitted!")
74+
75+
def remove(self) -> None:
76+
"""
77+
Attempts to delete the AppWrapper custom resource matching the name in the yaml,
78+
if submitted by this manager.
79+
"""
80+
if not self.submitted:
81+
print("AppWrapper not submitted by this manager yet, nothing to remove")
82+
return
83+
84+
try:
85+
with oc.project(self.namespace):
86+
oc.invoke("delete", ["AppWrapper", self.name])
87+
except oc.OpenShiftPythonException as osp: # pragma: no cover
88+
error_msg = osp.result.err()
89+
if (
90+
'the server doesn\'t have a resource type "AppWrapper"' in error_msg
91+
or "forbidden" in error_msg
92+
or "Unauthorized" in error_msg
93+
or "Missing or incomplete configuration" in error_msg
94+
):
95+
raise PermissionError(
96+
"Action not permitted, have you put in correct/up-to-date auth credentials?"
97+
)
98+
elif "not found" in error_msg:
99+
self.submitted = False
100+
print("AppWrapper not found, was deleted in another manner")
101+
return
102+
else:
103+
raise osp
104+
105+
self.submitted = False
106+
print(f"AppWrapper {self.name} removed!")

src/codeflare_sdk/cluster/cluster.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def down(self):
136136
or "Missing or incomplete configuration" in error_msg
137137
):
138138
raise PermissionError(
139-
"Action not permitted, have you run cluster.up() yet?"
139+
"Action not permitted, have you run auth.login()/cluster.up() yet?"
140140
)
141141
elif "not found" in error_msg:
142142
print("Cluster not found, have you run cluster.up() yet?")

tests/test-case-bad.yaml

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
apiVersion: mcad.ibm.com/v1beta1
2+
kind: AppsWrapper
3+
metadata:
4+
labels:
5+
orderedinstance: cpu.small_gpu.large
6+
nam: unit-test-cluster
7+
namspace: ns
8+
spec:
9+
priority: 9
10+
resources:
11+
GenericItems:
12+
- custompodresources:
13+
- limits:
14+
cpu: 2
15+
memory: 8G
16+
nvidia.com/gpu: 0
17+
replicas: 1
18+
requests:
19+
cpu: 2
20+
memory: 8G
21+
nvidia.com/gpu: 0
22+
- limits:
23+
cpu: 4
24+
memory: 6G
25+
nvidia.com/gpu: 7
26+
replicas: 2
27+
requests:
28+
cpu: 3
29+
memory: 5G
30+
nvidia.com/gpu: 7
31+
generictemplate:
32+
apiVersion: ray.io/v1alpha1
33+
kind: RayCluster
34+
metadata:
35+
labels:
36+
appwrapper.mcad.ibm.com: unit-test-cluster
37+
controller-tools.k8s.io: '1.0'
38+
name: unit-test-cluster
39+
namespace: ns
40+
spec:
41+
autoscalerOptions:
42+
idleTimeoutSeconds: 60
43+
imagePullPolicy: Always
44+
resources:
45+
limits:
46+
cpu: 500m
47+
memory: 512Mi
48+
requests:
49+
cpu: 500m
50+
memory: 512Mi
51+
upscalingMode: Default
52+
enableInTreeAutoscaling: false
53+
headGroupSpec:
54+
rayStartParams:
55+
block: 'true'
56+
dashboard-host: 0.0.0.0
57+
num-gpus: '0'
58+
serviceType: ClusterIP
59+
template:
60+
spec:
61+
affinity:
62+
nodeAffinity:
63+
requiredDuringSchedulingIgnoredDuringExecution:
64+
nodeSelectorTerms:
65+
- matchExpressions:
66+
- key: unit-test-cluster
67+
operator: In
68+
values:
69+
- unit-test-cluster
70+
containers:
71+
- env:
72+
- name: MY_POD_IP
73+
valueFrom:
74+
fieldRef:
75+
fieldPath: status.podIP
76+
image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
77+
imagePullPolicy: Always
78+
lifecycle:
79+
preStop:
80+
exec:
81+
command:
82+
- /bin/sh
83+
- -c
84+
- ray stop
85+
name: ray-head
86+
ports:
87+
- containerPort: 6379
88+
name: gcs
89+
- containerPort: 8265
90+
name: dashboard
91+
- containerPort: 10001
92+
name: client
93+
resources:
94+
limits:
95+
cpu: 2
96+
memory: 8G
97+
nvidia.com/gpu: 0
98+
requests:
99+
cpu: 2
100+
memory: 8G
101+
nvidia.com/gpu: 0
102+
rayVersion: 1.12.0
103+
workerGroupSpecs:
104+
- groupName: small-group-unit-test-cluster
105+
maxReplicas: 2
106+
minReplicas: 2
107+
rayStartParams:
108+
block: 'true'
109+
num-gpus: '7'
110+
replicas: 2
111+
template:
112+
metadata:
113+
annotations:
114+
key: value
115+
labels:
116+
key: value
117+
spec:
118+
affinity:
119+
nodeAffinity:
120+
requiredDuringSchedulingIgnoredDuringExecution:
121+
nodeSelectorTerms:
122+
- matchExpressions:
123+
- key: unit-test-cluster
124+
operator: In
125+
values:
126+
- unit-test-cluster
127+
containers:
128+
- env:
129+
- name: MY_POD_IP
130+
valueFrom:
131+
fieldRef:
132+
fieldPath: status.podIP
133+
image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
134+
lifecycle:
135+
preStop:
136+
exec:
137+
command:
138+
- /bin/sh
139+
- -c
140+
- ray stop
141+
name: machine-learning
142+
resources:
143+
limits:
144+
cpu: 4
145+
memory: 6G
146+
nvidia.com/gpu: 7
147+
requests:
148+
cpu: 3
149+
memory: 5G
150+
nvidia.com/gpu: 7
151+
initContainers:
152+
- command:
153+
- sh
154+
- -c
155+
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
156+
do echo waiting for myservice; sleep 2; done
157+
image: busybox:1.28
158+
name: init-myservice
159+
replicas: 1
160+
- generictemplate:
161+
apiVersion: route.openshift.io/v1
162+
kind: Route
163+
metadata:
164+
labels:
165+
odh-ray-cluster-service: unit-test-cluster-head-svc
166+
name: ray-dashboard-unit-test-cluster
167+
namespace: ns
168+
spec:
169+
port:
170+
targetPort: dashboard
171+
to:
172+
kind: Service
173+
name: unit-test-cluster-head-svc
174+
replica: 1
175+
Items: []

tests/unit_test.py

+49-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
parent = Path(__file__).resolve().parents[1]
2222
sys.path.append(str(parent) + "/src")
2323

24+
from codeflare_sdk.cluster.awload import AWManager
2425
from codeflare_sdk.cluster.cluster import (
2526
Cluster,
2627
ClusterConfiguration,
@@ -1932,7 +1933,54 @@ def parse_j(cmd):
19321933
return f"{max_worker}x{gpu}"
19331934

19341935

1935-
# Make sure to keep this function and the efollowing function at the end of the file
1936+
def test_AWManager_creation():
    """
    Check AWManager construction for a valid file, a missing file and a
    malformed AppWrapper yaml.

    Relies on a ``test.yaml`` AppWrapper being readable from the working
    directory (presumably produced elsewhere in this test suite; confirm)
    and on ``tests/test-case-bad.yaml``.
    """
    testaw = AWManager("test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted is False

    # A path that does not exist must be rejected; the `else` branch makes the
    # test fail if the constructor unexpectedly succeeds.
    try:
        AWManager("fake")
    except FileNotFoundError as e:
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    else:
        raise AssertionError("AWManager accepted a path that does not exist")

    # A yaml whose kind/metadata are wrong must be rejected with ValueError.
    try:
        AWManager("tests/test-case-bad.yaml")
    except ValueError as e:
        assert (
            str(e)
            == "tests/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )
    else:
        raise AssertionError("AWManager accepted a malformed AppWrapper yaml")
1954+
1955+
1956+
def arg_check_aw_create_effect(*args):
    """Mock side effect: verify the positional arguments of a `create -f test.yaml` call."""
    verb = args[0]
    assert verb == "create"
    cli_arguments = args[1]
    assert cli_arguments == ["-f", "test.yaml"]
1959+
1960+
1961+
def arg_check_aw_delete_effect(*args):
    """Mock side effect: verify the positional arguments of a `delete AppWrapper test` call."""
    verb = args[0]
    assert verb == "delete"
    cli_arguments = args[1]
    assert cli_arguments == ["AppWrapper", "test"]
1964+
1965+
1966+
def test_AWManager_submit_remove(mocker, capsys):
    """
    Walk an AWManager through remove-before-submit, submit, then remove,
    with `openshift.invoke` patched to argument-checking side effects.
    """
    manager = AWManager("test.yaml")

    # Removing before anything was submitted only prints a notice.
    manager.remove()
    printed = capsys.readouterr().out
    expected_notice = "AppWrapper not submitted by this manager yet, nothing to remove\n"
    assert printed == expected_notice
    assert manager.submitted is False

    # Submitting flips the flag once the patched `create` call has run.
    mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect)
    manager.submit()
    assert manager.submitted is True

    # Removing afterwards clears the flag via the patched `delete` call.
    mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect)
    manager.remove()
    assert manager.submitted is False
1981+
1982+
1983+
# Make sure to keep this function and the following function at the end of the file
19361984
def test_cmd_line_generation():
19371985
os.system(
19381986
f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/new-template.yaml"

0 commit comments

Comments
 (0)