Skip to content

Commit 72cace9

Browse files
sutaakar authored and openshift-merge-bot[bot] committed
Enable TestMNISTRayClusterSDK test
1 parent e95759a commit 72cace9

File tree

6 files changed

+107
-94
lines changed

6 files changed

+107
-94
lines changed

.github/actions/kind/action.yml

+16
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,12 @@
11
name: "Set up KinD"
22
description: "Step to start and configure KinD cluster"
33

4+
inputs:
5+
kind-node-hostname:
6+
description: "Hostname of the main kind node"
7+
required: false
8+
default: kind
9+
410
runs:
511
using: "composite"
612
steps:
@@ -56,3 +62,13 @@ runs:
5662
curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
5763
kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
5864
kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
65+
66+
- name: Add ${{ inputs.kind-node-hostname }} host to machine hosts
67+
shell: bash
68+
run: echo "127.0.0.1 ${{ inputs.kind-node-hostname }}" | sudo tee -a /etc/hosts
69+
70+
- name: Set env variables for tests to properly leverage KinD cluster
71+
shell: bash
72+
run: |
73+
echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
74+
echo "CLUSTER_HOSTNAME=${{ inputs.kind-node-hostname }}" >> $GITHUB_ENV

go.mod

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ go 1.19
55
require (
66
github.com/onsi/gomega v1.27.10
77
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
8-
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16
8+
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
99
github.com/project-codeflare/instascale v0.3.0
1010
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0
1111
github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1

go.sum

+2-2
Original file line number | Diff line number | Diff line change
@@ -391,8 +391,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
391391
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
392392
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
393393
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
394-
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16 h1:TRMLDP6IYt0CAd3+BkvY/r2lkpjI3sOsxf3tnQojZ9k=
395-
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
394+
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
395+
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
396396
github.com/project-codeflare/instascale v0.3.0 h1:PSlwbqqUsFTkTQ5KUhMFRebfokySnEZwav97xZixLQs=
397397
github.com/project-codeflare/instascale v0.3.0/go.mod h1:IU1Wl+zqTpMpZ49BOcr6U+A6gF3AjcmFdKo9ZwP3TDI=
398398
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0 h1:dU2Ev0SijdNm30Y9mjdKJL1Fp6l07rnRBKhSbx1kX9g=

test/e2e/mnist_raycluster_sdk.py

+23-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import sys
2+
import os
23

34
from time import sleep
45

@@ -8,17 +9,38 @@
89
from codeflare_sdk.job.jobs import DDPJobDefinition
910

1011
namespace = sys.argv[1]
12+
ray_image = os.getenv('RAY_IMAGE')
13+
host = os.getenv('CLUSTER_HOSTNAME')
14+
15+
ingress_options = {}
16+
if host is not None:
17+
ingress_options = {
18+
"ingresses": [
19+
{
20+
"ingressName": "ray-dashboard",
21+
"port": 8265,
22+
"pathType": "Prefix",
23+
"path": "/",
24+
"host": host,
25+
},
26+
]
27+
}
28+
1129

1230
cluster = Cluster(ClusterConfiguration(
1331
name='mnist',
1432
namespace=namespace,
1533
num_workers=1,
34+
head_cpus='500m',
35+
head_memory=2,
1636
min_cpus='500m',
1737
max_cpus=1,
1838
min_memory=0.5,
19-
max_memory=1,
39+
max_memory=2,
2040
num_gpus=0,
2141
instascale=False,
42+
image=ray_image,
43+
ingress_options=ingress_options,
2244
))
2345

2446
cluster.up()

test/e2e/mnist_raycluster_sdk_test.go

+63-88
Original file line number | Diff line number | Diff line change
@@ -40,104 +40,59 @@ func TestMNISTRayClusterSDK(t *testing.T) {
4040
test := With(t)
4141
test.T().Parallel()
4242

43-
// Currently blocked by https://github.com/project-codeflare/codeflare-sdk/pull/251 , remove the skip once SDK with the PR is released
44-
test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/251")
45-
4643
// Create a namespace
4744
namespace := test.NewTestNamespace()
4845

4946
// Test configuration
50-
config := &corev1.ConfigMap{
51-
TypeMeta: metav1.TypeMeta{
52-
APIVersion: corev1.SchemeGroupVersion.String(),
53-
Kind: "ConfigMap",
54-
},
55-
ObjectMeta: metav1.ObjectMeta{
56-
Name: "mnist-raycluster-sdk",
57-
Namespace: namespace.Name,
47+
config := CreateConfigMap(test, namespace.Name, map[string][]byte{
48+
// SDK script
49+
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
50+
// pip requirements
51+
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
52+
// MNIST training script
53+
"mnist.py": ReadFile(test, "mnist.py"),
54+
})
55+
56+
// Create RBAC, retrieve token for user with limited rights
57+
policyRules := []rbacv1.PolicyRule{
58+
{
59+
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
60+
APIGroups: []string{mcadv1beta1.GroupName},
61+
Resources: []string{"appwrappers"},
5862
},
59-
BinaryData: map[string][]byte{
60-
// SDK script
61-
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
62-
// pip requirements
63-
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
64-
// MNIST training script
65-
"mnist.py": ReadFile(test, "mnist.py"),
63+
{
64+
Verbs: []string{"get", "list"},
65+
APIGroups: []string{rayv1.GroupVersion.Group},
66+
Resources: []string{"rayclusters", "rayclusters/status"},
6667
},
67-
Immutable: Ptr(true),
68-
}
69-
config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
70-
test.Expect(err).NotTo(HaveOccurred())
71-
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)
72-
73-
// SDK client RBAC
74-
serviceAccount := &corev1.ServiceAccount{
75-
TypeMeta: metav1.TypeMeta{
76-
APIVersion: corev1.SchemeGroupVersion.String(),
77-
Kind: "ServiceAccount",
68+
{
69+
Verbs: []string{"get", "list"},
70+
APIGroups: []string{"route.openshift.io"},
71+
Resources: []string{"routes"},
7872
},
79-
ObjectMeta: metav1.ObjectMeta{
80-
Name: "sdk-user",
81-
Namespace: namespace.Name,
73+
{
74+
Verbs: []string{"get", "list"},
75+
APIGroups: []string{"networking.k8s.io"},
76+
Resources: []string{"ingresses"},
8277
},
8378
}
84-
serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{})
85-
test.Expect(err).NotTo(HaveOccurred())
8679

87-
role := &rbacv1.Role{
88-
TypeMeta: metav1.TypeMeta{
89-
APIVersion: rbacv1.SchemeGroupVersion.String(),
90-
Kind: "Role",
91-
},
92-
ObjectMeta: metav1.ObjectMeta{
93-
Name: "sdk",
94-
Namespace: namespace.Name,
95-
},
96-
Rules: []rbacv1.PolicyRule{
97-
{
98-
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
99-
APIGroups: []string{mcadv1beta1.GroupName},
100-
Resources: []string{"appwrappers"},
101-
},
102-
{
103-
Verbs: []string{"get", "list"},
104-
APIGroups: []string{rayv1.GroupVersion.Group},
105-
Resources: []string{"rayclusters", "rayclusters/status"},
106-
},
107-
{
108-
Verbs: []string{"get", "list"},
109-
APIGroups: []string{"route.openshift.io"},
110-
Resources: []string{"routes"},
111-
},
80+
// Create cluster wide RBAC, required for SDK OpenShift check
81+
// TODO: reevaluate once the SDK changes its OpenShift detection logic
82+
clusterPolicyRules := []rbacv1.PolicyRule{
83+
{
84+
Verbs: []string{"get", "list"},
85+
APIGroups: []string{"config.openshift.io"},
86+
Resources: []string{"ingresses"},
87+
ResourceNames: []string{"cluster"},
11288
},
11389
}
114-
role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{})
115-
test.Expect(err).NotTo(HaveOccurred())
11690

117-
roleBinding := &rbacv1.RoleBinding{
118-
TypeMeta: metav1.TypeMeta{
119-
APIVersion: rbacv1.SchemeGroupVersion.String(),
120-
Kind: "RoleBinding",
121-
},
122-
ObjectMeta: metav1.ObjectMeta{
123-
Name: "sdk",
124-
},
125-
RoleRef: rbacv1.RoleRef{
126-
APIGroup: rbacv1.SchemeGroupVersion.Group,
127-
Kind: "Role",
128-
Name: role.Name,
129-
},
130-
Subjects: []rbacv1.Subject{
131-
{
132-
Kind: "ServiceAccount",
133-
APIGroup: corev1.SchemeGroupVersion.Group,
134-
Name: serviceAccount.Name,
135-
Namespace: serviceAccount.Namespace,
136-
},
137-
},
138-
}
139-
_, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{})
140-
test.Expect(err).NotTo(HaveOccurred())
91+
sa := CreateServiceAccount(test, namespace.Name)
92+
role := CreateRole(test, namespace.Name, policyRules)
93+
CreateRoleBinding(test, namespace.Name, sa, role)
94+
clusterRole := CreateClusterRole(test, clusterPolicyRules)
95+
CreateClusterRoleBinding(test, sa, clusterRole)
14196

14297
job := &batchv1.Job{
14398
TypeMeta: metav1.TypeMeta{
@@ -161,7 +116,8 @@ func TestMNISTRayClusterSDK(t *testing.T) {
161116
// See https://github.com/project-codeflare/codeflare-sdk/pull/146
162117
Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e",
163118
Env: []corev1.EnvVar{
164-
corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"},
119+
{Name: "PYTHONUSERBASE", Value: "/workdir"},
120+
{Name: "RAY_IMAGE", Value: GetRayImage()},
165121
},
166122
Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name},
167123
VolumeMounts: []corev1.VolumeMount{
@@ -206,12 +162,31 @@ func TestMNISTRayClusterSDK(t *testing.T) {
206162
},
207163
},
208164
RestartPolicy: corev1.RestartPolicyNever,
209-
ServiceAccountName: serviceAccount.Name,
165+
ServiceAccountName: sa.Name,
210166
},
211167
},
212168
},
213169
}
214-
job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
170+
if GetClusterType(test) == KindCluster {
171+
// Take first KinD node and redirect pod hostname requests there
172+
node := GetNodes(test)[0]
173+
hostname := GetClusterHostname(test)
174+
IP := GetNodeInternalIP(test, node)
175+
176+
test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP)
177+
job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{
178+
{
179+
IP: IP,
180+
Hostnames: []string{hostname},
181+
},
182+
}
183+
184+
// Propagate hostname into Python code as env variable
185+
hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname}
186+
job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar)
187+
}
188+
189+
job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
215190
test.Expect(err).NotTo(HaveOccurred())
216191
test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name)
217192

test/e2e/mnist_rayjob_mcad_raycluster_test.go

+2-2
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
108108
},
109109
Limits: corev1.ResourceList{
110110
corev1.ResourceCPU: resource.MustParse("1"),
111-
corev1.ResourceMemory: resource.MustParse("1G"),
111+
corev1.ResourceMemory: resource.MustParse("2G"),
112112
},
113113
},
114114
VolumeMounts: []corev1.VolumeMount{
@@ -168,7 +168,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
168168
},
169169
Limits: corev1.ResourceList{
170170
corev1.ResourceCPU: resource.MustParse("1"),
171-
corev1.ResourceMemory: resource.MustParse("1G"),
171+
corev1.ResourceMemory: resource.MustParse("2G"),
172172
},
173173
},
174174
},

0 commit comments

Comments
 (0)