Skip to content

Commit 47098b4

Browse files
committed
[RHOAIENG-9004] Adjust existing test and workflow for GPU testing
1 parent 4673aeb commit 47098b4

File tree

7 files changed

+92
-47
lines changed

7 files changed

+92
-47
lines changed

.github/workflows/e2e_tests.yaml

+15-7
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ concurrency:
2727
jobs:
2828
kubernetes-e2e:
2929

30-
runs-on: ubuntu-20.04-4core
30+
runs-on: ubuntu-20.04-4core-gpu
3131

3232
steps:
3333
- name: Checkout code
@@ -52,33 +52,41 @@ jobs:
5252
with:
5353
token: ${{ secrets.GITHUB_TOKEN }}
5454

55+
- name: Setup NVidia GPU environment for KinD
56+
uses: ./common/github-actions/nvidia-gpu-setup
57+
5558
- name: Setup and start KinD cluster
5659
uses: ./common/github-actions/kind
5760

61+
- name: Install NVidia GPU operator for KinD
62+
uses: ./common/github-actions/nvidia-gpu-operator
63+
5864
- name: Deploy CodeFlare stack
5965
id: deploy
6066
run: |
6167
echo Setting up CodeFlare stack
6268
make setup-e2e
6369
6470
echo Deploying CodeFlare operator
65-
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
66-
make image-push -e IMG="${IMG}"
71+
IMG=localhost/codeflare-operator:test
72+
make image-build -e IMG="${IMG}"
73+
podman save -o cfo.tar ${IMG}
74+
kind load image-archive cfo.tar --name cluster --verbosity 1000
6775
make deploy -e IMG="${IMG}" -e ENV="e2e"
6876
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
6977
7078
- name: Run e2e tests
7179
run: |
72-
export CODEFLARE_TEST_TIMEOUT_SHORT=1m
73-
export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m
74-
export CODEFLARE_TEST_TIMEOUT_LONG=10m
80+
export CODEFLARE_TEST_TIMEOUT_SHORT=3m
81+
export CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
82+
export CODEFLARE_TEST_TIMEOUT_LONG=20m
7583
export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m
7684
7785
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
7886
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
7987
8088
set -euo pipefail
81-
go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
89+
go test -timeout 60m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
8290
8391
- name: Print CodeFlare operator logs
8492
if: always() && steps.deploy.outcome == 'success'

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# BEGIN -- workaround lack of go-toolset for golang 1.22
44

5-
ARG GOLANG_IMAGE=golang:1.22
5+
ARG GOLANG_IMAGE=docker.io/library/golang:1.22
66

77
ARG GOARCH=amd64
88

config/e2e/patch_resources.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
- op: remove
22
path: /spec/template/spec/containers/0/resources
3+
- op: replace
4+
path: /spec/template/spec/containers/0/imagePullPolicy
5+
value: IfNotPresent

test/e2e/mnist.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pytorch_lightning.callbacks.progress import TQDMProgressBar
2121
from torch import nn
2222
from torch.nn import functional as F
23-
from torch.utils.data import DataLoader, random_split, RandomSampler
23+
from torch.utils.data import DataLoader, random_split
2424
from torchmetrics import Accuracy
2525
from torchvision import transforms
2626
from torchvision.datasets import MNIST
@@ -36,6 +36,9 @@
3636
print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL"))
3737
MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL")
3838

39+
print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
40+
ACCELERATOR = os.getenv("ACCELERATOR")
41+
3942
class LitMNIST(LightningModule):
4043
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
4144

@@ -158,7 +161,7 @@ def setup(self, stage=None):
158161
)
159162

160163
def train_dataloader(self):
161-
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE, sampler=RandomSampler(self.mnist_train, num_samples=1000))
164+
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)
162165

163166
def val_dataloader(self):
164167
return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
@@ -176,13 +179,12 @@ def test_dataloader(self):
176179

177180
# Initialize a trainer
178181
trainer = Trainer(
179-
accelerator="auto",
182+
accelerator=ACCELERATOR,
180183
# devices=1 if torch.cuda.is_available() else None, # limiting for iPython runs
181184
max_epochs=3,
182185
callbacks=[TQDMProgressBar(refresh_rate=20)],
183186
num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
184187
devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
185-
replace_sampler_ddp=False,
186188
strategy="ddp",
187189
)
188190

test/e2e/mnist_pytorch_appwrapper_test.go

+10-2
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,17 @@ import (
3030
"k8s.io/apimachinery/pkg/runtime"
3131
)
3232

33+
func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
34+
runMnistPyTorchAppWrapper(t, "cpu")
35+
}
36+
37+
func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
38+
runMnistPyTorchAppWrapper(t, "gpu")
39+
}
40+
3341
// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
34-
func TestMNISTPyTorchAppWrapper(t *testing.T) {
42+
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) {
3543
test := With(t)
36-
test.T().Parallel()
3744

3845
// Create a namespace and localqueue in that namespace
3946
namespace := test.NewTestNamespace()
@@ -85,6 +92,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
8592
{Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()},
8693
{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
8794
{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
95+
{Name: "ACCELERATOR", Value: accelerator},
8896
},
8997
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
9098
VolumeMounts: []corev1.VolumeMount{

test/e2e/mnist_rayjob_raycluster_test.go

+53-31
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package e2e
1818

1919
import (
2020
"crypto/tls"
21+
"fmt"
2122
"net/http"
2223
"net/url"
2324
"testing"
@@ -36,9 +37,17 @@ import (
3637

3738
// Trains the MNIST dataset as a RayJob, executed by a Ray cluster
3839
// directly managed by Kueue, and asserts successful completion of the training job.
39-
func TestMNISTRayJobRayCluster(t *testing.T) {
40+
41+
func TestMnistRayJobRayClusterCpu(t *testing.T) {
42+
runMnistRayJobRayCluster(t, "cpu", 0)
43+
}
44+
45+
func TestMnistRayJobRayClusterGpu(t *testing.T) {
46+
runMnistRayJobRayCluster(t, "gpu", 1)
47+
}
48+
49+
func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) {
4050
test := With(t)
41-
test.T().Parallel()
4251

4352
// Create a namespace and localqueue in that namespace
4453
namespace := test.NewTestNamespace()
@@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
5160
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
5261

5362
// Create RayCluster and assign it to the localqueue
54-
rayCluster := constructRayCluster(test, namespace, mnist)
63+
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
5564
AssignToLocalQueue(rayCluster, localQueue)
5665
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
5766
test.Expect(err).NotTo(HaveOccurred())
@@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
6271
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
6372

6473
// Create RayJob
65-
rayJob := constructRayJob(test, namespace, rayCluster)
74+
rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
6675
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
6776
test.Expect(err).NotTo(HaveOccurred())
6877
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
8897
To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
8998
}
9099

100+
func TestMnistRayJobRayClusterAppWrapperCpu(t *testing.T) {
101+
runMnistRayJobRayClusterAppWrapper(t, "cpu", 0)
102+
}
103+
104+
func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) {
105+
runMnistRayJobRayClusterAppWrapper(t, "gpu", 1)
106+
}
107+
91108
// Same as runMnistRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
92-
func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
109+
func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
93110
test := With(t)
94-
test.T().Parallel()
95111

96112
// Create a namespace and localqueue in that namespace
97113
namespace := test.NewTestNamespace()
@@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
104120
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
105121

106122
// Create RayCluster, wrap in AppWrapper and assign to localqueue
107-
rayCluster := constructRayCluster(test, namespace, mnist)
123+
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
108124
aw := &mcadv1beta2.AppWrapper{
109125
TypeMeta: metav1.TypeMeta{
110126
APIVersion: mcadv1beta2.GroupVersion.String(),
@@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
140156
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
141157

142158
// Create RayJob
143-
rayJob := constructRayJob(test, namespace, rayCluster)
159+
rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
144160
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
145161
test.Expect(err).NotTo(HaveOccurred())
146162
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
183199
}
184200
}
185201

186-
func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap) *rayv1.RayCluster {
202+
func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap, numberOfGpus int) *rayv1.RayCluster {
187203
return &rayv1.RayCluster{
188204
TypeMeta: metav1.TypeMeta{
189205
APIVersion: rayv1.GroupVersion.String(),
@@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
236252
corev1.ResourceMemory: resource.MustParse("2G"),
237253
},
238254
},
239-
VolumeMounts: []corev1.VolumeMount{
240-
{
241-
Name: "mnist",
242-
MountPath: "/home/ray/jobs",
243-
},
244-
},
245-
},
246-
},
247-
Volumes: []corev1.Volume{
248-
{
249-
Name: "mnist",
250-
VolumeSource: corev1.VolumeSource{
251-
ConfigMap: &corev1.ConfigMapVolumeSource{
252-
LocalObjectReference: corev1.LocalObjectReference{
253-
Name: mnist.Name,
254-
},
255-
},
256-
},
257255
},
258256
},
259257
},
@@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
282280
Resources: corev1.ResourceRequirements{
283281
Requests: corev1.ResourceList{
284282
corev1.ResourceCPU: resource.MustParse("250m"),
285-
corev1.ResourceMemory: resource.MustParse("256Mi"),
283+
corev1.ResourceMemory: resource.MustParse("1G"),
284+
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
286285
},
287286
Limits: corev1.ResourceList{
288-
corev1.ResourceCPU: resource.MustParse("1"),
289-
corev1.ResourceMemory: resource.MustParse("2G"),
287+
corev1.ResourceCPU: resource.MustParse("2"),
288+
corev1.ResourceMemory: resource.MustParse("4G"),
289+
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
290+
},
291+
},
292+
VolumeMounts: []corev1.VolumeMount{
293+
{
294+
Name: "mnist",
295+
MountPath: "/home/ray/jobs",
296+
},
297+
},
298+
},
299+
},
300+
Volumes: []corev1.Volume{
301+
{
302+
Name: "mnist",
303+
VolumeSource: corev1.VolumeSource{
304+
ConfigMap: &corev1.ConfigMapVolumeSource{
305+
LocalObjectReference: corev1.LocalObjectReference{
306+
Name: mnist.Name,
307+
},
290308
},
291309
},
292310
},
@@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
299317
}
300318
}
301319

302-
func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster) *rayv1.RayJob {
320+
func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster, accelerator string, numberOfGpus int) *rayv1.RayJob {
303321
return &rayv1.RayJob{
304322
TypeMeta: metav1.TypeMeta{
305323
APIVersion: rayv1.GroupVersion.String(),
@@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
320338
MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `"
321339
PIP_INDEX_URL: "` + GetPipIndexURL() + `"
322340
PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `"
341+
ACCELERATOR: "` + accelerator + `"
323342
`,
324343
ClusterSelector: map[string]string{
325344
RayJobDefaultClusterSelectorKey: rayCluster.Name,
@@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
336355
},
337356
},
338357
},
358+
EntrypointNumCpus: 2,
359+
// Using EntrypointNumGpus doesn't seem to work properly on a KinD cluster with GPU; EntrypointNumCpus seems reliable
360+
EntrypointNumGpus: float32(numberOfGpus),
339361
},
340362
}
341363
}

test/e2e/setup.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,14 @@ metadata:
8484
spec:
8585
namespaceSelector: {} # match all.
8686
resourceGroups:
87-
- coveredResources: ["cpu","memory"]
87+
- coveredResources: ["cpu","memory", "nvidia.com/gpu"]
8888
flavors:
8989
- name: "default-flavor"
9090
resources:
9191
- name: "cpu"
9292
nominalQuota: 4
9393
- name: "memory"
94-
nominalQuota: "4G"
94+
nominalQuota: "20G"
95+
- name: "nvidia.com/gpu"
96+
nominalQuota: "1"
9597
EOF

0 commit comments

Comments
 (0)