Skip to content

Commit 1916b7f

Browse files
committed
[RHOAIENG-9004] Adjust existing test and workflow for GPU testing
1 parent 4673aeb commit 1916b7f

File tree

7 files changed

+37
-19
lines changed

7 files changed

+37
-19
lines changed

.github/workflows/e2e_tests.yaml

+15-7
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ concurrency:
2727
jobs:
2828
kubernetes-e2e:
2929

30-
runs-on: ubuntu-20.04-4core
30+
runs-on: ubuntu-20.04-4core-gpu
3131

3232
steps:
3333
- name: Checkout code
@@ -52,33 +52,41 @@ jobs:
5252
with:
5353
token: ${{ secrets.GITHUB_TOKEN }}
5454

55+
- name: Setup NVidia GPU environment for KinD
56+
uses: ./common/github-actions/nvidia-gpu-setup
57+
5558
- name: Setup and start KinD cluster
5659
uses: ./common/github-actions/kind
5760

61+
- name: Install NVidia GPU operator for KinD
62+
uses: ./common/github-actions/nvidia-gpu-operator
63+
5864
- name: Deploy CodeFlare stack
5965
id: deploy
6066
run: |
6167
echo Setting up CodeFlare stack
6268
make setup-e2e
6369
6470
echo Deploying CodeFlare operator
65-
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
66-
make image-push -e IMG="${IMG}"
71+
IMG=localhost/codeflare-operator:test
72+
make image-build -e IMG="${IMG}"
73+
podman save -o cfo.tar ${IMG}
74+
kind load image-archive cfo.tar --name cluster --verbosity 1000
6775
make deploy -e IMG="${IMG}" -e ENV="e2e"
6876
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
6977
7078
- name: Run e2e tests
7179
run: |
72-
export CODEFLARE_TEST_TIMEOUT_SHORT=1m
73-
export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m
74-
export CODEFLARE_TEST_TIMEOUT_LONG=10m
80+
export CODEFLARE_TEST_TIMEOUT_SHORT=3m
81+
export CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
82+
export CODEFLARE_TEST_TIMEOUT_LONG=20m
7583
export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m
7684
7785
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
7886
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
7987
8088
set -euo pipefail
81-
go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
89+
go test -timeout 60m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
8290
8391
- name: Print CodeFlare operator logs
8492
if: always() && steps.deploy.outcome == 'success'

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# BEGIN -- workaround lack of go-toolset for golang 1.22
44

5-
ARG GOLANG_IMAGE=golang:1.22
5+
ARG GOLANG_IMAGE=docker.io/library/golang:1.22
66

77
ARG GOARCH=amd64
88

config/e2e/patch_resources.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
- op: remove
22
path: /spec/template/spec/containers/0/resources
3+
- op: replace
4+
path: /spec/template/spec/containers/0/imagePullPolicy
5+
value: IfNotPresent

test/e2e/mnist.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pytorch_lightning.callbacks.progress import TQDMProgressBar
2121
from torch import nn
2222
from torch.nn import functional as F
23-
from torch.utils.data import DataLoader, random_split, RandomSampler
23+
from torch.utils.data import DataLoader, random_split
2424
from torchmetrics import Accuracy
2525
from torchvision import transforms
2626
from torchvision.datasets import MNIST
@@ -36,6 +36,10 @@
3636
print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL"))
3737
MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL")
3838

39+
print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
40+
# ACCELERATOR = os.getenv("ACCELERATOR")
41+
ACCELERATOR ="gpu"
42+
3943
class LitMNIST(LightningModule):
4044
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
4145

@@ -158,7 +162,7 @@ def setup(self, stage=None):
158162
)
159163

160164
def train_dataloader(self):
161-
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE, sampler=RandomSampler(self.mnist_train, num_samples=1000))
165+
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)
162166

163167
def val_dataloader(self):
164168
return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
@@ -176,13 +180,12 @@ def test_dataloader(self):
176180

177181
# Initialize a trainer
178182
trainer = Trainer(
179-
accelerator="auto",
183+
accelerator=ACCELERATOR,
180184
# devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
181185
max_epochs=3,
182186
callbacks=[TQDMProgressBar(refresh_rate=20)],
183187
num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
184188
devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
185-
replace_sampler_ddp=False,
186189
strategy="ddp",
187190
)
188191

test/e2e/mnist_pytorch_appwrapper_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
3434
func TestMNISTPyTorchAppWrapper(t *testing.T) {
3535
test := With(t)
36-
test.T().Parallel()
3736

3837
// Create a namespace and localqueue in that namespace
3938
namespace := test.NewTestNamespace()
@@ -85,6 +84,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
8584
{Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()},
8685
{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
8786
{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
87+
{Name: "ACCELERATOR", Value: "gpu"},
8888
},
8989
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
9090
VolumeMounts: []corev1.VolumeMount{

test/e2e/mnist_rayjob_raycluster_test.go

+6-4
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ import (
3838
// directly managed by Kueue, and asserts successful completion of the training job.
3939
func TestMNISTRayJobRayCluster(t *testing.T) {
4040
test := With(t)
41-
test.T().Parallel()
4241

4342
// Create a namespace and localqueue in that namespace
4443
namespace := test.NewTestNamespace()
@@ -91,7 +90,6 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
9190
// Same as TestMNISTRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
9291
func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
9392
test := With(t)
94-
test.T().Parallel()
9593

9694
// Create a namespace and localqueue in that namespace
9795
namespace := test.NewTestNamespace()
@@ -282,11 +280,13 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
282280
Resources: corev1.ResourceRequirements{
283281
Requests: corev1.ResourceList{
284282
corev1.ResourceCPU: resource.MustParse("250m"),
285-
corev1.ResourceMemory: resource.MustParse("256Mi"),
283+
corev1.ResourceMemory: resource.MustParse("1G"),
284+
"nvidia.com/gpu": resource.MustParse("1"),
286285
},
287286
Limits: corev1.ResourceList{
288287
corev1.ResourceCPU: resource.MustParse("1"),
289-
corev1.ResourceMemory: resource.MustParse("2G"),
288+
corev1.ResourceMemory: resource.MustParse("4G"),
289+
"nvidia.com/gpu": resource.MustParse("1"),
290290
},
291291
},
292292
},
@@ -320,6 +320,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
320320
MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `"
321321
PIP_INDEX_URL: "` + GetPipIndexURL() + `"
322322
PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `"
323+
ACCELERATOR: "gpu"
323324
`,
324325
ClusterSelector: map[string]string{
325326
RayJobDefaultClusterSelectorKey: rayCluster.Name,
@@ -336,6 +337,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
336337
},
337338
},
338339
},
340+
EntrypointNumGpus: 1,
339341
},
340342
}
341343
}

test/e2e/setup.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,14 @@ metadata:
8484
spec:
8585
namespaceSelector: {} # match all.
8686
resourceGroups:
87-
- coveredResources: ["cpu","memory"]
87+
- coveredResources: ["cpu","memory", "nvidia.com/gpu"]
8888
flavors:
8989
- name: "default-flavor"
9090
resources:
9191
- name: "cpu"
9292
nominalQuota: 4
9393
- name: "memory"
94-
nominalQuota: "4G"
94+
nominalQuota: "20G"
95+
- name: "nvidia.com/gpu"
96+
nominalQuota: "1"
9597
EOF

0 commit comments

Comments (0)