[RHOAIENG-9004] Adjust existing test and workflow for GPU testing

sutaakar · sutaakar · commit 47098b494541 · 2024-07-02T08:49:26.000+02:00
diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
@@ -27,7 +27,7 @@ concurrency:
 jobs:
   kubernetes-e2e:
 
-    runs-on: ubuntu-20.04-4core
+    runs-on: ubuntu-20.04-4core-gpu
 
     steps:
       - name: Checkout code
@@ -52,33 +52,41 @@ jobs:
         with:
           token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
 
+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           echo Setting up CodeFlare stack
           make setup-e2e
 
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
+          IMG=localhost/codeflare-operator:test
+          make image-build -e IMG="${IMG}"
+          podman save -o cfo.tar ${IMG}
+          kind load image-archive cfo.tar --name cluster --verbosity 1000
           make deploy -e IMG="${IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
 
       - name: Run e2e tests
         run: |
-          export CODEFLARE_TEST_TIMEOUT_SHORT=1m
-          export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m
-          export CODEFLARE_TEST_TIMEOUT_LONG=10m
+          export CODEFLARE_TEST_TIMEOUT_SHORT=3m
+          export CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
+          export CODEFLARE_TEST_TIMEOUT_LONG=20m
           export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m
 
           export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
           echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
 
           set -euo pipefail
-          go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
+          go test -timeout 60m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
 
       - name: Print CodeFlare operator logs
         if: always() && steps.deploy.outcome == 'success'
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,7 @@
 
 # BEGIN -- workaround lack of go-toolset for golang 1.22
 
-ARG GOLANG_IMAGE=golang:1.22
+ARG GOLANG_IMAGE=docker.io/library/golang:1.22
 
 ARG GOARCH=amd64
 
diff --git a/config/e2e/patch_resources.yaml b/config/e2e/patch_resources.yaml
@@ -1,2 +1,5 @@
 - op: remove
   path: /spec/template/spec/containers/0/resources
+- op: replace
+  path: /spec/template/spec/containers/0/imagePullPolicy
+  value: IfNotPresent
diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py
@@ -20,7 +20,7 @@
 from pytorch_lightning.callbacks.progress import TQDMProgressBar
 from torch import nn
 from torch.nn import functional as F
-from torch.utils.data import DataLoader, random_split, RandomSampler
+from torch.utils.data import DataLoader, random_split
 from torchmetrics import Accuracy
 from torchvision import transforms
 from torchvision.datasets import MNIST
@@ -36,6 +36,9 @@
 print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL"))
 MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL")
 
+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+
 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
 
@@ -158,7 +161,7 @@ def setup(self, stage=None):
             )
 
     def train_dataloader(self):
-        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE, sampler=RandomSampler(self.mnist_train, num_samples=1000))
+        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)
 
     def val_dataloader(self):
         return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
@@ -176,13 +179,12 @@ def test_dataloader(self):
 
 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
     num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
     devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
-    replace_sampler_ddp=False,
     strategy="ddp",
 )
 
diff --git a/test/e2e/mnist_pytorch_appwrapper_test.go b/test/e2e/mnist_pytorch_appwrapper_test.go
@@ -30,10 +30,17 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 )
 
+func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
+	runMnistPyTorchAppWrapper(t, "cpu")
+}
+
+func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
+	runMnistPyTorchAppWrapper(t, "gpu")
+}
+
 // Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
-func TestMNISTPyTorchAppWrapper(t *testing.T) {
+func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) {
 	test := With(t)
-	test.T().Parallel()
 
 	// Create a namespace and localqueue in that namespace
 	namespace := test.NewTestNamespace()
@@ -85,6 +92,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
 								{Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()},
 								{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
 								{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
+								{Name: "ACCELERATOR", Value: accelerator},
 							},
 							Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
 							VolumeMounts: []corev1.VolumeMount{
diff --git a/test/e2e/mnist_rayjob_raycluster_test.go b/test/e2e/mnist_rayjob_raycluster_test.go
@@ -18,6 +18,7 @@ package e2e
 
 import (
 	"crypto/tls"
+	"fmt"
 	"net/http"
 	"net/url"
 	"testing"
@@ -36,9 +37,17 @@ import (
 
 // Trains the MNIST dataset as a RayJob, executed by a Ray cluster
 // directly managed by Kueue, and asserts successful completion of the training job.
-func TestMNISTRayJobRayCluster(t *testing.T) {
+
+func TestMnistRayJobRayClusterCpu(t *testing.T) {
+	runMnistRayJobRayCluster(t, "cpu", 0)
+}
+
+func TestMnistRayJobRayClusterGpu(t *testing.T) {
+	runMnistRayJobRayCluster(t, "gpu", 1)
+}
+
+func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) {
 	test := With(t)
-	test.T().Parallel()
 
 	// Create a namespace and localqueue in that namespace
 	namespace := test.NewTestNamespace()
@@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
 	test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
 
 	// Create RayCluster and assign it to the localqueue
-	rayCluster := constructRayCluster(test, namespace, mnist)
+	rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
 	AssignToLocalQueue(rayCluster, localQueue)
 	rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
 	test.Expect(err).NotTo(HaveOccurred())
@@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
 		Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
 
 	// Create RayJob
-	rayJob := constructRayJob(test, namespace, rayCluster)
+	rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
 	rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
 	test.Expect(err).NotTo(HaveOccurred())
 	test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
 		To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
 }
 
+func TestMnistRayJobRayClusterAppWrapperCpu(t *testing.T) {
+	runMnistRayJobRayClusterAppWrapper(t, "cpu", 0)
+}
+
+func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) {
+	runMnistRayJobRayClusterAppWrapper(t, "gpu", 1)
+}
+
-// Same as TestMNISTRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
+// Same as runMnistRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper.
-func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
+func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
 	test := With(t)
-	test.T().Parallel()
 
 	// Create a namespace and localqueue in that namespace
 	namespace := test.NewTestNamespace()
@@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
 	test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
 
 	// Create RayCluster, wrap in AppWrapper and assign to localqueue
-	rayCluster := constructRayCluster(test, namespace, mnist)
+	rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
 	aw := &mcadv1beta2.AppWrapper{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: mcadv1beta2.GroupVersion.String(),
@@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
 		Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
 
 	// Create RayJob
-	rayJob := constructRayJob(test, namespace, rayCluster)
+	rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
 	rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
 	test.Expect(err).NotTo(HaveOccurred())
 	test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
 	}
 }
 
-func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap) *rayv1.RayCluster {
+func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap, numberOfGpus int) *rayv1.RayCluster {
 	return &rayv1.RayCluster{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: rayv1.GroupVersion.String(),
@@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
 										corev1.ResourceMemory: resource.MustParse("2G"),
 									},
 								},
-								VolumeMounts: []corev1.VolumeMount{
-									{
-										Name:      "mnist",
-										MountPath: "/home/ray/jobs",
-									},
-								},
-							},
-						},
-						Volumes: []corev1.Volume{
-							{
-								Name: "mnist",
-								VolumeSource: corev1.VolumeSource{
-									ConfigMap: &corev1.ConfigMapVolumeSource{
-										LocalObjectReference: corev1.LocalObjectReference{
-											Name: mnist.Name,
-										},
-									},
-								},
 							},
 						},
 					},
@@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
 									Resources: corev1.ResourceRequirements{
 										Requests: corev1.ResourceList{
 											corev1.ResourceCPU:    resource.MustParse("250m"),
-											corev1.ResourceMemory: resource.MustParse("256Mi"),
+											corev1.ResourceMemory: resource.MustParse("1G"),
+											"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numberOfGpus)),
 										},
 										Limits: corev1.ResourceList{
-											corev1.ResourceCPU:    resource.MustParse("1"),
-											corev1.ResourceMemory: resource.MustParse("2G"),
+											corev1.ResourceCPU:    resource.MustParse("2"),
+											corev1.ResourceMemory: resource.MustParse("4G"),
+											"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numberOfGpus)),
+										},
+									},
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      "mnist",
+											MountPath: "/home/ray/jobs",
+										},
+									},
+								},
+							},
+							Volumes: []corev1.Volume{
+								{
+									Name: "mnist",
+									VolumeSource: corev1.VolumeSource{
+										ConfigMap: &corev1.ConfigMapVolumeSource{
+											LocalObjectReference: corev1.LocalObjectReference{
+												Name: mnist.Name,
+											},
 										},
 									},
 								},
@@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
 	}
 }
 
-func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster) *rayv1.RayJob {
+func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster, accelerator string, numberOfGpus int) *rayv1.RayJob {
 	return &rayv1.RayJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: rayv1.GroupVersion.String(),
@@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
     MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `"
     PIP_INDEX_URL: "` + GetPipIndexURL() + `"
     PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `"
+    ACCELERATOR: "` + accelerator + `"
 `,
 			ClusterSelector: map[string]string{
 				RayJobDefaultClusterSelectorKey: rayCluster.Name,
@@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
 					},
 				},
 			},
+			EntrypointNumCpus: 2,
+			// EntrypointNumGpus on its own doesn't seem to work reliably on a KinD cluster with GPU, so EntrypointNumCpus (which seems reliable) is set alongside it.
+			EntrypointNumGpus: float32(numberOfGpus),
 		},
 	}
 }
diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh
@@ -84,12 +84,14 @@ metadata:
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
-  - coveredResources: ["cpu","memory"]
+  - coveredResources: ["cpu","memory", "nvidia.com/gpu"]
     flavors:
     - name: "default-flavor"
       resources:
       - name: "cpu"
         nominalQuota: 4
       - name: "memory"
-        nominalQuota: "4G"
+        nominalQuota: "20G"
+      - name: "nvidia.com/gpu"
+        nominalQuota: "1"
 EOF