diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index d84ce09c2..23a044fe0 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -27,7 +27,7 @@ concurrency: jobs: kubernetes-e2e: - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-20.04-4core-gpu steps: - name: Checkout code @@ -52,9 +52,15 @@ jobs: with: token: ${{ secrets.GITHUB_TOKEN }} + - name: Setup NVidia GPU environment for KinD + uses: ./common/github-actions/nvidia-gpu-setup + - name: Setup and start KinD cluster uses: ./common/github-actions/kind + - name: Install NVidia GPU operator for KinD + uses: ./common/github-actions/nvidia-gpu-operator + - name: Deploy CodeFlare stack id: deploy run: | @@ -62,23 +68,25 @@ jobs: make setup-e2e echo Deploying CodeFlare operator - IMG="${REGISTRY_ADDRESS}"/codeflare-operator - make image-push -e IMG="${IMG}" + IMG=localhost/codeflare-operator:test + make image-build -e IMG="${IMG}" + podman save -o cfo.tar ${IMG} + kind load image-archive cfo.tar --name cluster --verbosity 1000 make deploy -e IMG="${IMG}" -e ENV="e2e" kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager - name: Run e2e tests run: | - export CODEFLARE_TEST_TIMEOUT_SHORT=1m - export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m - export CODEFLARE_TEST_TIMEOUT_LONG=10m + export CODEFLARE_TEST_TIMEOUT_SHORT=3m + export CODEFLARE_TEST_TIMEOUT_MEDIUM=10m + export CODEFLARE_TEST_TIMEOUT_LONG=20m export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + go test -timeout 60m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Print CodeFlare operator logs if: always() && steps.deploy.outcome == 'success' diff 
--git a/Dockerfile b/Dockerfile index e354480ea..72e842059 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # BEGIN -- workaround lack of go-toolset for golang 1.22 -ARG GOLANG_IMAGE=golang:1.22 +ARG GOLANG_IMAGE=docker.io/library/golang:1.22 ARG GOARCH=amd64 diff --git a/config/e2e/patch_resources.yaml b/config/e2e/patch_resources.yaml index 3da1d5f3a..c3e91ae23 100644 --- a/config/e2e/patch_resources.yaml +++ b/config/e2e/patch_resources.yaml @@ -1,2 +1,5 @@ - op: remove path: /spec/template/spec/containers/0/resources +- op: replace + path: /spec/template/spec/containers/0/imagePullPolicy + value: IfNotPresent diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py index 5ac266652..5a89a8b38 100644 --- a/test/e2e/mnist.py +++ b/test/e2e/mnist.py @@ -20,7 +20,7 @@ from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch import nn from torch.nn import functional as F -from torch.utils.data import DataLoader, random_split, RandomSampler +from torch.utils.data import DataLoader, random_split from torchmetrics import Accuracy from torchvision import transforms from torchvision.datasets import MNIST @@ -36,6 +36,9 @@ print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL")) MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL") +print("ACCELERATOR: is ", os.getenv("ACCELERATOR")) +ACCELERATOR = os.getenv("ACCELERATOR") + class LitMNIST(LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): @@ -158,7 +161,7 @@ def setup(self, stage=None): ) def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=BATCH_SIZE, sampler=RandomSampler(self.mnist_train, num_samples=1000)) + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) def val_dataloader(self): return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) @@ -176,13 +179,12 @@ def test_dataloader(self): # Initialize a trainer trainer = Trainer( - accelerator="auto", + accelerator=ACCELERATOR, # devices=1 if 
torch.cuda.is_available() else None, # limiting got iPython runs max_epochs=3, callbacks=[TQDMProgressBar(refresh_rate=20)], num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), - replace_sampler_ddp=False, strategy="ddp", ) diff --git a/test/e2e/mnist_pytorch_appwrapper_test.go b/test/e2e/mnist_pytorch_appwrapper_test.go index e918b15b1..94239f57c 100644 --- a/test/e2e/mnist_pytorch_appwrapper_test.go +++ b/test/e2e/mnist_pytorch_appwrapper_test.go @@ -30,10 +30,17 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) +func TestMnistPyTorchAppWrapperCpu(t *testing.T) { + runMnistPyTorchAppWrapper(t, "cpu") +} + +func TestMnistPyTorchAppWrapperGpu(t *testing.T) { + runMnistPyTorchAppWrapper(t, "gpu") +} + // Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job. -func TestMNISTPyTorchAppWrapper(t *testing.T) { +func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) { test := With(t) - test.T().Parallel() // Create a namespace and localqueue in that namespace namespace := test.NewTestNamespace() @@ -85,6 +92,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) { {Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()}, {Name: "PIP_INDEX_URL", Value: GetPipIndexURL()}, {Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()}, + {Name: "ACCELERATOR", Value: accelerator}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ diff --git a/test/e2e/mnist_rayjob_raycluster_test.go b/test/e2e/mnist_rayjob_raycluster_test.go index 5bf5e9324..0f2490c21 100644 --- a/test/e2e/mnist_rayjob_raycluster_test.go +++ b/test/e2e/mnist_rayjob_raycluster_test.go @@ -18,6 +18,7 @@ package e2e import ( "crypto/tls" + "fmt" "net/http" "net/url" "testing" @@ -36,9 +37,17 @@ import ( // Trains the MNIST dataset as a RayJob, executed by a Ray cluster // directly managed by Kueue, and asserts 
successful completion of the training job. -func TestMNISTRayJobRayCluster(t *testing.T) { + +func TestMnistRayJobRayClusterCpu(t *testing.T) { + runMnistRayJobRayCluster(t, "cpu", 0) +} + +func TestMnistRayJobRayClusterGpu(t *testing.T) { + runMnistRayJobRayCluster(t, "gpu", 1) +} + +func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) { test := With(t) - test.T().Parallel() // Create a namespace and localqueue in that namespace namespace := test.NewTestNamespace() @@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) { test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) // Create RayCluster and assign it to the localqueue - rayCluster := constructRayCluster(test, namespace, mnist) + rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus) AssignToLocalQueue(rayCluster, localQueue) rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) @@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) { Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) // Create RayJob - rayJob := constructRayJob(test, namespace, rayCluster) + rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus) rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) @@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) { To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded))) } +func TestMnistRayJobRayClusterAppWrapperCpu(t *testing.T) { + runMnistRayJobRayClusterAppWrapper(t, "cpu", 0) +} + +func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) { + runMnistRayJobRayClusterAppWrapper(t, "gpu", 1) +} + // Same as TestMNISTRayJobRayCluster, except the RayCluster is 
wrapped in an AppWrapper -func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) { +func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) { test := With(t) - test.T().Parallel() // Create a namespace and localqueue in that namespace namespace := test.NewTestNamespace() @@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) { test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) // Create RayCluster, wrap in AppWrapper and assign to localqueue - rayCluster := constructRayCluster(test, namespace, mnist) + rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus) aw := &mcadv1beta2.AppWrapper{ TypeMeta: metav1.TypeMeta{ APIVersion: mcadv1beta2.GroupVersion.String(), @@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) { Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) // Create RayJob - rayJob := constructRayJob(test, namespace, rayCluster) + rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus) rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) @@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con } } -func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap) *rayv1.RayCluster { +func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap, numberOfGpus int) *rayv1.RayCluster { return &rayv1.RayCluster{ TypeMeta: metav1.TypeMeta{ APIVersion: rayv1.GroupVersion.String(), @@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf corev1.ResourceMemory: resource.MustParse("2G"), }, }, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "mnist", - MountPath: "/home/ray/jobs", - }, - }, - }, 
- }, - Volumes: []corev1.Volume{ - { - Name: "mnist", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: mnist.Name, - }, - }, - }, }, }, }, @@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("256Mi"), + corev1.ResourceMemory: resource.MustParse("1G"), + "nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)), }, Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("1"), - corev1.ResourceMemory: resource.MustParse("2G"), + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4G"), + "nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)), + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "mnist", + MountPath: "/home/ray/jobs", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "mnist", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mnist.Name, + }, }, }, }, @@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf } } -func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster) *rayv1.RayJob { +func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster, accelerator string, numberOfGpus int) *rayv1.RayJob { return &rayv1.RayJob{ TypeMeta: metav1.TypeMeta{ APIVersion: rayv1.GroupVersion.String(), @@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `" PIP_INDEX_URL: "` + GetPipIndexURL() + `" PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `" + ACCELERATOR: "` + accelerator + `" `, ClusterSelector: 
map[string]string{ RayJobDefaultClusterSelectorKey: rayCluster.Name, @@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC }, }, }, + EntrypointNumCpus: 2, + // EntrypointNumGpus doesn't seem to work properly on a KinD cluster with GPU; EntrypointNumCpus seems reliable. + EntrypointNumGpus: float32(numberOfGpus), }, } } diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index 8d0bb2eee..a7f442e55 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -84,12 +84,14 @@ metadata: spec: namespaceSelector: {} # match all. resourceGroups: - - coveredResources: ["cpu","memory"] + - coveredResources: ["cpu","memory", "nvidia.com/gpu"] flavors: - name: "default-flavor" resources: - name: "cpu" nominalQuota: 4 - name: "memory" - nominalQuota: "4G" + nominalQuota: "20G" + - name: "nvidia.com/gpu" + nominalQuota: "1" EOF