[RHOAIENG-9004] Adjust existing test and workflow for GPU testing #575


Merged (1 commit, Jul 2, 2024)
22 changes: 15 additions & 7 deletions .github/workflows/e2e_tests.yaml
@@ -27,7 +27,7 @@ concurrency:
jobs:
kubernetes-e2e:

runs-on: ubuntu-20.04-4core
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Checkout code
@@ -52,33 +52,41 @@ jobs:
with:
token: ${{ secrets.GITHUB_TOKEN }}

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
echo Setting up CodeFlare stack
make setup-e2e

echo Deploying CodeFlare operator
IMG="${REGISTRY_ADDRESS}"/codeflare-operator
make image-push -e IMG="${IMG}"
IMG=localhost/codeflare-operator:test
make image-build -e IMG="${IMG}"
podman save -o cfo.tar ${IMG}
kind load image-archive cfo.tar --name cluster --verbosity 1000
make deploy -e IMG="${IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager

- name: Run e2e tests
run: |
export CODEFLARE_TEST_TIMEOUT_SHORT=1m
export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m
export CODEFLARE_TEST_TIMEOUT_LONG=10m
export CODEFLARE_TEST_TIMEOUT_SHORT=3m
export CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
export CODEFLARE_TEST_TIMEOUT_LONG=20m
export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m

export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV

set -euo pipefail
go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
go test -timeout 60m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt

- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
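
The workflow now runs on a GPU-capable runner, wraps the KinD setup with the shared nvidia-gpu-setup and nvidia-gpu-operator actions, builds the operator image locally and loads it into KinD via `podman save` and `kind load image-archive` instead of pushing it to a registry, raises the test timeouts, and introduces a CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING variable. A minimal sketch of how a test-support helper could resolve that variable is shown below; the helper name, package, and 30-minute fallback are assumptions for illustration, not the actual codeflare-common API.

```go
package support

import (
	"os"
	"time"
)

// gpuProvisioningTimeout returns the timeout to use when waiting for GPU
// workloads to become schedulable. Hypothetical helper: the environment
// variable name matches the workflow above, while the 30-minute fallback
// is an assumption for this sketch.
func gpuProvisioningTimeout() time.Duration {
	if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING"); ok {
		if d, err := time.ParseDuration(value); err == nil {
			return d
		}
	}
	return 30 * time.Minute
}
```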
2 changes: 1 addition & 1 deletion Dockerfile
@@ -2,7 +2,7 @@

# BEGIN -- workaround lack of go-toolset for golang 1.22

ARG GOLANG_IMAGE=golang:1.22
ARG GOLANG_IMAGE=docker.io/library/golang:1.22

ARG GOARCH=amd64

3 changes: 3 additions & 0 deletions config/e2e/patch_resources.yaml
@@ -1,2 +1,5 @@
- op: remove
path: /spec/template/spec/containers/0/resources
- op: replace
path: /spec/template/spec/containers/0/imagePullPolicy
value: IfNotPresent
10 changes: 6 additions & 4 deletions test/e2e/mnist.py
@@ -20,7 +20,7 @@
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split, RandomSampler
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST
@@ -36,6 +36,9 @@
print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL"))
MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL")

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")

class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):

@@ -158,7 +161,7 @@ def setup(self, stage=None):
)

def train_dataloader(self):
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE, sampler=RandomSampler(self.mnist_train, num_samples=1000))
return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)

def val_dataloader(self):
return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
@@ -176,13 +179,12 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
replace_sampler_ddp=False,
strategy="ddp",
)

12 changes: 10 additions & 2 deletions test/e2e/mnist_pytorch_appwrapper_test.go
@@ -30,10 +30,17 @@ import (
"k8s.io/apimachinery/pkg/runtime"
)

func TestMnistPyTorchAppWrapperCpu(t *testing.T) {
runMnistPyTorchAppWrapper(t, "cpu")
}

func TestMnistPyTorchAppWrapperGpu(t *testing.T) {
runMnistPyTorchAppWrapper(t, "gpu")
}

// Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job.
func TestMNISTPyTorchAppWrapper(t *testing.T) {
func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) {
test := With(t)
test.T().Parallel()

// Create a namespace and localqueue in that namespace
namespace := test.NewTestNamespace()
@@ -85,6 +92,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
{Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()},
{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
{Name: "ACCELERATOR", Value: accelerator},
},
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
VolumeMounts: []corev1.VolumeMount{
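
In this file the single TestMNISTPyTorchAppWrapper test becomes two thin entry points, TestMnistPyTorchAppWrapperCpu and TestMnistPyTorchAppWrapperGpu, delegating to a shared runMnistPyTorchAppWrapper(t, accelerator); the chosen accelerator reaches mnist.py through the new ACCELERATOR environment variable, and the test.T().Parallel() call is dropped. A table-driven subtest layout would be an alternative way to express the same split; the sketch below is illustrative only (it is not what the PR does) and assumes it lives in the same test file so it can reuse runMnistPyTorchAppWrapper.

```go
// Sketch of an alternative, table-driven layout for the same CPU/GPU split.
// It keeps one top-level test while individual cases stay selectable, e.g.
// `go test -run 'TestMnistPyTorchAppWrapperVariants/gpu' ./test/e2e`.
func TestMnistPyTorchAppWrapperVariants(t *testing.T) {
	cases := []struct {
		name        string
		accelerator string
	}{
		{name: "cpu", accelerator: "cpu"},
		{name: "gpu", accelerator: "gpu"},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			runMnistPyTorchAppWrapper(t, tc.accelerator)
		})
	}
}
```

Separate top-level tests, as chosen in the PR, keep each variant selectable with a plain `-run TestMnistPyTorchAppWrapperGpu` and give each run a distinct name in CI reports.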
84 changes: 53 additions & 31 deletions test/e2e/mnist_rayjob_raycluster_test.go
@@ -18,6 +18,7 @@ package e2e

import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"testing"
@@ -36,9 +37,17 @@

// Trains the MNIST dataset as a RayJob, executed by a Ray cluster
// directly managed by Kueue, and asserts successful completion of the training job.
func TestMNISTRayJobRayCluster(t *testing.T) {

func TestMnistRayJobRayClusterCpu(t *testing.T) {
runMnistRayJobRayCluster(t, "cpu", 0)
}

func TestMnistRayJobRayClusterGpu(t *testing.T) {
runMnistRayJobRayCluster(t, "gpu", 1)
}

func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) {
test := With(t)
test.T().Parallel()

// Create a namespace and localqueue in that namespace
namespace := test.NewTestNamespace()
@@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)

// Create RayCluster and assign it to the localqueue
rayCluster := constructRayCluster(test, namespace, mnist)
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
AssignToLocalQueue(rayCluster, localQueue)
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
@@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))

// Create RayJob
rayJob := constructRayJob(test, namespace, rayCluster)
rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
}

func TestMnistRayJobRayClusterAppWrapperCpu(t *testing.T) {
runMnistRayJobRayClusterAppWrapper(t, "cpu", 0)
}

func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) {
runMnistRayJobRayClusterAppWrapper(t, "gpu", 1)
}

// Same as TestMNISTRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) {
test := With(t)
test.T().Parallel()

// Create a namespace and localqueue in that namespace
namespace := test.NewTestNamespace()
@@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)

// Create RayCluster, wrap in AppWrapper and assign to localqueue
rayCluster := constructRayCluster(test, namespace, mnist)
rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus)
aw := &mcadv1beta2.AppWrapper{
TypeMeta: metav1.TypeMeta{
APIVersion: mcadv1beta2.GroupVersion.String(),
@@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))

// Create RayJob
rayJob := constructRayJob(test, namespace, rayCluster)
rayJob := constructRayJob(test, namespace, rayCluster, accelerator, numberOfGpus)
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
@@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
}
}

func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap) *rayv1.RayCluster {
func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap, numberOfGpus int) *rayv1.RayCluster {
return &rayv1.RayCluster{
TypeMeta: metav1.TypeMeta{
APIVersion: rayv1.GroupVersion.String(),
@@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
corev1.ResourceMemory: resource.MustParse("2G"),
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "mnist",
MountPath: "/home/ray/jobs",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "mnist",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: mnist.Name,
},
},
},
},
},
},
@@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("256Mi"),
corev1.ResourceMemory: resource.MustParse("1G"),
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("2G"),
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("4G"),
"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "mnist",
MountPath: "/home/ray/jobs",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "mnist",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: mnist.Name,
},
},
},
},
@@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
}
}

func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster) *rayv1.RayJob {
func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster, accelerator string, numberOfGpus int) *rayv1.RayJob {
return &rayv1.RayJob{
TypeMeta: metav1.TypeMeta{
APIVersion: rayv1.GroupVersion.String(),
@@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `"
PIP_INDEX_URL: "` + GetPipIndexURL() + `"
PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `"
ACCELERATOR: "` + accelerator + `"
`,
ClusterSelector: map[string]string{
RayJobDefaultClusterSelectorKey: rayCluster.Name,
@@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
},
},
},
EntrypointNumCpus: 2,
// Using EntrypointNumGpus doesn't seem to work properly on KinD cluster with GPU, EntrypointNumCpus seems reliable
EntrypointNumGpus: float32(numberOfGpus),
},
}
}
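
Both RayCluster scenarios get the same CPU/GPU split, and constructRayCluster and constructRayJob now take the GPU count: worker requests and limits are raised and gain an nvidia.com/gpu entry, the MNIST ConfigMap volume and mount move from the head group to the worker group, the RayJob runtime environment carries the ACCELERATOR variable, and the entrypoint sets EntrypointNumCpus and EntrypointNumGpus explicitly. The sketch below condenses the worker resource pattern into a helper; the workerResources name is invented for illustration, the quantities mirror the diff, and it assumes the test file's existing imports (fmt, corev1, resource). Passing 0 for the CPU-only variant leaves a zero-quantity extended-resource entry, which in practice amounts to not requesting a GPU.

```go
// workerResources is a hypothetical helper summarizing the worker resource
// pattern used in constructRayCluster: the GPU count flows into both requests
// and limits as the extended resource "nvidia.com/gpu".
func workerResources(numberOfGpus int) corev1.ResourceRequirements {
	gpus := resource.MustParse(fmt.Sprint(numberOfGpus))
	return corev1.ResourceRequirements{
		Requests: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("250m"),
			corev1.ResourceMemory: resource.MustParse("1G"),
			"nvidia.com/gpu":      gpus,
		},
		Limits: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("2"),
			corev1.ResourceMemory: resource.MustParse("4G"),
			"nvidia.com/gpu":      gpus,
		},
	}
}
```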
6 changes: 4 additions & 2 deletions test/e2e/setup.sh
@@ -84,12 +84,14 @@ metadata:
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu","memory"]
- coveredResources: ["cpu","memory", "nvidia.com/gpu"]
flavors:
- name: "default-flavor"
resources:
- name: "cpu"
nominalQuota: 4
- name: "memory"
nominalQuota: "4G"
nominalQuota: "20G"
- name: "nvidia.com/gpu"
nominalQuota: "1"
EOF
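
setup.sh widens the Kueue ClusterQueue so the default flavor also covers nvidia.com/gpu with a nominal quota of 1 and raises the memory quota from 4G to 20G, leaving CPU at 4. For reference, the sketch below builds an equivalent spec with the Kueue Go API; it assumes the kueue v1beta1 types from sigs.k8s.io/kueue and an invented queue name, while the PR itself simply applies the YAML with kubectl.

```go
package e2e

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// newE2EClusterQueue mirrors the quota the script applies: 4 CPUs, 20G of
// memory, and a single nvidia.com/gpu on the default flavor. The queue name
// is an assumption for this sketch.
func newE2EClusterQueue() *kueuev1beta1.ClusterQueue {
	return &kueuev1beta1.ClusterQueue{
		ObjectMeta: metav1.ObjectMeta{Name: "e2e-cluster-queue"},
		Spec: kueuev1beta1.ClusterQueueSpec{
			NamespaceSelector: &metav1.LabelSelector{}, // match all namespaces
			ResourceGroups: []kueuev1beta1.ResourceGroup{{
				CoveredResources: []corev1.ResourceName{"cpu", "memory", "nvidia.com/gpu"},
				Flavors: []kueuev1beta1.FlavorQuotas{{
					Name: "default-flavor",
					Resources: []kueuev1beta1.ResourceQuota{
						{Name: corev1.ResourceCPU, NominalQuota: resource.MustParse("4")},
						{Name: corev1.ResourceMemory, NominalQuota: resource.MustParse("20G")},
						{Name: "nvidia.com/gpu", NominalQuota: resource.MustParse("1")},
					},
				}},
			}},
		},
	}
}
```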