GPU test

sutaakar · sutaakar · commit 4b8e58cdde75 · 2024-06-20T15:22:29.000+02:00
diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
@@ -27,7 +27,7 @@ concurrency:
 jobs:
   kubernetes-e2e:
 
-    runs-on: ubuntu-20.04-4core
+    runs-on: ubuntu-20.04-4core-gpu
 
     steps:
       - name: Checkout code
@@ -52,6 +52,17 @@ jobs:
         with:
           token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Install Podman
+        run: |
+          # cat /etc/needrestart/needrestart.conf
+          # sudo sed -i "s/#$nrconf{restart} = 'i';/#$nrconf{restart} = 'a';/" /etc/needrestart/needrestart.conf
+          sudo apt-get -y install podman dbus-user-session
+          sudo systemctl --user start dbus
+          # To avoid Error: error creating build container: loading drop-in registries configuration "/etc/containers/registries.conf.d/local.conf": open /etc/containers/registries.conf.d/local.conf: permission denied
+          mkdir -p ~/.config  # -p: do not fail the step when ~/.config already exists on the runner
+          cp -r /etc/containers ~/.config/containers
+          # sudo chmod --recursive 777 /etc/containers
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
 
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,7 @@
 
 # BEGIN -- workaround lack of go-toolset for golang 1.21
 
-ARG GOLANG_IMAGE=golang:1.21
+ARG GOLANG_IMAGE=docker.io/library/golang:1.21
 
 ARG GOARCH=amd64
 
diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py
diff --git a/test/e2e/mnist_fashion.py b/test/e2e/mnist_fashion.py
@@ -0,0 +1,98 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import ray
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+from ray.train.torch import TorchTrainer
+from ray.train import ScalingConfig
+
+
+def get_dataset():
+    """Return the FashionMNIST training split as tensors, downloaded to /tmp/data."""
+    return datasets.FashionMNIST(
+        root="/tmp/data",
+        train=True,
+        download=True,
+        transform=ToTensor(),
+    )
+
+
+class NeuralNetwork(nn.Module):
+    """Fully connected classifier: flatten, two 512-unit ReLU layers, 10 logits."""
+
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28 * 28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10),
+        )
+
+    def forward(self, inputs):
+        """Flatten ``inputs`` and return the unnormalized class logits."""
+        inputs = self.flatten(inputs)
+        logits = self.linear_relu_stack(inputs)
+        return logits
+
+
+def train_func_distributed():
+    """Training loop handed to TorchTrainer: 3 epochs of SGD on FashionMNIST."""
+    num_epochs = 3
+    batch_size = 64
+
+    dataset = get_dataset()
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+    # Hand the loader and the model to Ray Train's prepare_* helpers before use.
+    dataloader = ray.train.torch.prepare_data_loader(dataloader)
+
+    model = NeuralNetwork()
+    model = ray.train.torch.prepare_model(model)
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+    for epoch in range(num_epochs):
+        # With more than one worker, tell the loader's sampler the current epoch
+        # (presumably a DistributedSampler that reshuffles per epoch -- confirm).
+        if ray.train.get_context().get_world_size() > 1:
+            dataloader.sampler.set_epoch(epoch)
+
+        for inputs, labels in dataloader:
+            optimizer.zero_grad()
+            pred = model(inputs)
+            loss = criterion(pred, labels)
+            loss.backward()
+            optimizer.step()
+        # Only the loss of the epoch's last batch is reported here.
+        print(f"epoch: {epoch}, loss: {loss.item()}")
+
+
+# For GPU Training, set `use_gpu` to True.
+use_gpu = True
+
+trainer = TorchTrainer(
+    train_func_distributed,
+    scaling_config=ScalingConfig(
+        num_workers=3, use_gpu=use_gpu
+    ),  # num_workers = Ray Train workers to launch (sized to the worker nodes, Ray head node included)
+)
+
+results = trainer.fit()
diff --git a/test/e2e/mnist_pip_requirements.txt b/test/e2e/mnist_pip_requirements.txt
@@ -1,3 +1,4 @@
-pytorch_lightning==1.5.10
-torchmetrics==0.9.1
-torchvision==0.12.0
+pytorch_lightning==2.2.5
+ray_lightning
+torchmetrics==1.4.0
+torchvision==0.18.0
diff --git a/test/e2e/mnist_pytorch_appwrapper_test.go b/test/e2e/mnist_pytorch_appwrapper_test.go
@@ -53,7 +53,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
 			// pip requirements
 			"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
 			// MNIST training script
-			"mnist.py": ReadFile(test, "mnist.py"),
+			"mnist_fashion.py": ReadFile(test, "mnist_fashion.py"),
 		},
 		Immutable: Ptr(true),
 	}
@@ -86,7 +86,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
 								{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
 								{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
 							},
-							Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
+							Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist_fashion.py"},
 							VolumeMounts: []corev1.VolumeMount{
 								{
 									Name:      "test",
diff --git a/test/e2e/mnist_rayjob_raycluster_test.go b/test/e2e/mnist_rayjob_raycluster_test.go
@@ -177,7 +177,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
 			Namespace: namespace.Name,
 		},
 		BinaryData: map[string][]byte{
-			"mnist.py": ReadFile(test, "mnist.py"),
+			"mnist_fashion.py": ReadFile(test, "mnist_fashion.py"),
 		},
 		Immutable: Ptr(true),
 	}
@@ -229,11 +229,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
 								Resources: corev1.ResourceRequirements{
 									Requests: corev1.ResourceList{
 										corev1.ResourceCPU:    resource.MustParse("250m"),
-										corev1.ResourceMemory: resource.MustParse("512Mi"),
+										corev1.ResourceMemory: resource.MustParse("1G"),
 									},
 									Limits: corev1.ResourceList{
 										corev1.ResourceCPU:    resource.MustParse("1"),
-										corev1.ResourceMemory: resource.MustParse("2G"),
+										corev1.ResourceMemory: resource.MustParse("4G"),
 									},
 								},
 								VolumeMounts: []corev1.VolumeMount{
@@ -282,11 +282,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
 									Resources: corev1.ResourceRequirements{
 										Requests: corev1.ResourceList{
 											corev1.ResourceCPU:    resource.MustParse("250m"),
-											corev1.ResourceMemory: resource.MustParse("256Mi"),
+											corev1.ResourceMemory: resource.MustParse("1G"),
 										},
 										Limits: corev1.ResourceList{
 											corev1.ResourceCPU:    resource.MustParse("1"),
-											corev1.ResourceMemory: resource.MustParse("2G"),
+											corev1.ResourceMemory: resource.MustParse("4G"),
 										},
 									},
 								},
@@ -310,7 +310,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
 			Namespace: namespace.Name,
 		},
 		Spec: rayv1.RayJobSpec{
-			Entrypoint: "python /home/ray/jobs/mnist.py",
+			Entrypoint: "python /home/ray/jobs/mnist_fashion.py",
 			RuntimeEnvYAML: `
   pip:
     - pytorch_lightning==1.5.10