Skip to content

Commit 4b8e58c

Browse files
committed
GPU test
1 parent cbbee9d commit 4b8e58c

7 files changed

+123
-203
lines changed

.github/workflows/e2e_tests.yaml

+12-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ concurrency:
2727
jobs:
2828
kubernetes-e2e:
2929

30-
runs-on: ubuntu-20.04-4core
30+
runs-on: ubuntu-20.04-4core-gpu
3131

3232
steps:
3333
- name: Checkout code
@@ -52,6 +52,17 @@ jobs:
5252
with:
5353
token: ${{ secrets.GITHUB_TOKEN }}
5454

55+
- name: Install Podman
56+
run: |
57+
# cat /etc/needrestart/needrestart.conf
58+
# sudo sed -i "s/#$nrconf{restart} = 'i';/#$nrconf{restart} = 'a';/" /etc/needrestart/needrestart.conf
59+
sudo apt-get -y install podman dbus-user-session
60+
sudo systemctl --user start dbus
61+
# To avoid Error: error creating build container: loading drop-in registries configuration "/etc/containers/registries.conf.d/local.conf": open /etc/containers/registries.conf.d/local.conf: permission denied
62+
mkdir ~/.config
63+
cp -r /etc/containers ~/.config/containers
64+
# sudo chmod --recursive 777 /etc/containers
65+
5566
- name: Setup and start KinD cluster
5667
uses: ./common/github-actions/kind
5768

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# BEGIN -- workaround lack of go-toolset for golang 1.21
44

5-
ARG GOLANG_IMAGE=golang:1.21
5+
ARG GOLANG_IMAGE=docker.io/library/golang:1.21
66

77
ARG GOARCH=amd64
88

test/e2e/mnist.py

-190
This file was deleted.

test/e2e/mnist_fashion.py

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Copyright 2022 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import torch
16+
import torch.nn as nn
17+
import ray
18+
from torch.utils.data import DataLoader
19+
from torchvision import datasets
20+
from torchvision.transforms import ToTensor
21+
from ray.train.torch import TorchTrainer
22+
from ray.train import ScalingConfig
23+
24+
25+
def get_dataset():
    """Build the FashionMNIST training dataset used by the training loop.

    Returns:
        A ``datasets.FashionMNIST`` instance for the training split, rooted at
        ``/tmp/data`` with ``download=True`` and every sample transformed by
        ``ToTensor()``.
    """
    return datasets.FashionMNIST(
        root="/tmp/data",
        train=True,
        download=True,
        transform=ToTensor(),
    )


class NeuralNetwork(nn.Module):
    """Small fully connected classifier producing 10 logits per sample.

    The input is flattened and passed through three linear layers
    (784 -> 512 -> 512 -> 10) with a ReLU after each of the first two.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, inputs):
        """Return the raw (un-normalised) class logits for ``inputs``.

        Args:
            inputs: a batch whose non-batch dimensions flatten to 28 * 28
                values per sample.

        Returns:
            The output of the final linear layer, 10 values per sample.
        """
        inputs = self.flatten(inputs)
        logits = self.linear_relu_stack(inputs)
        return logits
59+
60+
61+
def train_func_distributed():
    """Training loop passed to ``TorchTrainer`` as the per-worker function.

    Trains ``NeuralNetwork`` on the dataset from ``get_dataset()`` for three
    epochs with SGD (lr=0.01) and cross-entropy loss, printing the loss of the
    last processed batch after each epoch.
    """
    num_epochs = 3
    batch_size = 64

    # Both objects are handed to Ray Train's prepare_* helpers before use
    # (see the Ray Train docs for what the wrappers add).
    loader = ray.train.torch.prepare_data_loader(
        DataLoader(get_dataset(), batch_size=batch_size, shuffle=True)
    )
    net = ray.train.torch.prepare_model(NeuralNetwork())

    loss_fn = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        # Only touch the sampler's epoch when more than one worker takes part.
        if ray.train.get_context().get_world_size() > 1:
            loader.sampler.set_epoch(epoch)

        for inputs, labels in loader:
            sgd.zero_grad()
            loss = loss_fn(net(inputs), labels)
            loss.backward()
            sgd.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")
86+
87+
88+
# For GPU training, set `use_gpu` to True.
use_gpu = True

# Launch `train_func_distributed` on three Ray Train workers.
# Original author's note: num_workers = number of worker nodes with the ray
# head node included.  NOTE(review): confirm this matches the cluster layout.
trainer = TorchTrainer(
    train_loop_per_worker=train_func_distributed,
    scaling_config=ScalingConfig(
        num_workers=3,
        use_gpu=use_gpu,
    ),
)

# Runs the training job; the returned result object is kept in `results`.
results = trainer.fit()

test/e2e/mnist_pip_requirements.txt

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
pytorch_lightning==1.5.10
2-
torchmetrics==0.9.1
3-
torchvision==0.12.0
1+
pytorch_lightning==2.2.5
2+
ray_lightning
3+
torchmetrics==1.4.0
4+
torchvision==0.18.0

test/e2e/mnist_pytorch_appwrapper_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
5353
// pip requirements
5454
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
5555
// MNIST training script
56-
"mnist.py": ReadFile(test, "mnist.py"),
56+
"mnist_fashion.py": ReadFile(test, "mnist_fashion.py"),
5757
},
5858
Immutable: Ptr(true),
5959
}
@@ -86,7 +86,7 @@ func TestMNISTPyTorchAppWrapper(t *testing.T) {
8686
{Name: "PIP_INDEX_URL", Value: GetPipIndexURL()},
8787
{Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()},
8888
},
89-
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
89+
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist_fashion.py"},
9090
VolumeMounts: []corev1.VolumeMount{
9191
{
9292
Name: "test",

test/e2e/mnist_rayjob_raycluster_test.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
177177
Namespace: namespace.Name,
178178
},
179179
BinaryData: map[string][]byte{
180-
"mnist.py": ReadFile(test, "mnist.py"),
180+
"mnist_fashion.py": ReadFile(test, "mnist_fashion.py"),
181181
},
182182
Immutable: Ptr(true),
183183
}
@@ -229,11 +229,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
229229
Resources: corev1.ResourceRequirements{
230230
Requests: corev1.ResourceList{
231231
corev1.ResourceCPU: resource.MustParse("250m"),
232-
corev1.ResourceMemory: resource.MustParse("512Mi"),
232+
corev1.ResourceMemory: resource.MustParse("1G"),
233233
},
234234
Limits: corev1.ResourceList{
235235
corev1.ResourceCPU: resource.MustParse("1"),
236-
corev1.ResourceMemory: resource.MustParse("2G"),
236+
corev1.ResourceMemory: resource.MustParse("4G"),
237237
},
238238
},
239239
VolumeMounts: []corev1.VolumeMount{
@@ -282,11 +282,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
282282
Resources: corev1.ResourceRequirements{
283283
Requests: corev1.ResourceList{
284284
corev1.ResourceCPU: resource.MustParse("250m"),
285-
corev1.ResourceMemory: resource.MustParse("256Mi"),
285+
corev1.ResourceMemory: resource.MustParse("1G"),
286286
},
287287
Limits: corev1.ResourceList{
288288
corev1.ResourceCPU: resource.MustParse("1"),
289-
corev1.ResourceMemory: resource.MustParse("2G"),
289+
corev1.ResourceMemory: resource.MustParse("4G"),
290290
},
291291
},
292292
},
@@ -310,7 +310,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
310310
Namespace: namespace.Name,
311311
},
312312
Spec: rayv1.RayJobSpec{
313-
Entrypoint: "python /home/ray/jobs/mnist.py",
313+
Entrypoint: "python /home/ray/jobs/mnist_fashion.py",
314314
RuntimeEnvYAML: `
315315
pip:
316316
- pytorch_lightning==1.5.10

0 commit comments

Comments
 (0)