@@ -18,6 +18,7 @@ package e2e
18
18
19
19
import (
20
20
"crypto/tls"
21
+ "fmt"
21
22
"net/http"
22
23
"net/url"
23
24
"testing"
@@ -36,9 +37,17 @@ import (
36
37
37
38
// Trains the MNIST dataset as a RayJob, executed by a Ray cluster
38
39
// directly managed by Kueue, and asserts successful completion of the training job.
39
- func TestMNISTRayJobRayCluster (t * testing.T ) {
40
+
41
+ func TestMnistRayJobRayClusterCpu (t * testing.T ) {
42
+ runMnistRayJobRayCluster (t , "cpu" , 0 )
43
+ }
44
+
45
+ func TestMnistRayJobRayClusterGpu (t * testing.T ) {
46
+ runMnistRayJobRayCluster (t , "gpu" , 1 )
47
+ }
48
+
49
+ func runMnistRayJobRayCluster (t * testing.T , accelerator string , numberOfGpus int ) {
40
50
test := With (t )
41
- test .T ().Parallel ()
42
51
43
52
// Create a namespace and localqueue in that namespace
44
53
namespace := test .NewTestNamespace ()
@@ -51,7 +60,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
51
60
test .T ().Logf ("Created ConfigMap %s/%s successfully" , mnist .Namespace , mnist .Name )
52
61
53
62
// Create RayCluster and assign it to the localqueue
54
- rayCluster := constructRayCluster (test , namespace , mnist )
63
+ rayCluster := constructRayCluster (test , namespace , mnist , numberOfGpus )
55
64
AssignToLocalQueue (rayCluster , localQueue )
56
65
rayCluster , err = test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).Create (test .Ctx (), rayCluster , metav1.CreateOptions {})
57
66
test .Expect (err ).NotTo (HaveOccurred ())
@@ -62,7 +71,7 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
62
71
Should (WithTransform (RayClusterState , Equal (rayv1 .Ready )))
63
72
64
73
// Create RayJob
65
- rayJob := constructRayJob (test , namespace , rayCluster )
74
+ rayJob := constructRayJob (test , namespace , rayCluster , accelerator , numberOfGpus )
66
75
rayJob , err = test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Create (test .Ctx (), rayJob , metav1.CreateOptions {})
67
76
test .Expect (err ).NotTo (HaveOccurred ())
68
77
test .T ().Logf ("Created RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
@@ -88,10 +97,17 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
88
97
To (WithTransform (RayJobStatus , Equal (rayv1 .JobStatusSucceeded )))
89
98
}
90
99
100
+ func TestMnistRayJobRayClusterAppWrapperCpu (t * testing.T ) {
101
+ runMnistRayJobRayClusterAppWrapper (t , "cpu" , 0 )
102
+ }
103
+
104
+ func TestMnistRayJobRayClusterAppWrapperGpu (t * testing.T ) {
105
+ runMnistRayJobRayClusterAppWrapper (t , "gpu" , 1 )
106
+ }
107
+
91
108
// Same as TestMNISTRayJobRayCluster, except the RayCluster is wrapped in an AppWrapper
92
- func TestMNISTRayJobRayClusterAppWrapper (t * testing.T ) {
109
+ func runMnistRayJobRayClusterAppWrapper (t * testing.T , accelerator string , numberOfGpus int ) {
93
110
test := With (t )
94
- test .T ().Parallel ()
95
111
96
112
// Create a namespace and localqueue in that namespace
97
113
namespace := test .NewTestNamespace ()
@@ -104,7 +120,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
104
120
test .T ().Logf ("Created ConfigMap %s/%s successfully" , mnist .Namespace , mnist .Name )
105
121
106
122
// Create RayCluster, wrap in AppWrapper and assign to localqueue
107
- rayCluster := constructRayCluster (test , namespace , mnist )
123
+ rayCluster := constructRayCluster (test , namespace , mnist , numberOfGpus )
108
124
aw := & mcadv1beta2.AppWrapper {
109
125
TypeMeta : metav1.TypeMeta {
110
126
APIVersion : mcadv1beta2 .GroupVersion .String (),
@@ -140,7 +156,7 @@ func TestMNISTRayJobRayClusterAppWrapper(t *testing.T) {
140
156
Should (WithTransform (RayClusterState , Equal (rayv1 .Ready )))
141
157
142
158
// Create RayJob
143
- rayJob := constructRayJob (test , namespace , rayCluster )
159
+ rayJob := constructRayJob (test , namespace , rayCluster , accelerator , numberOfGpus )
144
160
rayJob , err = test .Client ().Ray ().RayV1 ().RayJobs (namespace .Name ).Create (test .Ctx (), rayJob , metav1.CreateOptions {})
145
161
test .Expect (err ).NotTo (HaveOccurred ())
146
162
test .T ().Logf ("Created RayJob %s/%s successfully" , rayJob .Namespace , rayJob .Name )
@@ -183,7 +199,7 @@ func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.Con
183
199
}
184
200
}
185
201
186
- func constructRayCluster (_ Test , namespace * corev1.Namespace , mnist * corev1.ConfigMap ) * rayv1.RayCluster {
202
+ func constructRayCluster (_ Test , namespace * corev1.Namespace , mnist * corev1.ConfigMap , numberOfGpus int ) * rayv1.RayCluster {
187
203
return & rayv1.RayCluster {
188
204
TypeMeta : metav1.TypeMeta {
189
205
APIVersion : rayv1 .GroupVersion .String (),
@@ -236,24 +252,6 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
236
252
corev1 .ResourceMemory : resource .MustParse ("2G" ),
237
253
},
238
254
},
239
- VolumeMounts : []corev1.VolumeMount {
240
- {
241
- Name : "mnist" ,
242
- MountPath : "/home/ray/jobs" ,
243
- },
244
- },
245
- },
246
- },
247
- Volumes : []corev1.Volume {
248
- {
249
- Name : "mnist" ,
250
- VolumeSource : corev1.VolumeSource {
251
- ConfigMap : & corev1.ConfigMapVolumeSource {
252
- LocalObjectReference : corev1.LocalObjectReference {
253
- Name : mnist .Name ,
254
- },
255
- },
256
- },
257
255
},
258
256
},
259
257
},
@@ -282,11 +280,31 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
282
280
Resources : corev1.ResourceRequirements {
283
281
Requests : corev1.ResourceList {
284
282
corev1 .ResourceCPU : resource .MustParse ("250m" ),
285
- corev1 .ResourceMemory : resource .MustParse ("256Mi" ),
283
+ corev1 .ResourceMemory : resource .MustParse ("1G" ),
284
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numberOfGpus )),
286
285
},
287
286
Limits : corev1.ResourceList {
288
- corev1 .ResourceCPU : resource .MustParse ("1" ),
289
- corev1 .ResourceMemory : resource .MustParse ("2G" ),
287
+ corev1 .ResourceCPU : resource .MustParse ("2" ),
288
+ corev1 .ResourceMemory : resource .MustParse ("4G" ),
289
+ "nvidia.com/gpu" : resource .MustParse (fmt .Sprint (numberOfGpus )),
290
+ },
291
+ },
292
+ VolumeMounts : []corev1.VolumeMount {
293
+ {
294
+ Name : "mnist" ,
295
+ MountPath : "/home/ray/jobs" ,
296
+ },
297
+ },
298
+ },
299
+ },
300
+ Volumes : []corev1.Volume {
301
+ {
302
+ Name : "mnist" ,
303
+ VolumeSource : corev1.VolumeSource {
304
+ ConfigMap : & corev1.ConfigMapVolumeSource {
305
+ LocalObjectReference : corev1.LocalObjectReference {
306
+ Name : mnist .Name ,
307
+ },
290
308
},
291
309
},
292
310
},
@@ -299,7 +317,7 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
299
317
}
300
318
}
301
319
302
- func constructRayJob (_ Test , namespace * corev1.Namespace , rayCluster * rayv1.RayCluster ) * rayv1.RayJob {
320
+ func constructRayJob (_ Test , namespace * corev1.Namespace , rayCluster * rayv1.RayCluster , accelerator string , numberOfGpus int ) * rayv1.RayJob {
303
321
return & rayv1.RayJob {
304
322
TypeMeta : metav1.TypeMeta {
305
323
APIVersion : rayv1 .GroupVersion .String (),
@@ -320,6 +338,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
320
338
MNIST_DATASET_URL: "` + GetMnistDatasetURL () + `"
321
339
PIP_INDEX_URL: "` + GetPipIndexURL () + `"
322
340
PIP_TRUSTED_HOST: "` + GetPipTrustedHost () + `"
341
+ ACCELERATOR: "` + accelerator + `"
323
342
` ,
324
343
ClusterSelector : map [string ]string {
325
344
RayJobDefaultClusterSelectorKey : rayCluster .Name ,
@@ -336,6 +355,9 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC
336
355
},
337
356
},
338
357
},
358
+ EntrypointNumCpus : 2 ,
359
+ // Using EntrypointNumGpus doesn't seem to work properly on KinD cluster with GPU, EntrypointNumCpus seems reliable
360
+ EntrypointNumGpus : float32 (numberOfGpus ),
339
361
},
340
362
}
341
363
}
0 commit comments