mahdikhashan
diff --git a/‎examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb‎
Lines changed: 475 additions & 411 deletions b/‎examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb‎
Lines changed: 475 additions & 411 deletions
diff --git a/‎examples/mlx/image-classification/mnist.ipynb‎
Lines changed: 346 additions & 398 deletions b/‎examples/mlx/image-classification/mnist.ipynb‎
Lines changed: 346 additions & 398 deletions
diff --git a/‎examples/mlx/language-modeling/fine-tune-llama.ipynb‎
Lines changed: 143 additions & 182 deletions b/‎examples/mlx/language-modeling/fine-tune-llama.ipynb‎
Lines changed: 143 additions & 182 deletions
diff --git a/‎pkg/runtime/framework/core/framework_test.go‎
Lines changed: 834 additions & 2 deletions b/‎pkg/runtime/framework/core/framework_test.go‎
Lines changed: 834 additions & 2 deletions
diff --git a/‎pkg/runtime/framework/plugins/jobset/builder.go‎
Lines changed: 22 additions & 7 deletions b/‎pkg/runtime/framework/plugins/jobset/builder.go‎
Lines changed: 22 additions & 7 deletions
diff --git a/‎pkg/runtime/framework/plugins/mpi/mpi.go‎
Lines changed: 9 additions & 2 deletions b/‎pkg/runtime/framework/plugins/mpi/mpi.go‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎pkg/runtime/framework/plugins/mpi/mpi_test.go‎
Lines changed: 138 additions & 0 deletions b/‎pkg/runtime/framework/plugins/mpi/mpi_test.go‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎pkg/runtime/framework/plugins/torch/torch.go‎
Lines changed: 2 additions & 51 deletions b/‎pkg/runtime/framework/plugins/torch/torch.go‎
Lines changed: 2 additions & 51 deletions
diff --git a/‎pkg/runtime/framework/plugins/torch/torchtune.go‎
Lines changed: 2 additions & 2 deletions b/‎pkg/runtime/framework/plugins/torch/torchtune.go‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎pkg/runtime/runtime.go‎
Lines changed: 50 additions & 0 deletions b/‎pkg/runtime/runtime.go‎
Lines changed: 50 additions & 0 deletions
@@ -101,21 +101,26 @@ func (b *Builder) Initializer(trainJob *trainer.TrainJob) *Builder {
 	return b
 }
 
+// isRunLauncherAsNode reports whether the runtime's MPI policy has runLauncherAsNode
+// explicitly set to true. A missing ML policy source, a missing MPI policy, or an
+// unset flag all yield false.
+func (b *Builder) isRunLauncherAsNode(info *runtime.Info) bool {
+	policySource := info.RuntimePolicy.MLPolicySource
+	if policySource == nil || policySource.MPI == nil {
+		return false
+	}
+	// An unset (nil) flag is treated as false.
+	return ptr.Deref(policySource.MPI.RunLauncherAsNode, false)
+}
+
 // Trainer updates JobSet values for the trainer Job.
 func (b *Builder) Trainer(info *runtime.Info, trainJob *trainer.TrainJob) *Builder {
 	for i, rJob := range b.Spec.ReplicatedJobs {
+		ancestor := ""
 		jobMetadata := rJob.Template.ObjectMetaApplyConfiguration
-		if jobMetadata == nil || jobMetadata.Labels == nil {
-			continue
+		if jobMetadata != nil && jobMetadata.Labels != nil {
+			ancestor = jobMetadata.Labels[constants.LabelTrainJobAncestor]
 		}
-		if ancestor, ok := jobMetadata.Labels[constants.LabelTrainJobAncestor]; ok && ancestor == constants.AncestorTrainer {
+		if ancestor == constants.AncestorTrainer {
 			// TODO: Support multiple replicas ('.template.spec.replicatedJobs[*].replicas') for replicated Jobs.
 			// REF: https://github.com/kubeflow/trainer/issues/2318
 			b.Spec.ReplicatedJobs[i].Replicas = ptr.To[int32](1)
-			// Update the Parallelism and Completions values for the Trainer Job.
-			b.Spec.ReplicatedJobs[i].Template.Spec.Parallelism = info.FindPodSetByAncestor(constants.AncestorTrainer).Count
-			b.Spec.ReplicatedJobs[i].Template.Spec.Completions = info.FindPodSetByAncestor(constants.AncestorTrainer).Count
-
 			// Update values for the Trainer container.
 			for j, container := range rJob.Template.Spec.Template.Spec.Containers {
 				if *container.Name == constants.Node {
@@ -130,6 +135,16 @@ func (b *Builder) Trainer(info *runtime.Info, trainJob *trainer.TrainJob) *Build
 						if args := jobTrainer.Args; args != nil {
 							b.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.Containers[j].Args = args
 						}
+					}
+				}
+			}
+		}
+		if ancestor == constants.AncestorTrainer || b.isRunLauncherAsNode(info) && *rJob.Name == constants.Node {
+			// TODO (andreyvelich): For MPI, we should also apply container resources to the Node ReplicatedJob.
+			// Eventually, we should find a better way to propagate resources from the TrainJob to the JobSet.
+			for j, container := range rJob.Template.Spec.Template.Spec.Containers {
+				if *container.Name == constants.Node {
+					if jobTrainer := trainJob.Spec.Trainer; jobTrainer != nil {
 						if resourcesPerNode := jobTrainer.ResourcesPerNode; resourcesPerNode != nil &&
 							(resourcesPerNode.Limits != nil || resourcesPerNode.Requests != nil) {
 							requirements := corev1ac.ResourceRequirements()
 
@@ -113,8 +113,6 @@ func (m *MPI) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob) er
 	if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumNodes != nil {
 		if node := info.FindPodSetByName(constants.Node); node != nil && node.Count != nil {
 			if ptr.Deref(info.RuntimePolicy.MLPolicySource.MPI.RunLauncherAsNode, false) {
-				// TODO: We should implement more strong validations for the MPIRuntime with runLauncherAsNode.
-				// REF: https://github.com/kubeflow/trainer/issues/2550
 				// When runLauncherAsNode is enabled, one node should be allocated to the launcher.
 				*node.Count = max(*trainJob.Spec.Trainer.NumNodes-1, 1)
 			} else {
@@ -125,6 +123,15 @@ func (m *MPI) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob) er
 
 	if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumProcPerNode != nil {
 		info.RuntimePolicy.MLPolicySource.MPI.NumProcPerNode = ptr.To(int32(trainJob.Spec.Trainer.NumProcPerNode.IntValue()))
+		// If numProcPerNode is set to 1 in runtime, we make it equal to number of GPUs.
+	} else if *info.RuntimePolicy.MLPolicySource.MPI.NumProcPerNode == 1 {
+		resourcesPerNode := ptr.Deref(runtime.ExtractResourcePerNodeFromRuntime(info), corev1.ResourceRequirements{})
+		if jobTrainer := trainJob.Spec.Trainer; jobTrainer != nil && jobTrainer.ResourcesPerNode != nil {
+			resourcesPerNode = ptr.Deref(jobTrainer.ResourcesPerNode, corev1.ResourceRequirements{})
+		}
+		if gpuQ := runtime.GetNumGPUPerNode(&resourcesPerNode); gpuQ > 1 {
+			info.RuntimePolicy.MLPolicySource.MPI.NumProcPerNode = ptr.To(int32(gpuQ))
+		}
 	}
 
 	// Add Secret and ConfigMap volumes to the Info object
 
@@ -26,6 +26,7 @@ import (
 	gocmp "github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	apiruntime "k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -350,6 +351,143 @@ trainJob-node-1-1.trainJob slots=1
 				utiltesting.MakeConfigMapWrapper(fmt.Sprintf("trainJob%s", constants.MPIHostfileConfigMapSuffix), metav1.NamespaceDefault).
 					WithData(map[string]string{
 						constants.MPIHostfileName: `trainJob-node-1-0.trainJob slots=2
+`,
+					}).
+					ControllerReference(trainer.SchemeGroupVersion.WithKind(trainer.TrainJobKind), "trainJob", "trainJob").
+					Obj(),
+			},
+		},
+		"numProcPerNode is set to number of GPUs in TrainJob": {
+			info: &runtime.Info{
+				Labels:      make(map[string]string),
+				Annotations: make(map[string]string),
+				TemplateSpec: runtime.TemplateSpec{
+					PodSets: []runtime.PodSet{
+						{
+							Name:  constants.Launcher,
+							Count: ptr.To[int32](1),
+							Endpoints: func(yield func(string) bool) {
+								yield("trainJob-launcher-0-0.trainJob")
+							},
+						},
+						{
+							Name:     constants.Node,
+							Ancestor: ptr.To(constants.AncestorTrainer),
+							Count:    ptr.To[int32](1),
+							Endpoints: func(yield func(string) bool) {
+								yield("trainJob-node-1-0.trainJob")
+							},
+						},
+					},
+				},
+				RuntimePolicy: runtime.RuntimePolicy{
+					MLPolicySource: utiltesting.MakeMLPolicySourceWrapper().
+						MPIPolicy(ptr.To[int32](1), trainer.MPIImplementationOpenMPI, ptr.To("/root/.ssh"), nil).
+						Obj(),
+				},
+				Scheduler: &runtime.Scheduler{
+					PodLabels: make(map[string]string),
+				},
+			},
+			trainJob: utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "trainJob").
+				UID("trainJob").
+				Trainer(
+					utiltesting.MakeTrainJobTrainerWrapper().
+						NumNodes(1).
+						Container("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, corev1.ResourceList{
+							"custom.com/gpu": resource.MustParse("5"),
+						}).
+						Obj()).
+				Obj(),
+			wantInfo: &runtime.Info{
+				Labels:      make(map[string]string),
+				Annotations: make(map[string]string),
+				TemplateSpec: runtime.TemplateSpec{
+					PodSets: []runtime.PodSet{
+						{
+							Name:  constants.Launcher,
+							Count: ptr.To[int32](1),
+							Volumes: []corev1ac.VolumeApplyConfiguration{
+								*corev1ac.Volume().
+									WithName(constants.MPISSHAuthVolumeName).
+									WithSecret(corev1ac.SecretVolumeSource().
+										WithSecretName(fmt.Sprintf("trainJob%s", constants.MPISSHAuthSecretSuffix)).
+										WithItems(
+											corev1ac.KeyToPath().
+												WithKey(corev1.SSHAuthPrivateKey).
+												WithPath(constants.MPISSHPrivateKeyFile),
+											corev1ac.KeyToPath().
+												WithKey(constants.MPISSHPublicKey).
+												WithPath(constants.MPISSHPublicKeyFile),
+											corev1ac.KeyToPath().
+												WithKey(constants.MPISSHPublicKey).
+												WithPath(constants.MPISSHAuthorizedKeys),
+										),
+									),
+								*corev1ac.Volume().
+									WithName(constants.MPIHostfileVolumeName).
+									WithConfigMap(corev1ac.ConfigMapVolumeSource().
+										WithName(fmt.Sprintf("trainJob%s", constants.MPIHostfileConfigMapSuffix)).
+										WithItems(
+											corev1ac.KeyToPath().
+												WithKey(constants.MPIHostfileName).
+												WithPath(constants.MPIHostfileName).
+												WithMode(0444),
+										),
+									),
+							},
+							Endpoints: func(yield func(string) bool) {
+								yield("trainJob-launcher-0-0.trainJob")
+							},
+						},
+						{
+							Name:     constants.Node,
+							Ancestor: ptr.To(constants.AncestorTrainer),
+							Count:    ptr.To[int32](1),
+							Volumes: []corev1ac.VolumeApplyConfiguration{
+								*corev1ac.Volume().
+									WithName(constants.MPISSHAuthVolumeName).
+									WithSecret(corev1ac.SecretVolumeSource().
+										WithSecretName(fmt.Sprintf("trainJob%s", constants.MPISSHAuthSecretSuffix)).
+										WithItems(
+											corev1ac.KeyToPath().
+												WithKey(corev1.SSHAuthPrivateKey).
+												WithPath(constants.MPISSHPrivateKeyFile),
+											corev1ac.KeyToPath().
+												WithKey(constants.MPISSHPublicKey).
+												WithPath(constants.MPISSHPublicKeyFile),
+											corev1ac.KeyToPath().
+												WithKey(constants.MPISSHPublicKey).
+												WithPath(constants.MPISSHAuthorizedKeys),
+										),
+									),
+							},
+							Endpoints: func(yield func(string) bool) {
+								yield("trainJob-node-1-0.trainJob")
+							},
+						},
+					},
+				},
+				RuntimePolicy: runtime.RuntimePolicy{
+					MLPolicySource: utiltesting.MakeMLPolicySourceWrapper().
+						MPIPolicy(ptr.To[int32](5), trainer.MPIImplementationOpenMPI, ptr.To("/root/.ssh"), nil).
+						Obj(),
+				},
+				Scheduler: &runtime.Scheduler{PodLabels: make(map[string]string)},
+			},
+			wantObjs: []apiruntime.Object{
+				utiltesting.MakeSecretWrapper(fmt.Sprintf("trainJob%s", constants.MPISSHAuthSecretSuffix), metav1.NamespaceDefault).
+					WithImmutable(true).
+					WithType(corev1.SecretTypeSSHAuth).
+					WithData(map[string][]byte{
+						constants.MPISSHPublicKey: []byte("EXIST"),
+						corev1.SSHAuthPrivateKey:  []byte("EXIST"),
+					}).
+					ControllerReference(trainer.SchemeGroupVersion.WithKind(trainer.TrainJobKind), "trainJob", "trainJob").
+					Obj(),
+				utiltesting.MakeConfigMapWrapper(fmt.Sprintf("trainJob%s", constants.MPIHostfileConfigMapSuffix), metav1.NamespaceDefault).
+					WithData(map[string]string{
+						constants.MPIHostfileName: `trainJob-node-1-0.trainJob slots=5
 `,
 					}).
 					ControllerReference(trainer.SchemeGroupVersion.WithKind(trainer.TrainJobKind), "trainJob", "trainJob").
 
@@ -20,7 +20,6 @@ import (
 	"context"
 	"fmt"
 	"slices"
-	"strings"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -30,7 +29,6 @@ import (
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
-	jobsetv1alpha2ac "sigs.k8s.io/jobset/client-go/applyconfiguration/jobset/v1alpha2"
 
 	trainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
 	"github.com/kubeflow/trainer/v2/pkg/apply"
@@ -113,11 +111,11 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)
 	}
 
 	// Determine numProcPerNode based on the resourcesPerNode.
-	resourcesPerNode := ptr.Deref(extractResourcePerNodeFromRuntime(info), corev1.ResourceRequirements{})
+	resourcesPerNode := ptr.Deref(runtime.ExtractResourcePerNodeFromRuntime(info), corev1.ResourceRequirements{})
 	if jobTrainer := trainJob.Spec.Trainer; jobTrainer != nil && jobTrainer.ResourcesPerNode != nil {
 		resourcesPerNode = ptr.Deref(jobTrainer.ResourcesPerNode, corev1.ResourceRequirements{})
 	}
-	gpuQ := getNumGPUPerNode(&resourcesPerNode)
+	gpuQ := runtime.GetNumGPUPerNode(&resourcesPerNode)
 	// If numProcPerNode is "cpu" or no GPU is set in resource, we calculate numProcPerNode based on CPU.
 	if numProcPerNode.String() == "cpu" || numProcPerNode.String() == "auto" && gpuQ == 0 {
 		numProcPerNode = intstr.FromInt(max(1, getNumCPUPerNode(&resourcesPerNode)))
@@ -204,50 +202,3 @@ func getNumCPUPerNode(res *corev1.ResourceRequirements) int {
 	}
 	return int(requestCpuQ.Value())
 }
-
-// getNumGPUPerNode returns the GPU count if found.
-func getNumGPUPerNode(res *corev1.ResourceRequirements) int {
-	if res == nil {
-		return 0
-	}
-	gpuQ := numGPU(res.Requests)
-	if limitGpuQ := numGPU(res.Limits); gpuQ == 0 && limitGpuQ > 0 {
-		gpuQ = limitGpuQ
-	}
-	return gpuQ
-}
-
-func numGPU(resourcePerNode corev1.ResourceList) int {
-	for resName, resQ := range resourcePerNode {
-		if strings.Contains(strings.ToLower(resName.String()), "gpu") {
-			return int(resQ.Value())
-		}
-	}
-	return 0
-}
-
-// extractResourcePerNodeFromRuntime extracts the resource per node from the Trainer Node.
-func extractResourcePerNodeFromRuntime(info *runtime.Info) *corev1.ResourceRequirements {
-	if jobSetSpec, ok := runtime.TemplateSpecApply[jobsetv1alpha2ac.JobSetSpecApplyConfiguration](info); ok {
-		for _, rJob := range jobSetSpec.ReplicatedJobs {
-			if rJob.Name != nil && *rJob.Name == constants.Node || rJob.Template.Labels[constants.LabelTrainJobAncestor] == constants.AncestorTrainer {
-				for _, container := range rJob.Template.Spec.Template.Spec.Containers {
-					if container.Name != nil && *container.Name == constants.Node && container.Resources != nil {
-						res := &corev1.ResourceRequirements{
-							Limits:   corev1.ResourceList{},
-							Requests: corev1.ResourceList{},
-						}
-						if container.Resources.Limits != nil {
-							res.Limits = *container.Resources.Limits
-						}
-						if container.Resources.Requests != nil {
-							res.Requests = *container.Resources.Requests
-						}
-						return res
-					}
-				}
-			}
-		}
-	}
-	return nil
-}
@@ -51,11 +51,11 @@ func validateTorchTune(runtimeInfo *runtime.Info, newObj *trainer.TrainJob) (adm
 
 	numProcPerNodeRefPath := specPath.Child("trainer").Child("numProcPerNode")
 	numProcPerNode := *newObj.Spec.Trainer.NumProcPerNode
-	resourcesPerNode := ptr.Deref(extractResourcePerNodeFromRuntime(runtimeInfo), corev1.ResourceRequirements{})
+	resourcesPerNode := ptr.Deref(runtime.ExtractResourcePerNodeFromRuntime(runtimeInfo), corev1.ResourceRequirements{})
 	if jobTrainer := newObj.Spec.Trainer; jobTrainer != nil && jobTrainer.ResourcesPerNode != nil {
 		resourcesPerNode = ptr.Deref(jobTrainer.ResourcesPerNode, corev1.ResourceRequirements{})
 	}
-	_, config := getRecipeAndConfig(numNodes, numProcPerNode, getNumGPUPerNode(&resourcesPerNode), newObj)
+	_, config := getRecipeAndConfig(numNodes, numProcPerNode, runtime.GetNumGPUPerNode(&resourcesPerNode), newObj)
 	if strings.Contains(config, constants.TorchTuneQLoRAFinetuneDistributedConfigSuffix) {
 		if model == constants.TORCHTUNE_MODEL_QWEN2_5_1_5B {
 			allErrs = append(allErrs, field.Invalid(runtimeRefNamePath, newObj.Spec.RuntimeRef.Name, fmt.Sprintf("QLoRA is not supported for %v model", model)))
 
@@ -20,14 +20,17 @@ import (
 	"iter"
 	"maps"
 	"slices"
+	"strings"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
 	resourcehelpers "k8s.io/component-helpers/resource"
 	"k8s.io/utils/ptr"
+	jobsetv1alpha2ac "sigs.k8s.io/jobset/client-go/applyconfiguration/jobset/v1alpha2"
 
 	trainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
+	"github.com/kubeflow/trainer/v2/pkg/constants"
 )
 
 var (
@@ -240,3 +243,50 @@ func RuntimeRefToRuntimeRegistryKey(runtimeRef trainer.RuntimeRef) string {
 		Kind:  ptr.Deref(runtimeRef.Kind, ""),
 	}.String()
 }
+
+// ExtractResourcePerNodeFromRuntime extracts the Trainer resource per node from the Info object.
+//
+// It walks the JobSet ReplicatedJobs and considers a job when it is named constants.Node
+// or carries the trainer ancestor label. From the first such job that has a container named
+// constants.Node with resources set, it returns those resources; Limits and Requests default
+// to empty lists when the container does not set them. It returns nil when TemplateSpecApply
+// does not yield a JobSet spec or when no matching container declares resources.
+func ExtractResourcePerNodeFromRuntime(info *Info) *corev1.ResourceRequirements {
+	jobSetSpec, ok := TemplateSpecApply[jobsetv1alpha2ac.JobSetSpecApplyConfiguration](info)
+	if !ok {
+		return nil
+	}
+	for _, rJob := range jobSetSpec.ReplicatedJobs {
+		if rJob.Name == nil || *rJob.Name != constants.Node {
+			// The Job template metadata is an embedded pointer that may be nil; reading
+			// Labels through it without this guard would panic for jobs without metadata.
+			jobMetadata := rJob.Template.ObjectMetaApplyConfiguration
+			if jobMetadata == nil || jobMetadata.Labels[constants.LabelTrainJobAncestor] != constants.AncestorTrainer {
+				continue
+			}
+		}
+		for _, container := range rJob.Template.Spec.Template.Spec.Containers {
+			if container.Name == nil || *container.Name != constants.Node || container.Resources == nil {
+				continue
+			}
+			res := &corev1.ResourceRequirements{
+				Limits:   corev1.ResourceList{},
+				Requests: corev1.ResourceList{},
+			}
+			if container.Resources.Limits != nil {
+				res.Limits = *container.Resources.Limits
+			}
+			if container.Resources.Requests != nil {
+				res.Requests = *container.Resources.Requests
+			}
+			return res
+		}
+	}
+	return nil
+}
+
+// GetNumGPUPerNode returns the GPU count found in the container resources.
+// A non-zero count from Requests takes precedence; otherwise a positive count
+// from Limits is used. It returns 0 for a nil argument or when neither list
+// yields a usable count.
+func GetNumGPUPerNode(res *corev1.ResourceRequirements) int {
+	if res == nil {
+		return 0
+	}
+	if fromRequests := numGPU(res.Requests); fromRequests != 0 {
+		return fromRequests
+	}
+	// Requests gave nothing: fall back to Limits, but only for a positive count.
+	if fromLimits := numGPU(res.Limits); fromLimits > 0 {
+		return fromLimits
+	}
+	return 0
+}
+
+// numGPU returns the quantity of a resource whose lower-cased name contains "gpu",
+// or 0 when the list has no such resource.
+func numGPU(resourcePerNode corev1.ResourceList) int {
+	for name, quantity := range resourcePerNode {
+		lowered := strings.ToLower(name.String())
+		if !strings.Contains(lowered, "gpu") {
+			continue
+		}
+		return int(quantity.Value())
+	}
+	return 0
+}