kubeflow · google-oss-prow · Apr 29, 2025 · Apr 8, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go
@@ -134,6 +134,47 @@ const (
 
 	// TorchEnvMasterPort is the env name for the master node port.
 	TorchEnvMasterPort string = "PET_MASTER_PORT"
+
+	// TorchTuneArgNumNodes is the arg name for the number of training nodes.
+	TorchTuneArgNumNodes string = "--nnodes"
+
+	// TorchTuneArgNumProcPerNode is the arg name for the number of procs per node (e.g. number of GPUs per Pod).
+	TorchTuneArgNumProcPerNode string = "--nproc_per_node"
+
+	// TorchTuneArgRdzvId is the arg name for the rendezvous ID.
+	TorchTuneArgRdzvId string = "--rdzv_id"
+
+	// TorchTuneArgRdzvEndpoint is the arg name for the rendezvous endpoint.
+	TorchTuneArgRdzvEndpoint string = "--rdzv_endpoint"
+
+	// TorchTuneFullFinetuneSingleDevice is the recipe for the single device full finetune.
+	TorchTuneFullFinetuneSingleDevice string = "full_finetune_single_device"
+
+	// TorchTuneFullFinetuneSingleDeviceConfigSuffix is the config suffix for the single device full finetune.
+	TorchTuneFullFinetuneSingleDeviceConfigSuffix string = "_full_single_device"
+
+	// TorchTuneFullFinetuneDistributed is the recipe for the distributed full finetune.
+	TorchTuneFullFinetuneDistributed string = "full_finetune_distributed"
+
+	// TorchTuneFullFinetuneMultiDevicesConfigSuffix is the config suffix for the single node distributed full finetune.
+	TorchTuneFullFinetuneMultiDevicesConfigSuffix string = "_full"
+
+	// TorchTuneFullFinetuneMultiNodesConfigSuffix is the config suffix for the multi node distributed full finetune.
+	TorchTuneFullFinetuneMultiNodesConfigSuffix string = "_full_multinode"
+
+	// TorchTuneDefaultRecipe is the default recipe for the torchtune.
+	TorchTuneDefaultRecipe string = TorchTuneFullFinetuneDistributed
+)
+
+const (
+	// MODEL_LLAMA3_2_1B is the model name for the Llama3.2 1B Instruct model; it is listed in TorchTuneSupportedPretrainedModels.
+	MODEL_LLAMA3_2_1B = "llama3_2/1B"
+
+	// MODEL_LLAMA3_2_7B is the model name for the Llama3.2 7B Instruct model. NOTE(review): confirm that a 7B variant of Llama3.2 exists.
+	MODEL_LLAMA3_2_7B = "llama3_2/7B"
+
+	// MODEL_LLAMA3_3_70B is the model name for the Llama3.3 70B Instruct model; it is listed in TorchTuneSupportedPretrainedModels.
+	MODEL_LLAMA3_3_70B = "llama3_3/70B"
 )
 
 var (
@@ -142,4 +183,13 @@ var (
 
 	// Torchrun reserved env names
 	TorchRunReservedEnvNames = sets.New(TorchEnvNumNodes, TorchEnvNumProcPerNode, TorchEnvNodeRank, TorchEnvMasterAddr, TorchEnvMasterPort)
+
+	// Currently supported TorchTune recipes.
+	TorchTuneSupportedRecipes = sets.New(TorchTuneFullFinetuneSingleDevice, TorchTuneFullFinetuneDistributed)
+
+	// Currently supported pretrained models for TorchTune Trainer.
+	TorchTuneSupportedPretrainedModels = sets.New(MODEL_LLAMA3_2_1B, MODEL_LLAMA3_2_7B, MODEL_LLAMA3_3_70B)
+
+	// TorchTuneEntrypoint is the entrypoint for the torchtune.
+	TorchTuneEntrypoint = []string{"tune", "run"}
 )
diff --git a/pkg/runtime/framework/plugins/torch/torch.go b/pkg/runtime/framework/plugins/torch/torch.go
@@ -19,6 +19,7 @@ package torch
 import (
 	"context"
 	"fmt"
+	"slices"
 	"strings"
 
 	corev1 "k8s.io/api/core/v1"
@@ -70,16 +71,30 @@ func (t *Torch) Validate(runtimeInfo *runtime.Info, _, newObj *trainer.TrainJob)
 			}
 		}
 
-		torchEnvs := sets.New[string]()
-		for _, env := range newObj.Spec.Trainer.Env {
-			if constants.TorchRunReservedEnvNames.Has(env.Name) {
-				torchEnvs.Insert(env.Name)
+		if !slices.Equal(newObj.Spec.Trainer.Command, constants.TorchTuneEntrypoint) {
+			// Check reserved envs for torchrun.
+			torchEnvs := sets.New[string]()
+			for _, env := range newObj.Spec.Trainer.Env {
+				if constants.TorchRunReservedEnvNames.Has(env.Name) {
+					torchEnvs.Insert(env.Name)
+				}
 			}
-		}
 
-		if torchEnvs.Len() > 0 {
-			trainerEnvsPath := specPath.Child("trainer").Child("env")
-			allErrs = append(allErrs, field.Invalid(trainerEnvsPath, newObj.Spec.Trainer.Env, fmt.Sprintf("must not have reserved envs, invalid envs configured: %v", sets.List(torchEnvs))))
+			if torchEnvs.Len() > 0 {
+				trainerEnvsPath := specPath.Child("trainer").Child("env")
+				allErrs = append(allErrs, field.Invalid(trainerEnvsPath, newObj.Spec.Trainer.Env, fmt.Sprintf("must not have reserved envs, invalid envs configured: %v", sets.List(torchEnvs))))
+			}
+		} else {
+			// Check supported pretrained models for torchtune.
+			// TODO(Electronic-Waste): Add more validation for torchtune when we support more arguments.
+			argPath := specPath.Child("trainer").Child("args")
+			model := getModelFromArgs(newObj.Spec.Trainer.Args)
+
+			if model == nil {
+				allErrs = append(allErrs, field.Invalid(argPath, newObj.Spec.Trainer.Args, "must specify a pretrained model"))
+			} else if !constants.TorchTuneSupportedPretrainedModels.Has(*model) {
+				allErrs = append(allErrs, field.Invalid(argPath, newObj.Spec.Trainer.Args, fmt.Sprintf("must have a supported pretrained model, invalid model configured: %v", *model)))
+			}
 		}
 	}
 
@@ -137,35 +152,75 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)
 	}
 
 	// Update envs for Info object.
-	// Add PyTorch distributed "PET_" values for torchrun
-	// TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other.
-	// Ref: https://github.com/kubeflow/trainer/pull/2308#discussion_r1823229940
 	var trainerContainer *runtime.Container
 	if trainJob.Spec.Trainer != nil {
 		if trainerContainer = info.FindContainerByPodSetAncestorContainerName(constants.AncestorTrainer, constants.Node); trainerContainer != nil {
 			apply.UpsertEnvVars(&trainerContainer.Env, apply.EnvVars(trainJob.Spec.Trainer.Env...)...)
 		}
 	}
 	if trainerContainer != nil {
-		apply.UpsertEnvVar(&trainerContainer.Env,
-			*corev1ac.EnvVar().
-				WithName(constants.TorchEnvNumNodes).
-				WithValue(fmt.Sprintf("%d", ptr.Deref(ptr.Deref(trainerPS, runtime.PodSet{}).Count, 1))),
-			*corev1ac.EnvVar().
-				WithName(constants.TorchEnvNumProcPerNode).
-				WithValue(numProcPerNode.String()),
-			*corev1ac.EnvVar().
-				WithName(constants.TorchEnvNodeRank).
-				WithValueFrom(corev1ac.EnvVarSource().
-					WithFieldRef(corev1ac.ObjectFieldSelector().
-						WithFieldPath(constants.JobCompletionIndexFieldPath))),
-			*corev1ac.EnvVar().
-				WithName(constants.TorchEnvMasterAddr).
-				WithValue(fmt.Sprintf("%s-%s-0-0.%s", trainJob.Name, constants.Node, trainJob.Name)),
-			*corev1ac.EnvVar().
-				WithName(constants.TorchEnvMasterPort).
-				WithValue(fmt.Sprintf("%d", constants.ContainerTrainerPort)),
-		)
+		if !slices.Equal(trainJob.Spec.Trainer.Command, constants.TorchTuneEntrypoint) {
+			// Add PyTorch distributed "PET_" values for torchrun.
+			// TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other.
+			// Ref: https://github.com/kubeflow/trainer/pull/2308#discussion_r1823229940
+			apply.UpsertEnvVar(&trainerContainer.Env,
+				*corev1ac.EnvVar().
+					WithName(constants.TorchEnvNumNodes).
+					WithValue(fmt.Sprintf("%d", ptr.Deref(ptr.Deref(trainerPS, runtime.PodSet{}).Count, 1))),
+				*corev1ac.EnvVar().
+					WithName(constants.TorchEnvNumProcPerNode).
+					WithValue(numProcPerNode.String()),
+				*corev1ac.EnvVar().
+					WithName(constants.TorchEnvNodeRank).
+					WithValueFrom(corev1ac.EnvVarSource().
+						WithFieldRef(corev1ac.ObjectFieldSelector().
+							WithFieldPath(constants.JobCompletionIndexFieldPath))),
+				*corev1ac.EnvVar().
+					WithName(constants.TorchEnvMasterAddr).
+					WithValue(fmt.Sprintf("%s-%s-0-0.%s", trainJob.Name, constants.Node, trainJob.Name)),
+				*corev1ac.EnvVar().
+					WithName(constants.TorchEnvMasterPort).
+					WithValue(fmt.Sprintf("%d", constants.ContainerTrainerPort)),
+			)
+		} else {
+			// Mutate command line args for torchtune.
+			// Ref: https://github.com/kubeflow/trainer/tree/master/docs/proposals/2401-llm-trainer-v2#complement-torch-plugin
+			oldArgs, newArgs := trainJob.Spec.Trainer.Args, []string{}
+
+			// 1. Add PyTorch distributed command line args for torchtune.
+			// TODO(Electronic-Waste): Add more args for torchtune if required.
+			numNodes := ptr.Deref(ptr.Deref(trainerPS, runtime.PodSet{}).Count, 1)
+			newArgs = append(newArgs,
+				fmt.Sprintf("%s %d",
+					constants.TorchTuneArgNumNodes,
+					numNodes,
+				),
+				fmt.Sprintf("%s %s",
+					constants.TorchTuneArgNumProcPerNode,
+					numProcPerNode.String(),
+				),
+				fmt.Sprintf("%s %s",
+					constants.TorchTuneArgRdzvId,
+					trainJob.Name,
+				),
+				fmt.Sprintf("%s %s-%s-0-0.%s:%d",
+					constants.TorchTuneArgRdzvEndpoint,
+					trainJob.Name, constants.Node, trainJob.Name, constants.ContainerTrainerPort,
+				),
+			)
+
+			// 2. Get the recipe and config from old args and append them to new args.
+			recipe := getRecipeFromArgs(numNodes, numProcPerNode, oldArgs)
+			config := getConfigFileFromArgs(numNodes, recipe, oldArgs)
+			newArgs = append(newArgs, recipe, fmt.Sprintf("--config %s", config))
+
+			// 3. Reserve old arguments to override corresponding items in the config file.
+			newArgs = append(newArgs, slices.DeleteFunc(oldArgs, func(arg string) bool {
+				return strings.HasPrefix(arg, "model")
+			})...)
+
+			trainerContainer.Args = newArgs
+		}
 		// Add container port for the headless service.
 		apply.UpsertPort(&trainerContainer.Ports, *corev1ac.ContainerPort().WithContainerPort(constants.ContainerTrainerPort))
 	}
@@ -188,3 +243,42 @@ func calculateNumProcPerNode(
 	}
 	return intstr.FromInt32(defaultCPU), false
 }
+
+// getRecipeFromArgs returns the single device recipe for one node with an integer numProcPerNode
+// of 1, and the default recipe otherwise. TODO(Electronic-Waste): Add support for more recipes.
+func getRecipeFromArgs(numNodes int32, numProcPerNode intstr.IntOrString, _ []string) string {
+	oneProc := numProcPerNode.Type == intstr.Int && numProcPerNode.IntVal == 1
+	if oneProc && numNodes == 1 {
+		return constants.TorchTuneFullFinetuneSingleDevice
+	}
+	return constants.TorchTuneDefaultRecipe
+}
+
+// getConfigFileFromArgs returns the config file name "<model><suffix>.yaml", with the suffix chosen
+// from the recipe and the number of nodes, or "" when getModelFromArgs finds no model in args.
+func getConfigFileFromArgs(numNodes int32, recipe string, args []string) string {
+	var suffix string // Stays empty for a recipe that has no config suffix defined here.
+	switch {
+	case recipe == constants.TorchTuneFullFinetuneSingleDevice:
+		suffix = constants.TorchTuneFullFinetuneSingleDeviceConfigSuffix
+	case recipe == constants.TorchTuneFullFinetuneDistributed && numNodes == 1:
+		suffix = constants.TorchTuneFullFinetuneMultiDevicesConfigSuffix
+	case recipe == constants.TorchTuneFullFinetuneDistributed:
+		suffix = constants.TorchTuneFullFinetuneMultiNodesConfigSuffix
+	}
+	if model := getModelFromArgs(args); model != nil {
+		return fmt.Sprintf("%s%s.yaml", *model, suffix)
+	}
+	return "" // No model argument: report an empty name instead of dereferencing a nil pointer.
+}
+
+// getModelFromArgs returns a pointer to the text after the first "=" of the first argument that
+// starts with "model" and contains "=", or nil if there is none. It never panics on a missing "=".
+func getModelFromArgs(args []string) *string {
+	for _, arg := range args {
+		if _, value, ok := strings.Cut(arg, "="); ok && strings.HasPrefix(arg, "model") {
+			return &value
+		}
+	}
+	return nil
+}