Skip to content

Commit a9dd628

Browse files
benieric
pintaoz-aws authored and committed
Update kandinsky in ModelTrainer and allow setting requirements (#1587)
1 parent fa02963 commit a9dd628

File tree

5 files changed

+259
-31
lines changed

5 files changed

+259
-31
lines changed

src/sagemaker/modules/train/model_trainer.py

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
EXECUTE_BASIC_SCRIPT_DRIVER,
8888
)
8989
from sagemaker.modules import logger
90-
from sagemaker.modules.train.sm_recipes.utils import get_args_from_recipe, _determine_device_type
90+
from sagemaker.modules.train.sm_recipes.utils import _get_args_from_recipe, _determine_device_type
9191

9292

9393
class Mode(Enum):
@@ -154,7 +154,7 @@ class ModelTrainer(BaseModel):
154154
see: https://docs.aws.amazon.com/sagemaker/latest/dg-ecr-paths/sagemaker-algo-docker-registry-paths
155155
training_image_config (Optional[TrainingImageConfig]):
156156
Training image Config. This is the configuration to use an image from a private
157-
Docker registry for a traininob.
157+
Docker registry for a training job.
158158
output_data_config (Optional[OutputDataConfig]):
159159
The output data configuration. This is used to specify the output data location
160160
for the training job.
@@ -481,10 +481,6 @@ def train(
481481
)
482482
self._latest_training_job = training_job
483483

484-
# Clean up the temporary directory if it exists
485-
if self._temp_recipe_train_dir is not None:
486-
self._temp_recipe_train_dir.cleanup()
487-
488484
if wait:
489485
training_job.wait(logs=logs)
490486
if logs and not wait:
@@ -816,11 +812,18 @@ def from_recipe(
816812
training_recipe: str,
817813
compute: Compute,
818814
recipe_overrides: Optional[Dict[str, Any]] = None,
815+
requirements: Optional[str] = None,
819816
training_image: Optional[str] = None,
817+
training_image_config: Optional[TrainingImageConfig] = None,
818+
output_data_config: Optional[OutputDataConfig] = None,
819+
input_data_config: Optional[List[Union[Channel, InputData]]] = None,
820+
checkpoint_config: Optional[CheckpointConfig] = None,
821+
training_input_mode: Optional[str] = "File",
822+
environment: Optional[Dict[str, str]] = None,
823+
tags: Optional[List[Tag]] = None,
820824
session: Optional[Session] = None,
821825
role: Optional[str] = None,
822826
base_job_name: Optional[str] = None,
823-
**kwargs,
824827
) -> "ModelTrainer":
825828
"""Create a ModelTrainer from a training recipe.
826829
@@ -833,9 +836,33 @@ def from_recipe(
833836
the training job. If not specified, will default to 1 instance of ml.m5.xlarge.
834837
recipe_overrides (Optional[Dict[str, Any]]):
835838
The recipe overrides. This is used to override the default recipe parameters.
839+
requirements (Optional[str]):
840+
The path to a requirements file to install in the training job container.
836841
training_image (Optional[str]):
837842
The training image URI to use for the training job container. If not specified,
838843
the training image will be determined from the recipe.
844+
training_image_config (Optional[TrainingImageConfig]):
845+
Training image Config. This is the configuration to use an image from a private
846+
Docker registry for a training job.
847+
output_data_config (Optional[OutputDataConfig]):
848+
The output data configuration. This is used to specify the output data location
849+
for the training job.
850+
If not specified, will default to `s3://<default_bucket>/<base_job_name>/output/`.
851+
input_data_config (Optional[List[Union[Channel, InputData]]]):
852+
The input data config for the training job.
853+
Takes a list of Channel or InputData objects. An InputDataSource can be an S3 URI
854+
string, local file path string, S3DataSource object, or FileSystemDataSource object.
855+
checkpoint_config (Optional[CheckpointConfig]):
856+
Contains information about the output location for managed spot training checkpoint
857+
data.
858+
training_input_mode (Optional[str]):
859+
The input mode for the training job. Valid values are "Pipe", "File", "FastFile".
860+
Defaults to "File".
861+
environment (Optional[Dict[str, str]]):
862+
The environment variables for the training job.
863+
tags (Optional[List[Tag]]):
864+
An array of key-value pairs. You can use tags to categorize your AWS resources
865+
in different ways, for example, by purpose, owner, or environment.
839866
session (Optional[Session]):
840867
The SageMaker session.
841868
If not specified, a new session will be created.
@@ -846,9 +873,6 @@ def from_recipe(
846873
The base name for the training job.
847874
If not specified, a default name will be generated using the algorithm name
848875
or training image.
849-
kwargs:
850-
Additional keyword arguments to pass to the ModelTrainer constructor.
851-
852876
"""
853877
if compute.instance_type is None:
854878
raise ValueError(
@@ -865,20 +889,37 @@ def from_recipe(
865889
session = Session()
866890
logger.warning("Session not provided. Using default Session.")
867891
if role is None:
868-
role = get_execution_role()
892+
role = get_execution_role(sagemaker_session=session)
869893
logger.warning(f"Role not provided. Using default role:\n{role}")
870894

871-
model_trainer_args, recipe_train_dir = get_args_from_recipe(
895+
# The training recipe is used to prepare the following args:
896+
# - source_code
897+
# - training_image
898+
# - distributed_runner
899+
# - compute
900+
# - hyperparameters
901+
model_trainer_args, recipe_train_dir = _get_args_from_recipe(
872902
training_recipe=training_recipe,
873903
recipe_overrides=recipe_overrides,
904+
requirements=requirements,
874905
compute=compute,
875-
session=session,
906+
region_name=session.boto_region_name,
876907
)
877908
if training_image is not None:
878909
model_trainer_args["training_image"] = training_image
879910

880911
model_trainer = cls(
881-
session=session, role=role, base_job_name=base_job_name, **model_trainer_args, **kwargs
912+
session=session,
913+
role=role,
914+
base_job_name=base_job_name,
915+
training_image_config=training_image_config,
916+
output_data_config=output_data_config,
917+
input_data_config=input_data_config,
918+
checkpoint_config=checkpoint_config,
919+
training_input_mode=training_input_mode,
920+
environment=environment,
921+
tags=tags,
922+
**model_trainer_args,
882923
)
883924

884925
model_trainer._temp_recipe_train_dir = recipe_train_dir

src/sagemaker/modules/train/sm_recipes/training_recipes.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"adapter_repo": "[email protected]-adapter:benieric/private-sagemaker-hyperpod-training-adapter-for-nemo-staging.git",
3-
"launcher_repo": "[email protected]-launcher:benieric/private-sagemaker-hyperpod-recipes-staging.git",
2+
"adapter_repo": "[email protected]:aws/private-sagemaker-hyperpod-training-adapter-for-nemo-staging.git",
3+
"launcher_repo": "[email protected]:aws/private-sagemaker-hyperpod-recipes-staging.git",
44
"neuron_dist_repo": "https://github.com/aws-neuron/neuronx-distributed-training.git",
55
"gpu_image" : {
66
"framework": "pytorch-smp",

src/sagemaker/modules/train/sm_recipes/utils.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from omegaconf import OmegaConf, dictconfig
2626

2727
from sagemaker.image_uris import retrieve
28-
from sagemaker import Session
2928

3029
from sagemaker.modules import logger
3130
from sagemaker.modules.utils import _run_clone_command_silent
@@ -66,7 +65,7 @@ def _load_recipes_cfg() -> str:
6665

6766
def _load_base_recipe(
6867
training_recipe: str,
69-
recipe_overrides: Optional[Dict[str, Any]],
68+
recipe_overrides: Optional[Dict[str, Any]] = None,
7069
training_recipes_cfg: Optional[Dict[str, Any]] = None,
7170
) -> Dict[str, Any]:
7271
"""Load recipe and apply overrides."""
@@ -195,7 +194,6 @@ def _configure_trainium_args(
195194

196195
_run_clone_command_silent(training_recipes_cfg.get("neuron_dist_repo"), recipe_train_dir.name)
197196

198-
# Set SourceCodeConfig
199197
source_code.source_dir = os.path.join(recipe_train_dir.name, "examples")
200198
source_code.entry_script = "training_orchestrator.py"
201199
neuron_image_cfg = training_recipes_cfg.get("neuron_image")
@@ -220,11 +218,12 @@ def _configure_trainium_args(
220218
return args
221219

222220

223-
def get_args_from_recipe(
221+
def _get_args_from_recipe(
224222
training_recipe: str,
225223
compute: Compute,
226-
session: Session,
224+
region_name: str,
227225
recipe_overrides: Optional[Dict[str, Any]],
226+
requirements: Optional[str],
228227
) -> Tuple[Dict[str, Any], tempfile.TemporaryDirectory]:
229228
"""Get arguments for ModelTrainer from a training recipe.
230229
@@ -233,7 +232,7 @@ def get_args_from_recipe(
233232
{
234233
"source_code": SourceCode,
235234
"training_image": str,
236-
"distributed_runner": Dict[str, Any],
235+
"distributed_runner": DistributedRunner,
237236
"compute": Compute,
238237
"hyperparameters": Dict[str, Any],
239238
}
@@ -244,15 +243,16 @@ def get_args_from_recipe(
244243
Name of the training recipe or path to the recipe file.
245244
compute (Compute):
246245
Compute configuration for training.
247-
session (Session):
248-
Session object for training.
246+
region_name (str):
247+
Name of the AWS region.
249248
recipe_overrides (Optional[Dict[str, Any]]):
250249
Overrides for the training recipe.
250+
requirements (Optional[str]):
251+
Path to the requirements file.
251252
"""
252253
if compute.instance_type is None:
253254
raise ValueError("Must set `instance_type` in compute when using training recipes.")
254255

255-
region_name = session.boto_region_name
256256
training_recipes_cfg = _load_recipes_cfg()
257257
recipe = _load_base_recipe(training_recipe, recipe_overrides, training_recipes_cfg)
258258

@@ -262,18 +262,20 @@ def get_args_from_recipe(
262262
# Set instance_count
263263
if compute.instance_count and "num_nodes" in recipe["trainer"]:
264264
logger.warning(
265-
"Using instance_count in compute to set number "
266-
" of nodes. Ignoring trainer -> num_nodes in recipe."
265+
f"Using Compute to set instance_count:\n{compute}."
266+
"\nIgnoring trainer -> num_nodes in recipe."
267267
)
268268
if compute.instance_count is None:
269269
if "num_nodes" not in recipe["trainer"]:
270270
raise ValueError(
271-
"Must set either instance_count argument for estimator or"
272-
"set trainer -> num_nodes in recipe."
271+
"Must provide Compute with instance_count or" " set trainer -> num_nodes in recipe."
273272
)
274273
compute.instance_count = recipe["trainer"]["num_nodes"]
275274

276-
# Get Training Image, SourceCodeConfig, and DistributionConfig args
275+
if requirements and not os.path.isfile(requirements):
276+
raise ValueError(f"Recipe requirements file {requirements} not found.")
277+
278+
# Get Training Image, SourceCode, and DistributedRunner args
277279
device_type = _determine_device_type(compute.instance_type)
278280
recipe_train_dir = tempfile.TemporaryDirectory(prefix="training_")
279281
if device_type == "gpu":
@@ -299,7 +301,12 @@ def get_args_from_recipe(
299301
config=final_recipe, f=os.path.join(args["source_code"].source_dir, "recipe.yaml")
300302
)
301303

302-
# Update args with compute_config and hyperparameters
304+
# If recipe_requirements is provided, copy it to source_dir
305+
if requirements:
306+
shutil.copy(requirements, args["source_code"].source_dir)
307+
args["source_code"].requirements = os.path.basename(requirements)
308+
309+
# Update args with compute and hyperparameters
303310
args.update(
304311
{
305312
"compute": compute,

tests/unit/sagemaker/modules/train/sm_recipes/__init__.py

Whitespace-only changes.

0 commit comments

Comments (0)