
Commit 1ad75c9

beniericpintaoz-aws authored and committed
Simplify Config Class Names and DistributedRunner structures (#1573)
1 parent: ce55d45 · commit: 1ad75c9

22 files changed: +693 −413 lines changed

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -33,8 +33,8 @@ env/
 *.html
 **/_repack_script_launcher.sh
 src/sagemaker/modules/train/container_drivers/sm_train.sh
-src/sagemaker/modules/train/container_drivers/sourcecodeconfig.json
-src/sagemaker/modules/train/container_drivers/distribution.json
+src/sagemaker/modules/train/container_drivers/sourcecode.json
+src/sagemaker/modules/train/container_drivers/distributed_runner.json
 tests/data/**/_repack_model.py
 tests/data/experiment/sagemaker-dev-1.0.tar.gz
 src/sagemaker/serve/tmp_workspace

src/sagemaker/modules/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -16,3 +16,7 @@
 from sagemaker_core.main.utils import logger as sagemaker_core_logger

 logger = sagemaker_core_logger
+
+from sagemaker.modules.train.model_trainer import (  # noqa: F401 E402 # pylint: disable=C0413
+    ModelTrainer,
+)
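
With this re-export in place, ModelTrainer can be imported from the package root as well as from its defining module. A minimal sketch (assuming the package is installed):

# Both imports resolve to the same class after this change.
from sagemaker.modules import ModelTrainer
from sagemaker.modules.train.model_trainer import ModelTrainer as ModelTrainerDirect

assert ModelTrainer is ModelTrainerDirect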

src/sagemaker/modules/configs.py

Lines changed: 15 additions & 111 deletions
@@ -21,7 +21,7 @@

 from __future__ import absolute_import

-from typing import Optional, Union, Dict, Any, List
+from typing import Optional, Union
 from pydantic import BaseModel, model_validator

 import sagemaker_core.shapes as shapes
@@ -54,15 +54,10 @@
     CheckpointConfig,
 )

-from sagemaker.modules import logger
 from sagemaker.modules.utils import convert_unassigned_to_none

 __all__ = [
-    "SourceCodeConfig",
-    "TorchDistributionConfig",
-    "MPIDistributionConfig",
-    "SMDistributedSettings",
-    "DistributionConfig",
+    "SourceCode",
     "StoppingCondition",
     "RetryStrategy",
     "OutputDataConfig",
@@ -87,107 +82,16 @@
     "InstanceGroup",
     "TensorBoardOutputConfig",
     "CheckpointConfig",
-    "ComputeConfig",
-    "NetworkingConfig",
+    "Compute",
+    "Networking",
     "InputData",
 ]


-class SMDistributedSettings(BaseModel):
-    """SMDistributedSettings.
+class SourceCode(BaseModel):
+    """SourceCode.

-    The SMDistributedSettings is used to configure distributed training when
-    using the smdistributed library.
-
-    Attributes:
-        enable_dataparallel (Optional[bool]):
-            Whether to enable data parallelism.
-        enable_modelparallel (Optional[bool]):
-            Whether to enable model parallelism.
-        modelparallel_parameters (Optional[Dict[str, Any]]):
-            The parameters for model parallelism.
-    """
-
-    enable_dataparallel: Optional[bool] = False
-    enable_modelparallel: Optional[bool] = False
-    modelparallel_parameters: Optional[Dict[str, Any]] = None
-
-
-class DistributionConfig(BaseModel):
-    """Base class for distribution configurations."""
-
-    _distribution_type: str
-
-
-class TorchDistributionConfig(DistributionConfig):
-    """TorchDistributionConfig.
-
-    The TorchDistributionConfig uses `torchrun` or `torch.distributed.launch` in the backend to
-    launch distributed training.
-
-    SMDistributed Library Information:
-      - `TorchDistributionConfig` can be used for SMModelParallel V2.
-      - For SMDataParallel or SMModelParallel V1, it is recommended to use the
-        `MPIDistributionConfig.`
-
-
-    Attributes:
-        smdistributed_settings (Optional[SMDistributedSettings]):
-            The settings for smdistributed library.
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of CPUs or GPUs available in the container.
-    """
-
-    _distribution_type: str = "torch_distributed"
-
-    smdistributed_settings: Optional[SMDistributedSettings] = None
-    process_count_per_node: Optional[int] = None
-
-    @model_validator(mode="after")
-    def _validate_model(cls, model):  # pylint: disable=E0213
-        """Validate the model."""
-        if (
-            getattr(model, "smddistributed_settings", None)
-            and model.smddistributed_settings.enable_dataparallel
-        ):
-            logger.warning(
-                "For smdistributed data parallelism, it is recommended to use "
-                + "MPIDistributionConfig."
-            )
-        return model
-
-
-class MPIDistributionConfig(DistributionConfig):
-    """MPIDistributionConfig.
-
-    The MPIDistributionConfig uses `mpirun` in the backend to launch distributed training.
-
-    SMDistributed Library Information:
-      - `MPIDistributionConfig` can be used for SMDataParallel and SMModelParallel V1.
-      - For SMModelParallel V2, it is recommended to use the `TorchDistributionConfig`.
-
-    Attributes:
-        smdistributed_settings (Optional[SMDistributedSettings]):
-            The settings for smdistributed library.
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of CPUs or GPUs available in the container.
-        mpi_additional_options (Optional[str]):
-            The custom MPI options to use for the training job.
-    """
-
-    _distribution_type: str = "mpi"
-
-    smdistributed_settings: Optional[SMDistributedSettings] = None
-    process_count_per_node: Optional[int] = None
-    mpi_additional_options: Optional[List[str]] = None
-
-
-class SourceCodeConfig(BaseModel):
-    """SourceCodeConfig.
-
-    This config allows the user to specify the source code location, dependencies,
+    The SourceCode class allows the user to specify the source code location, dependencies,
     entry script, or commands to be executed in the training job container.

     Attributes:
@@ -210,10 +114,10 @@ class SourceCodeConfig(BaseModel):
     command: Optional[str] = None


-class ComputeConfig(shapes.ResourceConfig):
-    """ComputeConfig.
+class Compute(shapes.ResourceConfig):
+    """Compute.

-    The ComputeConfig is a subclass of `sagemaker_core.shapes.ResourceConfig`
+    The Compute class is a subclass of `sagemaker_core.shapes.ResourceConfig`
     and allows the user to specify the compute resources for the training job.

     Attributes:
@@ -245,7 +149,7 @@ class ComputeConfig(shapes.ResourceConfig):
     enable_managed_spot_training: Optional[bool] = None

     @model_validator(mode="after")
-    def _model_validator(self) -> "ComputeConfig":
+    def _model_validator(self) -> "Compute":
         """Convert Unassigned values to None."""
         return convert_unassigned_to_none(self)
@@ -259,10 +163,10 @@ def _to_resource_config(self) -> shapes.ResourceConfig:
         return shapes.ResourceConfig(**filtered_dict)


-class NetworkingConfig(shapes.VpcConfig):
-    """NetworkingConfig.
+class Networking(shapes.VpcConfig):
+    """Networking.

-    The NetworkingConifg is a subclass of `sagemaker_core.shapes.VpcConfig ` and
+    The Networking class is a subclass of `sagemaker_core.shapes.VpcConfig ` and
     allows the user to specify the networking configuration for the training job.

     Attributes:
@@ -290,7 +194,7 @@ class NetworkingConfig(shapes.VpcConfig):
     enable_inter_container_traffic_encryption: Optional[bool] = None

     @model_validator(mode="after")
-    def _model_validator(self) -> "NetworkingConfig":
+    def _model_validator(self) -> "Networking":
         """Convert Unassigned values to None."""
         return convert_unassigned_to_none(self)
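
For orientation, a minimal sketch using the renamed classes. The ModelTrainer wiring is not shown in this file, and the Compute/Networking field names below come from the parent sagemaker_core shapes (ResourceConfig, VpcConfig) rather than from this hunk, so treat them as assumptions; the IDs and values are placeholders.

from sagemaker.modules.configs import SourceCode, Compute, Networking

# Source code to run in the training container; `command` is a field shown in this diff.
source_code = SourceCode(command="python train.py")

# Compute subclasses sagemaker_core.shapes.ResourceConfig (instance_type,
# instance_count, volume_size_in_gb are ResourceConfig fields).
compute = Compute(
    instance_type="ml.g5.12xlarge",
    instance_count=2,
    volume_size_in_gb=100,
)

# Networking subclasses sagemaker_core.shapes.VpcConfig (security_group_ids,
# subnets are VpcConfig fields); placeholder IDs.
networking = Networking(
    security_group_ids=["sg-0123456789abcdef0"],
    subnets=["subnet-0123456789abcdef0"],
)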

src/sagemaker/modules/constants.py

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@
     os.path.dirname(os.path.abspath(__file__)), "train/container_drivers"
 )

-SOURCE_CODE_CONFIG_JSON = "sourcecodeconfig.json"
-DISTRIBUTION_JSON = "distribution.json"
+SOURCE_CODE_JSON = "sourcecode.json"
+DISTRIBUTED_RUNNER_JSON = "distributed_runner.json"
 TRAIN_SCRIPT = "sm_train.sh"

 DEFAULT_CONTAINER_ENTRYPOINT = ["/bin/bash"]
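
For context, a hedged sketch of how a container driver might load the renamed JSON files at runtime. The /opt/ml/input/data/sm_drivers path is taken from templates.py in this commit; the read_json helper itself is hypothetical and not part of the change.

import json
import os

# Location where the driver scripts and config JSONs are staged in the container
# (path taken from the templates in this commit).
SM_DRIVERS_DIR = "/opt/ml/input/data/sm_drivers"

SOURCE_CODE_JSON = "sourcecode.json"
DISTRIBUTED_RUNNER_JSON = "distributed_runner.json"


def read_json(filename: str) -> dict:
    """Hypothetical helper: load one of the staged config files."""
    with open(os.path.join(SM_DRIVERS_DIR, filename)) as f:
        return json.load(f)


source_code = read_json(SOURCE_CODE_JSON)
distributed_runner = read_json(DISTRIBUTED_RUNNER_JSON)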

src/sagemaker/modules/distributed.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Distributed module."""
+from __future__ import absolute_import
+
+from typing import Optional, Dict, Any, List
+from pydantic import BaseModel, PrivateAttr
+
+
+class DistributedRunner(BaseModel):
+    """Base class for DistributedRunner Class"""
+
+    _type: str = PrivateAttr()
+
+    def model_dump(self, *args, **kwargs):
+        """Dump the model to a dictionary."""
+        result = super().model_dump(*args, **kwargs)
+        result["_type"] = self._type
+        return result
+
+
+class Torchrun(DistributedRunner):
+    """TorchDistribution.
+
+    The TorchDistribution runner uses `torchrun` or `torch.distributed.launch` in the backend to
+    launch distributed training.
+
+    Attributes:
+        process_count_per_node (int):
+            The number of processes to run on each node in the training job.
+            Will default to the number of GPUs available in the container.
+    """
+
+    _type: str = PrivateAttr(default="torchrun")
+
+    process_count_per_node: Optional[int] = None
+
+
+class TorchrunSMP(DistributedRunner):
+    """TorchrunSMP.
+
+    The TorchrunSMP runner uses `torchrun` or `torch.distributed.launch` in the backend
+    to launch distributed training. This strategy is used for a PyTorch job using the SageMaker
+    Model Parallelism library v2. For more information on the model parallelism parameters, see:
+    https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-model-parallel-v2-reference.html#distributed-model-parallel-v2-reference-init-config
+
+    Attributes:
+        process_count_per_node (int):
+            The number of processes to run on each node in the training job.
+            Will default to the number of GPUs available in the container.
+        hybrid_shard_degree (Optional[int]):
+            Specifies a sharded parallelism degree for the model.
+        sm_activation_offloading (Optional[bool]):
+            Specifies whether to enable the SMP activation offloading implementation.
+        activation_loading_horizon (Optional[int]):
+            An integer specifying the activation offloading horizon type for FSDP. This is the
+            maximum number of checkpointed or offloaded layers whose inputs can be in the GPU
+            memory simultaneously.
+        fsdp_cache_flush_warnings (Optional[bool]):
+            Detects and warns if cache flushes happen in the PyTorch memory manager, because they
+            can degrade computational performance.
+        allow_empty_shards (Optional[bool]):
+            Whether to allow empty shards when sharding tensors if tensor is not divisible. This is
+            an experimental fix for crash during checkpointing in certain scenarios. Disabling this
+            falls back to the original PyTorch behavior.
+        tensor_parallel_degree (Optional[int]):
+            Specifies a tensor parallelism degree. The value must be between 1 and world_size.
+        context_parallel_degree (Optional[int]):
+            Specifies the context parallelism degree. The value must be between 1 and world_size,
+            and must be <= hybrid_shard_degree.
+        expert_parallel_degree (Optional[int]):
+            Specifies a expert parallelism degree. The value must be between 1 and world_size.
+        random_seed (Optional[int]):
+            A seed number for the random operations in distributed modules by SMP tensor
+            parallelism or expert parallelism.
+    """
+
+    _type: str = PrivateAttr(default="torchrun")
+
+    process_count_per_node: Optional[int] = None
+    hybrid_shard_degree: Optional[int] = None
+    sm_activation_offloading: Optional[bool] = None
+    activation_loading_horizon: Optional[int] = None
+    fsdp_cache_flush_warnings: Optional[bool] = None
+    allow_empty_shards: Optional[bool] = None
+    tensor_parallel_degree: Optional[int] = None
+    context_parallel_degree: Optional[int] = None
+    expert_parallel_degree: Optional[int] = None
+    random_seed: Optional[int] = None
+
+    def _to_mp_parameters_dict(self) -> Dict[str, Any]:
+        """Convert to a dictionary of MP parameters."""
+        mp_parameters = self.model_dump(exclude_none=True)
+        mp_parameters.pop("_type")
+        if mp_parameters.get("process_count_per_node") is not None:
+            mp_parameters.pop("process_count_per_node")
+        return mp_parameters
+
+
+class MPI(DistributedRunner):
+    """MPI.
+
+    The MPI runner uses `mpirun` in the backend to launch distributed training.
+
+    Attributes:
+        process_count_per_node (int):
+            The number of processes to run on each node in the training job.
+            Will default to the number of GPUs available in the container.
+        mpi_additional_options (Optional[str]):
+            The custom MPI options to use for the training job.
+    """
+
+    _type: str = PrivateAttr(default="mpi")
+
+    process_count_per_node: Optional[int] = None
+    mpi_additional_options: Optional[List[str]] = None
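
A minimal sketch of constructing the new runner classes. The field values are illustrative, and passing a runner to ModelTrainer (for example via a distributed_runner argument) is an assumption based on the naming in this commit rather than an API shown in this diff.

from sagemaker.modules.distributed import Torchrun, TorchrunSMP, MPI

# torchrun-based job; process count defaults to the GPUs in the container if omitted.
torchrun_runner = Torchrun(process_count_per_node=8)

# torchrun with SageMaker Model Parallelism v2 settings.
smp_runner = TorchrunSMP(tensor_parallel_degree=4, random_seed=1234)

# mpirun-based job with extra MPI flags.
mpi_runner = MPI(mpi_additional_options=["-x", "NCCL_DEBUG=INFO"])

# model_dump() carries the private _type marker that the container drivers read.
print(torchrun_runner.model_dump())  # {'process_count_per_node': 8, '_type': 'torchrun'}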

src/sagemaker/modules/templates.py

Lines changed: 8 additions & 8 deletions
@@ -19,13 +19,13 @@
 eval $CMD
 """

-EXECUTE_PYTORCH_DRIVER = """
-echo "Running PyTorch training driver"
-$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/pytorch_driver.py
+EXEUCTE_TORCHRUN_DRIVER = """
+echo "Running Torchrun driver"
+$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/torchrun_driver.py
 """

 EXECUTE_MPI_DRIVER = """
-echo "Running MPI training driver"
+echo "Running MPI driver"
 $SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/mpi_driver.py
 """

@@ -73,12 +73,12 @@
 cat /opt/ml/input/config/inputdataconfig.json
 echo

-echo "/opt/ml/input/data/sm_drivers/sourcecodeconfig.json"
-cat /opt/ml/input/data/sm_drivers/sourcecodeconfig.json
+echo "/opt/ml/input/data/sm_drivers/sourcecode.json"
+cat /opt/ml/input/data/sm_drivers/sourcecode.json
 echo

-echo "/opt/ml/input/data/sm_drivers/distribution.json"
-cat /opt/ml/input/data/sm_drivers/distribution.json
+echo "/opt/ml/input/data/sm_drivers/distributed_runner.json"
+cat /opt/ml/input/data/sm_drivers/distributed_runner.json
 echo

 echo "Setting up environment variables"

0 commit comments
