Skip to content

Commit 25486e6

Browse files
beniericpintaoz-aws
authored and committed
Add Distributed Training Support Model Trainer (#1536)
1 parent 4cd65a5 commit 25486e6

21 files changed

+1565
-243
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ env/
3232
.python-version
3333
*.html
3434
**/_repack_script_launcher.sh
35-
src/sagemaker/modules/scripts/train.sh
35+
src/sagemaker/modules/train/container_drivers/sm_train.sh
36+
src/sagemaker/modules/train/container_drivers/sourcecodeconfig.json
3637
tests/data/**/_repack_model.py
3738
tests/data/experiment/sagemaker-dev-1.0.tar.gz
3839
src/sagemaker/serve/tmp_workspace

src/sagemaker/modules/configs.py

Lines changed: 104 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121

2222
from __future__ import absolute_import
2323

24-
from typing import Optional
25-
from pydantic import BaseModel
24+
from typing import Optional, Dict, Any, List
25+
from pydantic import BaseModel, model_validator
2626

2727
from sagemaker_core.shapes import (
2828
ResourceConfig,
@@ -36,6 +36,8 @@
3636
VpcConfig,
3737
)
3838

39+
from sagemaker.modules import logger
40+
3941
__all__ = [
4042
"SourceCodeConfig",
4143
"ResourceConfig",
@@ -50,16 +52,104 @@
5052
]
5153

5254

55+
class SMDistributedSettings(BaseModel):
56+
"""SMDistributedSettings.
57+
58+
The SMDistributedSettings is used to configure distributed training when
59+
using the smdistributed library.
60+
61+
Attributes:
62+
enable_dataparallel (Optional[bool]):
63+
Whether to enable data parallelism.
64+
enable_modelparallel (Optional[bool]):
65+
Whether to enable model parallelism.
66+
modelparallel_parameters (Optional[Dict[str, Any]]):
67+
The parameters for model parallelism.
68+
"""
69+
70+
enable_dataparallel: Optional[bool] = False
71+
enable_modelparallel: Optional[bool] = False
72+
modelparallel_parameters: Optional[Dict[str, Any]] = None
73+
74+
75+
class DistributionConfig(BaseModel):
76+
"""Base class for distribution configurations."""
77+
78+
_distribution_type: str
79+
80+
81+
class TorchDistributionConfig(DistributionConfig):
82+
"""TorchDistributionConfig.
83+
84+
The TorchDistributionConfig uses `torchrun` or `torch.distributed.launch` in the backend to
85+
launch distributed training.
86+
87+
SMDistributed Library Information:
88+
- `TorchDistributionConfig` can be used for SMModelParallel V2.
89+
- For SMDataParallel or SMModelParallel V1, it is recommended to use the
90+
`MPIDistributionConfig.`
91+
92+
93+
Attributes:
94+
smdistributed_settings (Optional[SMDistributedSettings]):
95+
The settings for smdistributed library.
96+
process_count_per_node (int):
97+
The number of processes to run on each node in the training job.
98+
Will default to the number of CPUs or GPUs available in the container.
99+
"""
100+
101+
_distribution_type: str = "torch_distributed"
102+
103+
smdistributed_settings: Optional[SMDistributedSettings] = None
104+
process_count_per_node: Optional[int] = None
105+
106+
@model_validator(mode="after")
107+
def _validate_model(cls, model): # pylint: disable=E0213
108+
"""Validate the model."""
109+
if (
110+
getattr(model, "smddistributed_settings", None)
111+
and model.smddistributed_settings.enable_dataparallel
112+
):
113+
logger.warning(
114+
"For smdistributed data parallelism, it is recommended to use "
115+
+ "MPIDistributionConfig."
116+
)
117+
return model
118+
119+
120+
class MPIDistributionConfig(DistributionConfig):
121+
"""MPIDistributionConfig.
122+
123+
The MPIDistributionConfig uses `mpirun` in the backend to launch distributed training.
124+
125+
SMDistributed Library Information:
126+
- `MPIDistributionConfig` can be used for SMDataParallel and SMModelParallel V1.
127+
- For SMModelParallel V2, it is recommended to use the `TorchDistributionConfig`.
128+
129+
Attributes:
130+
smdistributed_settings (Optional[SMDistributedSettings]):
131+
The settings for smdistributed library.
132+
process_count_per_node (int):
133+
The number of processes to run on each node in the training job.
134+
Will default to the number of CPUs or GPUs available in the container.
135+
mpi_additional_options (Optional[str]):
136+
The custom MPI options to use for the training job.
137+
"""
138+
139+
_distribution_type: str = "mpi"
140+
141+
smdistributed_settings: Optional[SMDistributedSettings] = None
142+
process_count_per_node: Optional[int] = None
143+
mpi_additional_options: Optional[List[str]] = None
144+
145+
53146
class SourceCodeConfig(BaseModel):
54147
"""SourceCodeConfig.
55148
56149
This config allows the user to specify the source code location, dependencies,
57150
entry script, or commands to be executed in the training job container.
58151
59152
Attributes:
60-
command (Optional[str]):
61-
The command(s) to execute in the training job container. Example: "python my_script.py".
62-
If not specified, entry_script must be provided
63153
source_dir (Optional[str]):
64154
The local directory containing the source code to be used in the training job container.
65155
requirements (Optional[str]):
@@ -68,9 +158,17 @@ class SourceCodeConfig(BaseModel):
68158
entry_script (Optional[str]):
69159
The path within `source_dir` to the entry script that will be executed in the training
70160
job container. If not specified, command must be provided.
161+
command (Optional[str]):
162+
The command(s) to execute in the training job container. Example: "python my_script.py".
163+
If not specified, entry_script must be provided.
164+
distribution (Optional[Union[
165+
MPIDistributionConfig,
166+
TorchDistributionConfig,
167+
]]):
168+
The distribution configuration for the training job.
71169
"""
72170

73-
command: Optional[str] = None
74171
source_dir: Optional[str] = None
75172
requirements: Optional[str] = None
76173
entry_script: Optional[str] = None
174+
command: Optional[str] = None

src/sagemaker/modules/constants.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,21 @@
1616

1717
DEFAULT_INSTANCE_TYPE = "ml.m5.xlarge"
1818

19-
SOURCE_CODE_CONTAINER_PATH = "/opt/ml/input/data/code"
20-
19+
SM_CODE = "sm_code"
2120
SM_CODE_CONTAINER_PATH = "/opt/ml/input/data/sm_code"
22-
SM_CODE_LOCAL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts")
23-
TRAIN_SCRIPT = "train.sh"
21+
22+
SM_DRIVERS = "sm_drivers"
23+
SM_DRIVERS_CONTAINER_PATH = "/opt/ml/input/data/sm_drivers"
24+
SM_DRIVERS_LOCAL_PATH = os.path.join(
25+
os.path.dirname(os.path.abspath(__file__)), "train/container_drivers"
26+
)
27+
28+
SOURCE_CODE_CONFIG_JSON = "sourcecodeconfig.json"
29+
TRAIN_SCRIPT = "sm_train.sh"
2430

2531
DEFAULT_CONTAINER_ENTRYPOINT = ["/bin/bash"]
2632
DEFAULT_CONTAINER_ARGUMENTS = [
2733
"-c",
28-
f"chmod +x {SM_CODE_CONTAINER_PATH}/{TRAIN_SCRIPT} "
29-
+ f"&& {SM_CODE_CONTAINER_PATH}/{TRAIN_SCRIPT}",
34+
f"chmod +x {SM_DRIVERS_CONTAINER_PATH}/{TRAIN_SCRIPT} "
35+
+ f"&& {SM_DRIVERS_CONTAINER_PATH}/{TRAIN_SCRIPT}",
3036
]

src/sagemaker/modules/templates.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,58 @@
1313
"""Templates module."""
1414
from __future__ import absolute_import
1515

16+
EXECUTE_BASE_COMMANDS = """
17+
CMD="{base_command}"
18+
echo "Running command: $CMD"
19+
eval $CMD
20+
"""
21+
22+
EXECUTE_PYTORCH_DRIVER = """
23+
echo "Running PyTorch training driver"
24+
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/pytorch_driver.py
25+
"""
26+
27+
EXECUTE_MPI_DRIVER = """
28+
echo "Running MPI training driver"
29+
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/mpi_driver.py
30+
"""
31+
1632
TRAIN_SCRIPT_TEMPLATE = """
1733
#!/bin/bash
34+
set -e
1835
echo "Starting training script"
1936
37+
handle_error() {{
38+
EXIT_STATUS=$?
39+
echo "An error occurred with exit code $EXIT_STATUS"
40+
if [ ! -s /opt/ml/output/failure ]; then
41+
echo "Training Execution failed. For more details, see CloudWatch logs at 'aws/sagemaker/TrainingJobs'.
42+
TrainingJob - $TRAINING_JOB_NAME" >> /opt/ml/output/failure
43+
fi
44+
exit $EXIT_STATUS
45+
}}
46+
47+
check_python() {{
48+
if command -v python3 &>/dev/null; then
49+
SM_PYTHON_CMD="python3"
50+
SM_PIP_CMD="pip3"
51+
echo "Found python3"
52+
elif command -v python &>/dev/null; then
53+
SM_PYTHON_CMD="python"
54+
SM_PIP_CMD="pip"
55+
echo "Found python"
56+
else
57+
echo "Python may not be installed"
58+
return 1
59+
fi
60+
}}
61+
62+
trap 'handle_error' ERR
63+
64+
check_python
65+
66+
$SM_PYTHON_CMD --version
67+
2068
echo "/opt/ml/input/config/resourceconfig.json:"
2169
cat /opt/ml/input/config/resourceconfig.json
2270
echo
@@ -29,27 +77,17 @@
2977
cat /opt/ml/input/config/hyperparameters.json
3078
echo
3179
80+
echo "/opt/ml/input/data/sm_drivers/sourcecodeconfig.json"
81+
cat /opt/ml/input/data/sm_drivers/sourcecodeconfig.json
82+
echo
83+
3284
echo "Setting up environment variables"
33-
python /opt/ml/input/data/sm_code/environment.py
34-
source /opt/ml/input/data/sm_code/sm_training.env
85+
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/scripts/environment.py
86+
source /opt/ml/input/data/sm_drivers/scripts/sm_training.env
3587
36-
python --version
3788
{working_dir}
3889
{install_requirements}
39-
CMD="{command}"
40-
echo "Running command: $CMD"
41-
eval $CMD
42-
EXIT_STATUS=$?
90+
{execute_driver}
4391
44-
if [ $EXIT_STATUS -ne 0 ]; then
45-
echo "Command failed with exit status $EXIT_STATUS"
46-
if [ ! -s /opt/ml/output/failure ]; then
47-
echo "Command failed with exit code $EXIT_STATUS.
48-
For more details, see CloudWatch logs at 'aws/sagemaker/TrainingJobs'.
49-
TrainingJob - $TRAINING_JOB_NAME" >> /opt/ml/output/failure
50-
fi
51-
exit $EXIT_STATUS
52-
else
53-
echo "Command succeeded"
54-
fi
92+
echo "Training Container Execution Completed"
5593
"""

0 commit comments

Comments (0)