Skip to content

Commit 694f8e9

Browse files
beniericpintaoz-aws
authored and committed
Pass hyperparameters as CLI args (#1577)
1 parent 0847a16 commit 694f8e9

File tree

21 files changed

+672
-95
lines changed

21 files changed

+672
-95
lines changed

src/sagemaker/modules/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,3 @@
1616
from sagemaker_core.main.utils import logger as sagemaker_core_logger
1717

1818
logger = sagemaker_core_logger
19-
20-
from sagemaker.modules.train.model_trainer import ( # noqa: F401 E402 # pylint: disable=C0413
21-
ModelTrainer,
22-
)

src/sagemaker/modules/configs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ class Compute(shapes.ResourceConfig):
123123
Attributes:
124124
instance_type (Optional[str]):
125125
The ML compute instance type. For information about available instance types,
126-
see https://aws.amazon.com/sagemaker/pricing/. Default: ml.m5.xlarge
126+
see https://aws.amazon.com/sagemaker/pricing/.
127127
instance_count (Optional[int]): The number of ML compute instances to use. For distributed
128-
training, provide a value greater than 1. Default: 1
128+
training, provide a value greater than 1.
129129
volume_size_in_gb (Optional[int]):
130130
The size of the ML storage volume that you want to provision. ML storage volumes store
131131
model artifacts and incremental states. Training algorithms might also use the ML

src/sagemaker/modules/distributed.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ def model_dump(self, *args, **kwargs):
3030

3131

3232
class Torchrun(DistributedRunner):
33-
"""TorchDistribution.
33+
"""TorchDistributed.
3434
35-
The TorchDistribution runner uses `torchrun` or `torch.distributed.launch` in the backend to
35+
The Torchrun distributed runner uses `torchrun` or `torch.distributed.launch` in the backend to
3636
launch distributed training.
3737
3838
Attributes:

src/sagemaker/modules/templates.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,15 @@
1515

1616
EXECUTE_BASE_COMMANDS = """
1717
CMD="{base_command}"
18-
echo "Running command: $CMD"
18+
echo "Executing command: $CMD"
1919
eval $CMD
2020
"""
2121

22+
EXECUTE_BASIC_SCRIPT_DRIVER = """
23+
echo "Running Basic Script driver"
24+
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/basic_script_driver.py
25+
"""
26+
2227
EXEUCTE_TORCHRUN_DRIVER = """
2328
echo "Running Torchrun driver"
2429
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/torchrun_driver.py

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,11 @@
117117
"metadata": {},
118118
"outputs": [],
119119
"source": [
120+
"from sagemaker.modules.train import ModelTrainer\n",
120121
"from sagemaker.modules.configs import SourceCode\n",
121122
"\n",
123+
"pytorch_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310\"\n",
124+
"\n",
122125
"source_code = SourceCode(\n",
123126
" source_dir=\"basic-script-mode\",\n",
124127
" requirements=\"requirements.txt\",\n",
@@ -460,13 +463,6 @@
460463
")\n",
461464
"model_trainer.train(input_data_config=[test_data], wait=False)"
462465
]
463-
},
464-
{
465-
"cell_type": "code",
466-
"execution_count": null,
467-
"metadata": {},
468-
"outputs": [],
469-
"source": []
470466
}
471467
],
472468
"metadata": {
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""This module is the entry point for the Basic Script Driver."""
14+
from __future__ import absolute_import
15+
16+
import sys
17+
import shlex
18+
19+
from typing import List
20+
21+
from utils import (
22+
logger,
23+
get_python_executable,
24+
read_source_code_json,
25+
read_hyperparameters_json,
26+
execute_commands,
27+
write_failure_file,
28+
hyperparameters_to_cli_args,
29+
)
30+
31+
32+
def create_commands() -> List[str]:
    """Build the command used to launch the user's entry script.

    Reads the source-code and hyperparameter JSON configs, converts the
    hyperparameters into CLI arguments, and returns the full command to run.

    Returns:
        List[str]: The command tokens to execute.

    Raises:
        ValueError: If the entry script is neither a ``.py`` nor a ``.sh`` file.
    """
    source_code = read_source_code_json()
    hyperparameters = read_hyperparameters_json()
    python_executable = get_python_executable()

    entry_script = source_code["entry_script"]
    args = hyperparameters_to_cli_args(hyperparameters)
    if entry_script.endswith(".py"):
        # Python scripts are executed as an argv list, so no shell quoting is
        # needed for the arguments.
        commands = [python_executable, entry_script]
        commands += args
    elif entry_script.endswith(".sh"):
        # Shell scripts are run through `/bin/sh -c`, so every token embedded in
        # the command string must be shell-quoted. The original left
        # entry_script unquoted, which breaks on paths containing spaces or
        # shell metacharacters.
        quoted_script = shlex.quote(entry_script)
        args_str = " ".join(shlex.quote(arg) for arg in args)
        commands = [
            "/bin/sh",
            "-c",
            f"chmod +x {quoted_script} && ./{quoted_script} {args_str}",
        ]
    else:
        raise ValueError(
            f"Unsupported entry script type: {entry_script}. Only .py and .sh are supported."
        )
    return commands
55+
56+
57+
def main():
    """Entry point for the Basic Script Driver.

    Execution Lifecycle:
        1. Read the source code and hyperparameters JSON files.
        2. Set hyperparameters as command line arguments.
        3. Create the commands to execute.
        4. Execute the commands.
    """

    command = create_commands()

    logger.info(f"Executing command: {' '.join(command)}")
    exit_code, traceback = execute_commands(command)
    if exit_code == 0:
        return
    # Record the failure reason for SageMaker before propagating the exit code.
    write_failure_file(traceback)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()

src/sagemaker/modules/train/container_drivers/mpi_driver.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
logger,
2222
read_source_code_json,
2323
read_distributed_runner_json,
24+
read_hyperparameters_json,
25+
hyperparameters_to_cli_args,
2426
get_process_count,
2527
execute_commands,
2628
write_failure_file,
@@ -58,6 +60,7 @@ def main():
5860
"""
5961
source_code = read_source_code_json()
6062
distribution = read_distributed_runner_json()
63+
hyperparameters = read_hyperparameters_json()
6164

6265
sm_current_host = os.environ["SM_CURRENT_HOST"]
6366
sm_hosts = json.loads(os.environ["SM_HOSTS"])
@@ -87,7 +90,10 @@ def main():
8790
entry_script_path=os.path.join(USER_CODE_PATH, source_code["entry_script"]),
8891
)
8992

90-
logger.info(f"Executing command: {mpi_command}")
93+
args = hyperparameters_to_cli_args(hyperparameters)
94+
mpi_command += args
95+
96+
logger.info(f"Executing command: {' '.join(mpi_command)}")
9197
exit_code, error_traceback = execute_commands(mpi_command)
9298
write_status_file_to_workers(worker_hosts)
9399

src/sagemaker/modules/train/container_drivers/mpi_utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@
2222
from utils import logger, SM_EFA_NCCL_INSTANCES, SM_EFA_RDMA_INSTANCES, get_python_executable
2323

2424
FINISHED_STATUS_FILE = "/tmp/done.algo-1"
25+
READY_FILE = "/tmp/ready.%s"
2526
DEFAULT_SSH_PORT = 22
2627

2728

28-
def _write_status_file(host: str, status_file: str) -> bool:
29-
"""Write the status file to the provided host."""
29+
def _write_file_to_host(host: str, status_file: str) -> bool:
30+
"""Write the a file to the provided host."""
3031
try:
31-
logger.info("Writing finished status file (%s) to %s", status_file, host)
32+
logger.info(f"Writing {status_file} to {host}")
3233
subprocess.run(
3334
["ssh", host, "touch", f"{status_file}"],
3435
capture_output=True,
@@ -46,7 +47,7 @@ def write_status_file_to_workers(worker_hosts: List[str], status_file: str = FIN
4647
"""Write the status file to all worker nodes."""
4748
for worker in worker_hosts:
4849
retry = 0
49-
while not _write_status_file(worker, status_file):
50+
while not _write_file_to_host(worker, status_file):
5051
time.sleep(5)
5152
retry += 1
5253
if retry > 5:
@@ -102,7 +103,10 @@ def _wait_for_workers(worker_hosts: List[str], port: int = DEFAULT_SSH_PORT, tim
102103

103104
while True:
104105
logger.info("Master is attempting to connect to all workers...")
105-
all_workers_connected = all(_can_connect(worker, port) for worker in worker_hosts)
106+
all_workers_connected = all(
107+
_can_connect(worker, port) and os.path.exists(READY_FILE % worker)
108+
for worker in worker_hosts
109+
)
106110

107111
if all_workers_connected:
108112
logger.info("Master can connect to all worker nodes.")
@@ -131,6 +135,7 @@ def bootstrap_worker_node(master_host: str, status_file: str = FINISHED_STATUS_F
131135
"""Bootstrap the worker nodes."""
132136
logger.info("Bootstrapping worker node...")
133137
_wait_for_master(master_host)
138+
_write_file_to_host(master_host, READY_FILE % os.environ["SM_CURRENT_HOST"])
134139
_wait_for_status_file(status_file)
135140

136141

src/sagemaker/modules/train/container_drivers/scripts/environment.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121
import sys
2222
import logging
2323

24+
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
25+
sys.path.insert(0, parent_dir)
26+
27+
from utils import safe_serialize # noqa: E402 # pylint: disable=C0413
28+
2429
# Initialize logger
2530
SM_LOG_LEVEL = os.environ.get("SM_LOG_LEVEL", 20)
2631
logger = logging.getLogger(__name__)
@@ -147,7 +152,7 @@ def set_env(
147152
# Hyperparameters
148153
env_vars["SM_HPS"] = hyperparameters_config
149154
for key, value in hyperparameters_config.items():
150-
env_vars[f"SM_HP_{key.upper()}"] = value
155+
env_vars[f"SM_HP_{key.upper()}"] = safe_serialize(value)
151156

152157
# Host Variables
153158
current_host = resource_config["current_host"]
@@ -197,10 +202,7 @@ def set_env(
197202
}
198203
with open(output_file, "w") as f:
199204
for key, value in env_vars.items():
200-
if isinstance(value, (list, dict)):
201-
f.write(f"export {key}='{json.dumps(value)}'\n")
202-
else:
203-
f.write(f"export {key}='{value}'\n")
205+
f.write(f"export {key}='{safe_serialize(value)}'\n")
204206

205207
logger.info("Environment Variables:")
206208
log_env_variables(env_vars_dict=env_vars)

src/sagemaker/modules/train/container_drivers/torchrun_driver.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222
logger,
2323
read_source_code_json,
2424
read_distributed_runner_json,
25+
read_hyperparameters_json,
26+
hyperparameters_to_cli_args,
2527
get_process_count,
2628
get_python_executable,
27-
SM_EFA_NCCL_INSTANCES,
28-
SM_EFA_RDMA_INSTANCES,
2929
execute_commands,
3030
write_failure_file,
3131
USER_CODE_PATH,
32+
SM_EFA_NCCL_INSTANCES,
33+
SM_EFA_RDMA_INSTANCES,
3234
)
3335

3436

@@ -65,6 +67,7 @@ def create_commands():
6567
"""Create the Torch Distributed command to execute"""
6668
source_code = read_source_code_json()
6769
distribution = read_distributed_runner_json()
70+
hyperparameters = read_hyperparameters_json()
6871

6972
process_count = get_process_count(distribution)
7073
host_count = int(os.environ["SM_HOST_COUNT"])
@@ -92,6 +95,10 @@ def create_commands():
9295
)
9396

9497
torch_cmd.extend([os.path.join(USER_CODE_PATH, source_code["entry_script"])])
98+
99+
args = hyperparameters_to_cli_args(hyperparameters)
100+
torch_cmd += args
101+
95102
return torch_cmd
96103

97104

@@ -110,7 +117,7 @@ def main():
110117
"""
111118
setup_env()
112119
torch_cmd = create_commands()
113-
logger.info(f"Executing command: {torch_cmd}")
120+
logger.info(f"Executing command: {' '.join(torch_cmd)}")
114121
exit_code, traceback = execute_commands(torch_cmd)
115122
if exit_code != 0:
116123
write_failure_file(traceback)

0 commit comments

Comments
 (0)