
Commit 1dddebd

Commit message: enhancements
1 parent 4a2a4f2 · commit 1dddebd

2 files changed: +21 -11 lines changed


src/training_hub/algorithms/sft.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, Type, Optional
 from instructlab.training import run_training, TorchrunArgs, TrainingArgs
 
@@ -213,6 +212,9 @@ def sft(model_path: str,
         node_rank: Rank of this node (0 to nnodes-1) for distributed training
         rdzv_id: Unique job ID for rendezvous in distributed training
         rdzv_endpoint: Master node endpoint for multi-node training
+        master_addr: Master node address for distributed training
+        master_port: Master node port for distributed training
+
         **kwargs: Additional parameters passed to the backend
 
     Returns:
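
The new docstring entries above document master_addr and master_port as first-class arguments to sft(). A rough, hypothetical call is sketched below: the parameter names come from the docstring in this diff, while the import path, the example values, and the omission of other required training arguments (data paths, output directories, etc.) are assumptions for illustration only.

    from training_hub import sft  # assumed import path

    result = sft(
        model_path="/models/base-model",   # hypothetical checkpoint location
        # ... data/output arguments required by the backend are elided here ...
        nnodes=2,                          # two-node job
        node_rank=0,                       # rank of this node (0 to nnodes-1)
        master_addr="10.0.0.1",            # newly documented: master node address
        master_port=29500,                 # newly documented: master node port
    )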

src/training_hub/utils.py

Lines changed: 18 additions & 10 deletions
@@ -31,7 +31,7 @@ def format_type_name(tp):
     return type_str
 
 
-def get_torchrun_params(args: dict):
+def get_torchrun_params(args: dict) -> dict[str, str | int]:
     """
     Parse and load PyTorch distributed training parameters with hierarchical precedence.
 
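
The added return annotation makes explicit that get_torchrun_params hands back a flat mapping from torchrun parameter names to either strings or integers. A plausible shape is sketched below; the exact keys depend on the rest of the function, which this hunk does not show, so treat the literal as illustrative only.

    # Illustrative return value, consistent with dict[str, str | int]
    torchrun_params = {
        "nnodes": 2,
        "node_rank": 0,
        "nproc_per_node": "gpu",
        "rdzv_id": 1234,
        "master_port": 29500,
    }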
@@ -104,21 +104,18 @@ def validate_nproc_per_node(value: int | str) -> int | str:
             raise ValueError(f"nproc_per_node must be 'auto', 'gpu', or an integer, got type {type(value).__name__}")
         if isinstance(value, int):
             return value
-
+
         value_lower = value.lower().strip()
         if value_lower not in ['auto', 'gpu'] and not value_lower.isdigit():
             raise ValueError(f"nproc_per_node must be 'auto', 'gpu', or an integer, got: {value!r}")
         if value_lower.isdigit():
             return int(value_lower)
-        elif value_lower == 'gpu':
-            return 'gpu'
 
-        # otherwise just handle auto logic
-        # convert 'auto' to 'gpu' if CUDA is available
-        if torch.cuda.is_available():
+        # handle 'auto' and 'gpu' - both require CUDA
+        if value_lower in ['auto', 'gpu'] and torch.cuda.is_available():
             return 'gpu'
         else:
-            raise ValueError("nproc_per_node='auto' requires CUDA GPUs, but none are available")
+            raise ValueError(f"nproc_per_node='{value_lower}' requires CUDA GPUs, but none are available")
 
     def get_param_reference(param_name: str, source: str) -> str:
         """Format parameter reference based on source (args vs env)."""
@@ -151,7 +148,13 @@ def get_param_reference(param_name: str, source: str) -> str:
         # we know the final values in this case must be integers, so any non-None value here
         # should be castable to `int`.
         value, _ = get_param_value(param)
-        torchrun_args[param] = int(value) if value is not None else default
+        if value is not None:
+            try:
+                torchrun_args[param] = int(value)
+            except (ValueError, TypeError) as e:
+                raise ValueError(f"Invalid value for {param}: {value!r}. Must be an integer.") from e
+        else:
+            torchrun_args[param] = default
 
 
     # rdzv_id will be either a str or int; we just perform some cleanup before
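
The replacement above keeps the old default-fallback behaviour but wraps the int() cast so a malformed value produces a readable error instead of a bare ValueError or TypeError. The same pattern as a tiny hypothetical helper (the names cast_or_default, param, value, and default are illustrative, not from the library):

    def cast_or_default(param: str, value, default: int) -> int:
        # None means "not provided": fall back to the default
        if value is None:
            return default
        try:
            return int(value)
        except (ValueError, TypeError) as e:
            raise ValueError(f"Invalid value for {param}: {value!r}. Must be an integer.") from e

    print(cast_or_default("nnodes", "2", 1))      # -> 2
    print(cast_or_default("node_rank", None, 0))  # -> 0
    # cast_or_default("nnodes", "two", 1) raises:
    #   ValueError: Invalid value for nnodes: 'two'. Must be an integer.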
@@ -212,8 +215,13 @@ def get_param_reference(param_name: str, source: str) -> str:
         # validate env conflicts only when we're actually using master_port
         if master_port_source == 'env':
             validate_env_conflict('master_port')
-        torchrun_args['master_port'] = int(master_port_val)
+        try:
+            torchrun_args['master_port'] = int(master_port_val)
+        except (ValueError, TypeError) as e:
+            raise ValueError(f"Invalid value for master_port: {master_port_val!r}. Must be an integer.") from e
 
+    # Note: If neither master_addr nor rdzv_endpoint is set, torchrun will use
+    # its default behavior (typically localhost or other configured defaults)
     elif rdzv_endpoint_val:
         torchrun_args['rdzv_endpoint'] = rdzv_endpoint_val
 
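The last hunk applies the same guarded int() cast to master_port and documents the fallback order: an explicit master address wins, otherwise rdzv_endpoint is used, and with neither torchrun falls back to its own defaults. The sketch below illustrates the two resulting parameter shapes; the dict layouts are assumptions for illustration, not output captured from the library.

    # Style 1: explicit master address/port (the branch handling an explicit master address)
    explicit_master = {
        "nnodes": 2, "node_rank": 0, "nproc_per_node": "gpu",
        "master_addr": "10.0.0.1", "master_port": 29500,
    }

    # Style 2: rendezvous endpoint (the elif branch)
    rendezvous = {
        "nnodes": 2, "node_rank": 0, "nproc_per_node": "gpu",
        "rdzv_id": 1234, "rdzv_endpoint": "10.0.0.1:29500",
    }

    # With neither master_addr nor rdzv_endpoint set, torchrun's own defaults
    # (typically localhost) apply, per the note added in this commit.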