
Commit 2c7f53b

allow OSFT to be configurable
1 parent 87bd02a commit 2c7f53b

4 files changed: +33 additions, -30 deletions
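At the call-site level, this commit renames two keyword arguments: `batch_size` becomes `effective_batch_size` and `epochs` becomes `num_epochs`. Below is a minimal before/after sketch of the user-facing call, assuming the `osft` entry point is importable from `training_hub`; the import path and file paths are illustrative, with values mirroring the README example further down.

from training_hub import osft  # import path assumed for illustration

# before this commit: osft(..., batch_size=8, epochs=2, ...)
# after this commit:
result = osft(
    model_path="/path/to/model",   # placeholder path
    data_path="/path/to/data.jsonl",
    output_dir="/path/to/outputs",
    unfreeze_rank_ratio=0.3,
    effective_batch_size=8,        # formerly batch_size
    max_tokens_per_gpu=2048,
    max_seq_len=2048,
    learning_rate=2e-5,
    num_epochs=2,                  # formerly epochs
)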

examples/README.md

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ result = osft(
     data_path="/path/to/data.jsonl",
     output_dir="/path/to/outputs",
     unfreeze_rank_ratio=0.3,
-    batch_size=8,
+    effective_batch_size=8,
     max_tokens_per_gpu=2048,
     max_seq_len=2048,
     learning_rate=2e-5

examples/docs/osft_usage.md

Lines changed: 9 additions & 9 deletions
@@ -71,7 +71,7 @@ result = osft(
     data_path="/path/to/your/training/data.jsonl",
     output_dir="/path/to/save/outputs",
     unfreeze_rank_ratio=0.3,
-    batch_size=8,
+    effective_batch_size=16,
     max_tokens_per_gpu=2048,
     max_seq_len=2048,
     learning_rate=2e-5
@@ -83,11 +83,11 @@ result = osft(
     data_path="/path/to/your/training/data.jsonl",
     output_dir="/path/to/save/outputs",
     unfreeze_rank_ratio=0.2,
-    batch_size=4,
+    effective_batch_size=16,
     max_tokens_per_gpu=4096,
     max_seq_len=4096,
     learning_rate=1e-5,
-    epochs=3,
+    num_epochs=3,
     warmup_steps=100,
     use_liger=True,
     seed=42
@@ -114,7 +114,7 @@ result = osft_algo.train(
     max_tokens_per_gpu=3072,
     max_seq_len=2048,
     learning_rate=1.5e-5,
-    epochs=2
+    num_epochs=2
 )
 
 # Check required parameters
@@ -149,7 +149,7 @@ OSFTAlgorithm = AlgorithmRegistry.get_algorithm('osft')
 - `data_path` (str): Path to the training data (processed or unprocessed)
 - `output_dir` (str): Directory where outputs from training will be saved
 - `unfreeze_rank_ratio` (float): Controls the amount that each matrix is unfrozen during OSFT (0.0-1.0)
-- `batch_size` (int): Batch size for training
+- `effective_batch_size` (int): Batch size for training
 - `max_tokens_per_gpu` (int): Maximum number of tokens placed on a single GPU
 - `max_seq_len` (int): Maximum sequence length (in tokens) for training samples
 - `learning_rate` (float): Learning rate for model update size
@@ -165,7 +165,7 @@ OSFTAlgorithm = AlgorithmRegistry.get_algorithm('osft')
 - `unmask_messages` (bool): Whether to unmask messages during data processing
 
 **Core Training Parameters:**
-- `epochs` (int): Number of epochs to train for
+- `num_epochs` (int): Number of epochs to train for
 - `seed` (int): Random seed for training
 - `use_liger` (bool): Whether to use Liger kernels for training
 
@@ -200,7 +200,7 @@ try:
     data_path="/valid/data/path",
     output_dir="/valid/output/path",
     unfreeze_rank_ratio=0.3,
-    batch_size=8,
+    effective_batch_size=8,
     max_tokens_per_gpu=2048,
     max_seq_len=2048,
     learning_rate=2e-5
@@ -232,7 +232,7 @@ result = osft(
     data_path="/path/to/data.jsonl",
     output_dir="/path/to/outputs",
     unfreeze_rank_ratio=0.3,
-    batch_size=4,
+    effective_batch_size=4,
     max_tokens_per_gpu=2048,
     max_seq_len=2048,
     learning_rate=2e-5,
@@ -250,7 +250,7 @@ result = osft(
     data_path="/path/to/data.jsonl",
     output_dir="/path/to/outputs",
     unfreeze_rank_ratio=0.25,
-    batch_size=2,
+    effective_batch_size=2,
     max_tokens_per_gpu=1024,
     max_seq_len=2048,
     learning_rate=1e-5,
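The lower-level `OSFTAlgorithm` path documented in osft_usage.md changes the same way. Here is a sketch of the renamed `train()` call, assuming `osft_algo` has already been constructed as shown earlier in that doc (via `AlgorithmRegistry.get_algorithm('osft')`); paths and sizes are placeholders.

# assumes `osft_algo` was built as in osft_usage.md; values below are placeholders
result = osft_algo.train(
    model_path="/path/to/model",
    data_path="/path/to/data.jsonl",
    output_dir="/path/to/outputs",
    unfreeze_rank_ratio=0.3,
    effective_batch_size=8,    # renamed from batch_size
    max_tokens_per_gpu=3072,
    max_seq_len=2048,
    learning_rate=1.5e-5,
    num_epochs=2,              # renamed from epochs
)

# the rename also shows up in the parameter listings
print(osft_algo.get_required_params())   # now lists 'effective_batch_size' instead of 'batch_size'
print(osft_algo.get_optional_params())   # now lists 'num_epochs' instead of 'epochs'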

src/training_hub/algorithms/osft.py

Lines changed: 23 additions & 19 deletions
@@ -1,7 +1,5 @@
 import os
-import shutil
-from typing import Literal, get_origin, get_args, Union
-from itertools import chain
+from typing import get_origin, get_args, Union
 from dataclasses import fields
 
 import datasets
@@ -28,7 +26,7 @@ def train(
     model_path: str,
     data_path: str,
     unfreeze_rank_ratio: float,
-    batch_size: int,
+    effective_batch_size: int,
     max_tokens_per_gpu: int,
     max_seq_len: int,
     learning_rate: float,
@@ -52,7 +50,7 @@ def train(
     save_final_checkpoint: bool | None = None,
 
     # parameters for the training mode
-    epochs: int | None = None,
+    num_epochs: int | None = None,
 
     # whether to use the processed dataset
     use_processed_dataset: bool | None = None,
@@ -87,7 +85,7 @@ def train(
         unfreeze_rank_ratio (float):
             Controls the amount that each matrix is unfrozen during OSFT.
             Valid values are between 0.0 and 1.0.
-        batch_size (int): Batch size for training.
+        effective_batch_size (int): Effective batch size for training.
         max_tokens_per_gpu (int):
            The maximum number of tokens placed on a single GPU for training.
            When hitting OOMs, consider reducing this value.
@@ -109,7 +107,7 @@ def train(
         lr_scheduler_kwargs (dict[str, str]): Additional scheduler parameters.
         checkpoint_at_epoch (bool): Whether to checkpoint at each epoch.
         save_final_checkpoint (bool): Whether to save final checkpoint once training is complete.
-        epochs (int): Number of epochs to train for.
+        num_epochs (int): Number of epochs to train for.
         use_processed_dataset (bool):
            Whether to use the processed dataset. If False, the data is assumed to be in standard
            messages format with a `messages` and optional `unmask` field on each sample.
@@ -137,7 +135,7 @@ def train(
     required_params = {
         'model_path': model_path,
         'data_path': data_path,
-        'batch_size': batch_size,
+        'effective_batch_size': effective_batch_size,
         'max_tokens_per_gpu': max_tokens_per_gpu,
         'max_seq_len': max_seq_len,
         'learning_rate': learning_rate,
@@ -161,7 +159,7 @@ def train(
         'checkpoint_at_epoch': checkpoint_at_epoch,
         'save_final_checkpoint': save_final_checkpoint,
 
-        'epochs': epochs,
+        'num_epochs': num_epochs,
 
         'use_liger': use_liger,
         'seed': seed,
@@ -196,7 +194,7 @@ def get_required_params(self) -> dict[str, type]:
            'model_path': str,
            'data_path': str,
            'unfreeze_rank_ratio': float,
-           'batch_size': int,
+           'effective_batch_size': int,
            'max_tokens_per_gpu': int,
            'max_seq_len': int,
            'learning_rate': float,
@@ -214,7 +212,7 @@ def get_optional_params(self) -> dict[str, type]:
            'lr_scheduler_kwargs': dict[str, str],
            'checkpoint_at_epoch': bool,
            'save_final_checkpoint': bool,
-           'epochs': int,
+           'num_epochs': int,
            'use_processed_dataset': bool,
            'unmask_messages': bool,
            'nproc_per_node': int,
@@ -320,7 +318,8 @@ def execute_training(self, algorithm_params: dict[str, any]) -> any:
            'target_patterns': 'osft_target_patterns',
            'unfreeze_rank_ratio': 'osft_unfreeze_rank_ratio',
            'model_path': 'model_name_or_path',
-           'epochs': 'max_epochs',
+           'num_epochs': 'max_epochs',
+           'effective_batch_size': 'batch_size',
        }
 
        # Rename parameters before sending to backend
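The new entries in the rename map above ('num_epochs' -> 'max_epochs', 'effective_batch_size' -> 'batch_size') keep the backend's parameter names unchanged while the public API uses the new ones. A standalone sketch of how such a map can be applied follows; `apply_renames` is an illustrative helper, not the function used in this file.

# Illustrative helper: translate public parameter names into the backend
# trainer's names before dispatch, leaving unmapped keys untouched.
rename_map = {
    'num_epochs': 'max_epochs',
    'effective_batch_size': 'batch_size',
}

def apply_renames(params: dict, renames: dict) -> dict:
    """Return a copy of params with user-facing keys swapped for backend keys."""
    return {renames.get(key, key): value for key, value in params.items()}

print(apply_renames({'num_epochs': 3, 'effective_batch_size': 16, 'seed': 42}, rename_map))
# -> {'max_epochs': 3, 'batch_size': 16, 'seed': 42}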
@@ -346,11 +345,16 @@ def execute_training(self, algorithm_params: dict[str, any]) -> any:
        # adjust arguments to align with the API definition
        training_args_pre = {k: v for k, v in algorithm_params.items() if k in training_args_fields and v is not None}
        training_args_pre['data_path'] = training_ready_data_path # replaces raw data path with processed
-
+
        # mini trainer can support multiple modes, but we don't expose this feature by default
        # to prevent the current API from becoming overly complicated
-       training_args_pre['training_mode'] = TrainingMode(training_args_pre.get('training_mode', 'epoch'))
-       training_args_pre['osft'] = True
+       if not isinstance(train_mode := training_args_pre.get('training_mode', TrainingMode.EPOCH), TrainingMode):
+           train_mode = TrainingMode(train_mode)
+       training_args_pre['training_mode'] = train_mode
+
+       # user may want to control this API field for debug purposes, so we allow for it to be read
+       # but default it to True
+       training_args_pre['osft'] = training_args_pre.get('osft', True)
 
        torchrun_args_pre = {k: v for k, v in algorithm_params.items() if k in torchrun_args_fields and v is not None}
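With this hunk, `training_mode` may arrive either as a string or as a `TrainingMode` member and is normalized before use, while the `osft` flag is now read from the incoming parameters (defaulting to True) instead of being hard-coded. Below is a self-contained sketch of the same normalization pattern; the `TrainingMode` enum here is a stand-in for illustration, not the backend's actual class.

from enum import Enum

class TrainingMode(Enum):
    # stand-in; the real enum lives in the training backend and may have more members
    EPOCH = 'epoch'

def normalize_mode(value):
    """Accept a TrainingMode member or its string value and return a TrainingMode."""
    if not isinstance(mode := value, TrainingMode):
        mode = TrainingMode(mode)   # raises ValueError for unknown strings
    return mode

print(normalize_mode('epoch'))             # TrainingMode.EPOCH
print(normalize_mode(TrainingMode.EPOCH))  # TrainingMode.EPOCH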

@@ -419,7 +423,7 @@ def osft(
     data_path: str,
     output_dir: str,
     unfreeze_rank_ratio: float,
-    batch_size: int,
+    effective_batch_size: int,
     max_tokens_per_gpu: int,
     max_seq_len: int,
     learning_rate: float,
@@ -435,7 +439,7 @@ def osft(
     lr_scheduler_kwargs: dict[str, str] | None = None,
     checkpoint_at_epoch: bool | None = None,
     save_final_checkpoint: bool | None = None,
-    epochs: int | None = None,
+    num_epochs: int | None = None,
     # Torchrun parameters for multi-node support
     nproc_per_node: int | None = None,
     nnodes: int | None = None,
@@ -452,7 +456,7 @@ def osft(
        data_path=data_path,
        output_dir=output_dir,
        unfreeze_rank_ratio=unfreeze_rank_ratio,
-       batch_size=batch_size,
+       effective_batch_size=effective_batch_size,
        max_tokens_per_gpu=max_tokens_per_gpu,
        max_seq_len=max_seq_len,
        learning_rate=learning_rate,
@@ -466,7 +470,7 @@ def osft(
        lr_scheduler_kwargs=lr_scheduler_kwargs,
        checkpoint_at_epoch=checkpoint_at_epoch,
        save_final_checkpoint=save_final_checkpoint,
-       epochs=epochs,
+       num_epochs=num_epochs,
        nproc_per_node=nproc_per_node,
        nnodes=nnodes,
        node_rank=node_rank,
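Because `osft()` forwards its keyword arguments straight to `train()`, multi-node runs pick up the renamed parameters as well. Here is a sketch combining them with the torchrun parameters visible in the signature above (`nproc_per_node`, `nnodes`, `node_rank`); the import path, file paths, and sizes are assumptions for illustration.

from training_hub import osft  # import path assumed

result = osft(
    model_path="/path/to/model",
    data_path="/path/to/data.jsonl",
    output_dir="/path/to/outputs",
    unfreeze_rank_ratio=0.3,
    effective_batch_size=8,   # renamed from batch_size
    max_tokens_per_gpu=2048,
    max_seq_len=2048,
    learning_rate=2e-5,
    num_epochs=2,             # renamed from epochs
    nproc_per_node=8,         # GPUs per node
    nnodes=2,                 # total number of nodes
    node_rank=0,              # rank of this node
)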

src/training_hub/utils.py

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 from typing import get_origin, get_args
-import sys
 
 def format_type_name(tp):
     # Handle None
