import os
from types import UnionType
from typing import Any, Literal, Union, get_args, get_origin
from dataclasses import fields

import datasets
from training_hub.algorithms import Algorithm, Backend, AlgorithmRegistry
from training_hub.utils import format_type_name

_AlgorithmParamsKeyLiteral = Literal['parameters', 'renames']


class OSFTAlgorithm(Algorithm):
    """Orthogonal Subspace Fine-Tuning algorithm."""

    def __init__(self, backend: Backend, **kwargs) -> None:
        self.backend = backend
        self.kwargs = kwargs

    def train(
        self,
        model_path: str,
        data_path: str,
        batch_size: int,
        max_tokens_per_gpu: int,
        max_seq_len: int,
        learning_rate: float,
        output_dir: str,
        unfreeze_rank_ratio: float,

        # patterns to match against when selecting modules for OSFT
        target_patterns: list[str] | None = None,

        # general training settings
        seed: int | None = None,
        use_liger: bool | None = None,
        unmask_messages: bool | None = None,

        # learning rate scheduler
        lr_scheduler: str | None = None,
        warmup_steps: int | None = None,
        lr_scheduler_kwargs: dict[str, str] | None = None,

        # checkpointing
        checkpoint_at_epoch: bool | None = None,
        save_final_checkpoint: bool | None = None,

        # parameters for the training mode
        epochs: int | None = None,

        # torchrun parameters for multi-node support
        nproc_per_node: int | None = None,
        nnodes: int | None = None,
        node_rank: int | None = None,
        rdzv_id: int | None = None,
        rdzv_endpoint: str | None = None,
        **kwargs,
    ) -> Any:
        """Execute OSFT training using MiniTrainer."""

        required_params = {
            'model_path': model_path,
            'data_path': data_path,
            'batch_size': batch_size,
            'max_tokens_per_gpu': max_tokens_per_gpu,
            'max_seq_len': max_seq_len,
            'learning_rate': learning_rate,
            'output_dir': output_dir,
            'unfreeze_rank_ratio': unfreeze_rank_ratio,
        }

        optional_params = {
            'target_patterns': target_patterns,

            # for data processing
            'unmask_messages': unmask_messages,

            # scheduler params
            'lr_scheduler': lr_scheduler,
            'lr_scheduler_kwargs': lr_scheduler_kwargs,
            'warmup_steps': warmup_steps,

            # checkpointing settings
            'checkpoint_at_epoch': checkpoint_at_epoch,
            'save_final_checkpoint': save_final_checkpoint,

            # mini-trainer supports a few different modes, but we fix this one for
            # simplicity; another mode can be selected by overriding via kwargs
            'training_mode': 'epoch',
            'epochs': epochs,

            'use_liger': use_liger,
            'seed': seed,

            # torchrun params
            'nproc_per_node': nproc_per_node,
            'nnodes': nnodes,
            'node_rank': node_rank,
            'rdzv_id': rdzv_id,
            'rdzv_endpoint': rdzv_endpoint,
        }

        # note: data processing is delegated to the backend, which derives its
        # own internal data_output_path (something like `{output_dir}/_internal`)
        # from output_dir rather than having it passed in here

        # we keep a separate mapping of which parameters will be renamed,
        # so this function can make assertions about algorithm requirements
        # while the backend can use the original arguments without having to
        # re-map them in several places
        renames = {
            'use_liger': 'use_liger_kernels',
            'warmup_steps': 'num_warmup_steps',
            'target_patterns': 'osft_target_patterns',
            'unfreeze_rank_ratio': 'osft_unfreeze_rank_ratio',
            'model_path': 'model_name_or_path',
            'epochs': 'max_epochs',
        }
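        # e.g. with these renames, {'use_liger': True, 'epochs': 3} reaches the
        # backend as {'use_liger_kernels': True, 'max_epochs': 3}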

        # validate now that everything is set up
        for required_param in self.get_required_params():
            if required_params.get(required_param) is None:
                raise ValueError(f"error: required parameter not provided: {required_param}")

        # merge with a dict literal so kwargs may override defaults such as
        # training_mode (dict(**a, **b) would raise on duplicate keys)
        all_params = {
            **required_params,
            **optional_params,
            **kwargs,
        }

        # validate types of all parameters
        self._validate_param_types(all_params)

        # now we can build the algorithm params
        algorithm_params = dict(
            parameters=all_params,
            renames=renames,
        )
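        # the payload handed to the backend looks like (illustrative values):
        #   {'parameters': {'model_path': ..., 'batch_size': ..., ...},
        #    'renames': {'use_liger': 'use_liger_kernels', ...}}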

        return self.backend.execute_training(algorithm_params)

    def get_required_params(self) -> dict[str, type]:
        """Return a dictionary of required parameter names and their types."""
        return {
            'model_path': str,
            'data_path': str,
            'unfreeze_rank_ratio': float,
            'batch_size': int,
            'max_tokens_per_gpu': int,
            'max_seq_len': int,
            'learning_rate': float,
            'output_dir': str,
        }

    def get_optional_params(self) -> dict[str, type]:
        """Return a dictionary of optional parameter names and their types."""
        return {
            'target_patterns': list[str],
            'unmask_messages': bool,
            'lr_scheduler': str,
            'lr_scheduler_kwargs': dict[str, str],
            'warmup_steps': int,
            'checkpoint_at_epoch': bool,
            'save_final_checkpoint': bool,
            'training_mode': str,
            'epochs': int,  # pre-rename name; validation runs before renames apply
            'use_liger': bool,
            'seed': int,
            'nproc_per_node': int,
            'nnodes': int,
            'node_rank': int,
            'rdzv_id': int,
            'rdzv_endpoint': str,
        }

    def _validate_param_types(self, params: dict[str, Any]):
        """Type-check given parameters, handling modern Python typing constructs."""
        required_param_types = self.get_required_params()
        optional_param_types = self.get_optional_params()
        all_param_types = {**required_param_types, **optional_param_types}

        for param, value in params.items():
            # fall back to `Any` when the param is defined by neither the
            # required nor the optional parameter sets
            param_type = all_param_types.get(param, Any)

            # allow optional params to be None
            if param in optional_param_types and value is None:
                continue

            if not self._check_type(value, param_type):
                err_msg = (
                    f"error: param '{param}' received unexpected type, "
                    f"expected '{format_type_name(param_type)}' but got '{format_type_name(type(value))}'"
                )
                raise ValueError(err_msg)

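    # a quick illustration of the checks _check_type performs (hypothetical values):
    #   _check_type(["a", "b"], list[str])     -> True
    #   _check_type(["a", 1], list[str])       -> False
    #   _check_type(None, int | None)          -> True
    #   _check_type({"k": 1}, dict[str, str])  -> False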
    def _check_type(self, value, expected_type) -> bool:
        """Check if value matches expected_type, handling modern typing constructs."""
        # Handle `Any` (accepts anything)
        if expected_type is Any:
            return True

        # Handle basic types that work with isinstance
        try:
            if isinstance(expected_type, type):
                return isinstance(value, expected_type)
        except TypeError:
            pass  # Fall through to handle complex types

        # Handle parameterized generics and unions
        origin = get_origin(expected_type)
        args = get_args(expected_type)

        # Handle union types; the `X | None` syntax produces types.UnionType
        # rather than typing.Union, so check for both
        if origin is Union or origin is UnionType:
            return any(self._check_type(value, arg) for arg in args)

        # Handle list types
        if origin is list:
            if not isinstance(value, list):
                return False
            if args and value:  # Check element types if specified and list is not empty
                element_type = args[0]
                return all(self._check_type(item, element_type) for item in value)
            return True

        # Handle dict types
        if origin is dict:
            if not isinstance(value, dict):
                return False
            if args and value:  # Check key/value types if specified and dict is not empty
                key_type, val_type = args[0], args[1]
                return all(
                    self._check_type(k, key_type) and self._check_type(v, val_type)
                    for k, v in value.items()
                )
            return True

        # Fallback to a basic isinstance check
        try:
            return isinstance(value, expected_type)
        except TypeError:
            # If we can't check the type, assume it's valid
            return True


class MiniTrainerOSFTBackend(Backend):
    """MiniTrainer backend for the OSFT algorithm."""

    def execute_training(self, algorithm_params: dict[_AlgorithmParamsKeyLiteral, dict[str, Any]]) -> Any:
        """Execute OSFT training using MiniTrainer."""
        from mini_trainer import run_training, TrainingArgs, TorchrunArgs, TrainingMode

        # mini-trainer doesn't do its own data processing, so we use the
        # processor from instructlab-training
        from instructlab.training.data_process import process_messages_into_input_ids

        # first we need to process the data
        output_dir = algorithm_params['parameters']['output_dir']
        data_output_path = os.path.join(output_dir, '_internal_data_processing')
        os.makedirs(data_output_path, exist_ok=True)

        # if unmasking was requested, tag every sample so the processor
        # unmasks its messages
        training_params = algorithm_params['parameters']
        processing_data_path = training_params['data_path']
        unmask_messages = training_params.get('unmask_messages', False)
        if unmask_messages:
            ds = datasets.load_dataset(training_params['data_path'], split='train')
            ds = ds.map(lambda _: {"unmask": True})
            processing_data_path = os.path.join(data_output_path, 'intermediate_data.jsonl')
            ds.to_json(processing_data_path)

        # now we process the data
        process_messages_into_input_ids(
            data_path=processing_data_path,
            data_output_path=data_output_path,
            model_path=training_params['model_path'],
            max_seq_len=training_params['max_seq_len'],
            num_cpu_procs=8,
        )

        # the function above saves to this file, so we pass it to the trainer
        processed_data_path = os.path.join(data_output_path, 'data.jsonl')

        # This section converts the parameters we get from the Algorithm into the
        # form this backend (mini-trainer) expects. Since the algorithm exposes
        # simplified names, we map each param to the name the backend expects and
        # then place it into the correct dataclass.
        renames = algorithm_params['renames']
        training_params = {renames.get(k, k): v for k, v in algorithm_params['parameters'].items()}
        torchrun_args_fields = {f.name for f in fields(TorchrunArgs)}
        training_args_fields = {f.name for f in fields(TrainingArgs)}

        # adjust arguments to align with the API definition
        training_args_pre = {k: v for k, v in training_params.items() if k in training_args_fields and v is not None}
        training_args_pre['data_path'] = processed_data_path  # replaces the raw data path with the processed one
        training_args_pre['training_mode'] = TrainingMode(training_args_pre['training_mode'])
        torchrun_args_pre = {k: v for k, v in training_params.items() if k in torchrun_args_fields and v is not None}
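        # e.g. 'nproc_per_node' lands in TorchrunArgs while 'learning_rate' lands
        # in TrainingArgs; anything neither dataclass defines is silently dropped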

        # now we run training
        return run_training(
            torch_args=TorchrunArgs(**torchrun_args_pre),
            train_args=TrainingArgs(**training_args_pre),
        )


AlgorithmRegistry.register_algorithm('osft', OSFTAlgorithm)
AlgorithmRegistry.register_backend('osft', 'mini-trainer', MiniTrainerOSFTBackend)
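
# Example usage (a sketch; the paths and hyperparameters below are illustrative
# placeholders, not recommended values):
#
#   backend = MiniTrainerOSFTBackend()
#   osft = OSFTAlgorithm(backend=backend)
#   osft.train(
#       model_path='/models/my-base-model',
#       data_path='/data/messages.jsonl',
#       batch_size=32,
#       max_tokens_per_gpu=8192,
#       max_seq_len=4096,
#       learning_rate=2e-5,
#       output_dir='/tmp/osft-run',
#       unfreeze_rank_ratio=0.25,
#   )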