4 changes: 3 additions & 1 deletion examples/README.md
@@ -24,8 +24,9 @@ The SFT algorithm supports training language models on supervised datasets with
**Scripts:**
- [LAB Multi-Phase Training Script](scripts/lab_multiphase_training.py) - Example script for LAB multi-phase training with full command-line interface
- [SFT with Qwen 2.5 7B](scripts/sft_qwen_example.py) - Single-node multi-GPU training example with Qwen 2.5 7B Instruct
- [SFT with Llama 3.1 8B](scripts/sft_llama_example.py) - Single-node multi-GPU training example with Llama 3.1 8B Instruct
- [SFT with Llama 3.1 8B](scripts/sft_llama_example.py) - Single-node multi-GPU training example with Llama 3.1 8B Instruct
- [SFT with Phi 4 Mini](scripts/sft_phi_example.py) - Single-node multi-GPU training example with Phi 4 Mini Instruct
- [SFT with GPT-OSS 20B](scripts/sft_gpt_oss_example.py) - Single-node multi-GPU training example with GPT-OSS 20B

**Quick Example:**
```python
@@ -58,6 +59,7 @@ The OSFT algorithm supports continual training of pre-trained or instruction-tun
- [OSFT with Qwen 2.5 7B](scripts/osft_qwen_example.py) - Single-node multi-GPU training example with Qwen 2.5 7B Instruct
- [OSFT with Llama 3.1 8B](scripts/osft_llama_example.py) - Single-node multi-GPU training example with Llama 3.1 8B Instruct
- [OSFT with Phi 4 Mini](scripts/osft_phi_example.py) - Single-node multi-GPU training example with Phi 4 Mini Instruct
- [OSFT with GPT-OSS 20B](scripts/osft_gpt_oss_example.py) - Single-node multi-GPU training example with GPT-OSS 20B
- [OSFT Continual Learning Example](scripts/osft_continual_learning_example.py) - Example script demonstrating continual learning without catastrophic forgetting

**Quick Example:**
4 changes: 3 additions & 1 deletion examples/docs/osft_usage.md
@@ -90,6 +90,7 @@ result = osft(
num_epochs=3,
warmup_steps=100,
use_liger=True,
osft_memory_efficient_init=True, # Recommended for OOMs at model load time
seed=42
)
```
@@ -168,6 +169,7 @@ OSFTAlgorithm = AlgorithmRegistry.get_algorithm('osft')
- `num_epochs` (int): Number of epochs to train for
- `seed` (int): Random seed for training
- `use_liger` (bool): Whether to use Liger kernels for training
- `osft_memory_efficient_init` (bool): Enable memory-efficient initialization to reduce peak memory during model loading (recommended if you hit OOMs at load time)

**Learning Rate Scheduler:**
- `lr_scheduler` (str): Name of the PyTorch learning rate scheduler to use
@@ -266,7 +268,7 @@ result = osft(

1. **unfreeze_rank_ratio**: Start with values between 0.1-0.5. Values >0.5 are rarely needed for general continual-learning regimes.

2. **Memory Management**: OSFT doesn't reduce memory requirements compared to SFT, so adjust `max_tokens_per_gpu` accordingly.
2. **Memory Management**: OSFT doesn't reduce memory requirements compared to SFT, so adjust `max_tokens_per_gpu` accordingly. For memory-constrained environments or OOMs during model loading, set `osft_memory_efficient_init=True` (see the sketch after these tips).

3. **Data Processing**: The algorithm handles data processing automatically. Use `use_processed_dataset=True` only if you have pre-tokenized data.

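For example, here is a minimal sketch that combines the tips above (placeholder paths and illustrative values only; tune them for your model and hardware):

```python
from training_hub import osft

# Sketch only: placeholder paths and illustrative values based on the tips above.
osft(
    model_path="openai/gpt-oss-20b",        # base model to adapt
    data_path="/path/to/data.jsonl",        # raw JSONL; processed automatically (tip 3)
    ckpt_output_dir="/path/to/checkpoints",
    unfreeze_rank_ratio=0.25,               # tip 1: start in the 0.1-0.5 range
    max_tokens_per_gpu=8192,                # tip 2: size this as you would for SFT
    osft_memory_efficient_init=True,        # tip 2: helps with OOMs at model load time
    num_epochs=3,
    learning_rate=3e-6,
    seed=42,
)
```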
175 changes: 175 additions & 0 deletions examples/scripts/osft_gpt_oss_example.py
@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
OSFT Training Example: GPT-OSS 20B
This script demonstrates OSFT (Orthogonal Subspace Fine-Tuning) training with the GPT-OSS 20B model
using a single-node, multi-GPU setup with training_hub.
OSFT allows continual training without catastrophic forgetting, making it ideal for:
- Adapting GPT-OSS 20B to specialized domains (medical, legal, technical)
- Adding new knowledge without degrading general capabilities
- Fine-tuning without complex replay mechanisms
Example usage:
python osft_gpt_oss_example.py \
--data-path /path/to/data.jsonl \
--ckpt-output-dir /path/to/checkpoints
"""

import os
import sys
import time
from datetime import datetime
import argparse
import glob

from training_hub import osft

def find_most_recent_checkpoint(output_dir):
"""
Find the most recent checkpoint in the training output directory.
Args:
output_dir (str): Training output directory containing hf_format/ subdirectory
Returns:
str: Path to the most recent checkpoint
Raises:
ValueError: If no checkpoints are found
"""
# Get all checkpoint directories under hf_format
checkpoint_pattern = os.path.join(output_dir, "hf_format", "samples_*.0")
checkpoint_dirs = glob.glob(checkpoint_pattern)

if not checkpoint_dirs:
raise ValueError(f"No checkpoints found in {os.path.join(output_dir, 'hf_format')}")

# Find the most recently created checkpoint
most_recent_checkpoint = max(checkpoint_dirs, key=os.path.getctime)

return most_recent_checkpoint


def main():
parser = argparse.ArgumentParser(description='OSFT Training Example: GPT-OSS 20B')

# Required parameters
parser.add_argument('--data-path', required=True,
help='Path to training data (JSONL format)')
parser.add_argument('--ckpt-output-dir', required=True,
help='Directory to save checkpoints')

# Optional overrides
parser.add_argument('--model-path', default='openai/gpt-oss-20b',
help='Model path or HuggingFace name (default: openai/gpt-oss-20b)')
parser.add_argument('--num-epochs', type=int, default=3,
help='Number of epochs (default: 3)')
parser.add_argument('--unfreeze-rank-ratio', type=float, default=0.25,
help='Unfreeze rank ratio for OSFT (0.0-1.0, default: 0.25)')
parser.add_argument('--max-tokens-per-gpu', type=int, default=8192,
help='Max tokens per GPU (default: 8192 for GPT-OSS 20B)')
parser.add_argument('--nproc-per-node', type=int, default=8,
help='Number of GPUs (default: 8)')
parser.add_argument('--unmask-messages', action='store_true', default=False,
help='Unmask messages during training (default: False)')
parser.add_argument('--learning-rate', type=float, default=3e-6,
help='Learning rate for training (default: 3e-6)')

args = parser.parse_args()

# GPT-OSS 20B OSFT configuration
print("🚀 OSFT Training: GPT-OSS 20B")
print("=" * 50)
print(f"Model: {args.model_path}")
print(f"Data: {args.data_path}")
print(f"Output: {args.ckpt_output_dir}")
print(f"GPUs: {args.nproc_per_node}")
print(f"Unfreeze Rank Ratio: {args.unfreeze_rank_ratio}")
print(f"Max tokens per GPU: {args.max_tokens_per_gpu:,}")
print()
print("📝 OSFT Benefits for GPT-OSS 20B:")
print(" • Preserve GPT-OSS's strong general capabilities")
print(" • Add domain-specific knowledge efficiently")
print(" • No need for complex data mixing or replay buffers")
print(" • Leverage the high-quality 20B parameter base")
print()

# Training configuration optimized for GPT-OSS 20B with OSFT
start_time = time.time()

try:
osft_params = {
# Model and data
'model_path': args.model_path,
'data_path': args.data_path,
'ckpt_output_dir': args.ckpt_output_dir,

# OSFT-specific parameters
'unfreeze_rank_ratio': args.unfreeze_rank_ratio, # Conservative for 20B model

# Training parameters optimized for GPT-OSS 20B
'num_epochs': args.num_epochs,
'effective_batch_size': 32, # Smaller batch size for 20B model
'learning_rate': args.learning_rate, # Lower LR for larger model
'max_seq_len': 4096, # Conservative context length for memory
'max_tokens_per_gpu': args.max_tokens_per_gpu,

# Data processing
'data_output_dir': "/dev/shm", # Use RAM disk for speed
'warmup_steps': 0,
'unmask_messages': args.unmask_messages,

# Optimization
'use_liger': True, # Enable Liger kernels for efficiency
'osft_memory_efficient_init': True, # Recommended for OOMs at model load time
'seed': 42,
'lr_scheduler': 'cosine', # Cosine scheduler works well with OSFT

# Checkpointing
'checkpoint_at_epoch': True,
'save_final_checkpoint': True,

# Single-node multi-GPU setup
'nproc_per_node': args.nproc_per_node,
'nnodes': 1,
'node_rank': 0,
'rdzv_id': 105,
'rdzv_endpoint': "127.0.0.1:29500",
}


osft(**osft_params)

end_time = time.time()
duration = end_time - start_time

most_recent_checkpoint = find_most_recent_checkpoint(args.ckpt_output_dir)

print("=" * 50)
print("✅ OSFT Training completed successfully!")
print(f"⏱️ Duration: {duration/3600:.2f} hours")
print(f"📁 Checkpoints: {args.ckpt_output_dir}/hf_format")
print(f" Most recent checkpoint: {most_recent_checkpoint}")
print()
print("🎯 Your GPT-OSS 20B model has been successfully adapted!")
print(" The model now incorporates your domain-specific knowledge")
print(" while maintaining its original high-quality capabilities.")

except Exception as e:
end_time = time.time()
duration = end_time - start_time

print("=" * 50)
print(f"❌ Training failed after {duration/60:.1f} minutes")
print(f"Error: {e}")
print()
print("💡 Troubleshooting tips:")
print(" - Reduce --max-tokens-per-gpu if you see OOM errors")
print(" - For domain adaptation, try --unfreeze-rank-ratio between 0.2-0.3")
print(" - Consider reducing batch size further for memory constraints")
sys.exit(1)


if __name__ == "__main__":
main()
117 changes: 117 additions & 0 deletions examples/scripts/sft_gpt_oss_example.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
SFT Training Example: GPT-OSS 20B

This script demonstrates SFT training with the GPT-OSS 20B model from OpenAI
using a single-node, multi-GPU setup with training_hub.

GPT-OSS 20B is a high-quality open source model that provides excellent
performance for supervised fine-tuning tasks.

Example usage:
python sft_gpt_oss_example.py \
--data-path /path/to/data.jsonl \
--ckpt-output-dir /path/to/checkpoints
"""

import os
import sys
import time
from datetime import datetime
import argparse

from training_hub import sft


def main():
parser = argparse.ArgumentParser(description='SFT Training Example: GPT-OSS 20B')

# Required parameters
parser.add_argument('--data-path', required=True,
help='Path to training data (JSONL format)')
parser.add_argument('--ckpt-output-dir', required=True,
help='Directory to save checkpoints')

# Optional overrides
parser.add_argument('--model-path', default='openai/gpt-oss-20b',
help='Model path or HuggingFace name (default: openai/gpt-oss-20b)')
parser.add_argument('--num-epochs', type=int, default=3,
help='Number of epochs (default: 3)')
parser.add_argument('--max-tokens-per-gpu', type=int, default=12000,
help='Max tokens per GPU (default: 12000 for 20B model)')
parser.add_argument('--nproc-per-node', type=int, default=8,
help='Number of GPUs (default: 8)')

args = parser.parse_args()

# GPT-OSS 20B configuration
print("🚀 SFT Training: GPT-OSS 20B")
print("=" * 50)
print(f"Model: {args.model_path}")
print(f"Data: {args.data_path}")
print(f"Output: {args.ckpt_output_dir}")
print(f"GPUs: {args.nproc_per_node}")
print(f"Max tokens per GPU: {args.max_tokens_per_gpu:,}")
print()

# Training configuration optimized for GPT-OSS 20B
start_time = time.time()

try:
result = sft(
# Model and data
model_path=args.model_path,
data_path=args.data_path,
ckpt_output_dir=args.ckpt_output_dir,

# Training parameters optimized for GPT-OSS 20B
num_epochs=args.num_epochs,
effective_batch_size=32, # Smaller batch size for 20B model
learning_rate=2e-5, # Conservative LR for larger model
max_seq_len=8192, # Standard context length
max_tokens_per_gpu=args.max_tokens_per_gpu,

# Data processing
data_output_dir="/dev/shm", # Use RAM disk for speed
warmup_steps=100,
save_samples=0, # 0 disables sample-based checkpointing, use epoch-based only

# Checkpointing
checkpoint_at_epoch=True,
accelerate_full_state_at_epoch=False, # Disable for smaller checkpoints (no auto-resumption)

# Single-node multi-GPU setup
nproc_per_node=args.nproc_per_node,
nnodes=1,
node_rank=0,
rdzv_id=104,
rdzv_endpoint="127.0.0.1:29500",
)

end_time = time.time()
duration = end_time - start_time

print("=" * 50)
print("✅ Training completed successfully!")
print(f"⏱️ Duration: {duration/3600:.2f} hours")
print(f"📁 Checkpoints: {args.ckpt_output_dir}/hf_format/")
print()
print("🎯 Your GPT-OSS 20B model has been fine-tuned!")
print(" The model is now specialized for your specific task")
print(" while maintaining the high quality of the base model.")

except Exception as e:
end_time = time.time()
duration = end_time - start_time

print("=" * 50)
print(f"❌ Training failed after {duration/60:.1f} minutes")
print(f"Error: {e}")
print()
print("💡 Troubleshooting tips:")
print(" - Reduce --max-tokens-per-gpu if you see OOM errors")
sys.exit(1)


if __name__ == "__main__":
main()
28 changes: 11 additions & 17 deletions pyproject.toml
@@ -14,12 +14,13 @@ dependencies = [
"setuptools>=80.0",
"packaging>=24.2",
"wheel>=0.43",
"instructlab-training>=0.11.1",
"rhai-innovation-mini-trainer>=0.1.1",
"instructlab-training>=0.12.0",
"rhai-innovation-mini-trainer>=0.2.0",
"torch>=2.6.0",
"transformers>=4.55.0",
"numba>=0.50",

⚠️ Potential issue

🧩 Analysis chain

Numba/llvmlite/numpy compatibility risk.

The trio numba>=0.50, llvmlite>=0.42, and numpy<2.3 is likely to produce resolver or ABI mismatches. Prefer matching Numba’s documented pairings (Numba pins llvmlite tightly and constrains NumPy).

-    "numba>=0.50",
-    "llvmlite>=0.42",
-    "numpy>=1.26.4,<2.3",
+    # Align versions to avoid ABI conflicts (NumPy 2+ support)
+    "numba>=0.60,<0.61",
+    "llvmlite>=0.43,<0.44",
+    "numpy>=1.26.4,<2.3",

Please validate against your CI matrix and the GPUs you target.

Also applies to: 38-39


🏁 Script executed:

#!/bin/bash
set -euo pipefail
echo "=== locate pyproject.toml ==="
fd -HI --hidden pyproject.toml || true
for f in $(fd -HI --hidden pyproject.toml || true); do
  echo "---- $f ----"
  nl -ba "$f" | sed -n '1,240p'
done
echo
echo "=== Search for numba/llvmlite/numpy occurrences across repo ==="
rg -n --hidden -S -C2 '\b(numba|llvmlite|numpy)\b' || true

Length of output: 110


🌐 Web query:

Numba llvmlite compatibility matrix
Numba supported NumPy versions site:numba.pydata.org
Numba 0.60 release notes llvmlite
llvmlite 0.43 release notes

💡 Result:

Here are the sources you requested and the key compatibility facts:

  • llvmlite v0.43.0 release notes (includes date and highlights). [1] (llvmlite.readthedocs.io)
  • llvmlite compatibility / historical matrix (README) — shows which LLVM versions each llvmlite series supports. [2] (github.com)
  • Numba 0.60.0 release notes (highlights) — Numba 0.60.0 released 2024-06-13 and documents NumPy/llvmlite compatibility. See the release notes for details. [3] (numba.readthedocs.io)
  • Numba 0.60 installation / version-support table (canonical compatibility matrix): Numba 0.60.0 — Python 3.9–<3.13, NumPy 1.22 ≤ version < 1.27 (and == 2.0), llvmlite 0.43.x, LLVM 14.x. [4] (numba.readthedocs.io)
  • For archived Numba docs showing supported NumPy ranges for earlier Numba releases (examples: 0.48, 0.43, etc.). Use these pages if you need per-release NumPy support details. [5][6] (numba.pydata.org)

If you want, I can extract the full compatibility table (Numba release → supported Python, NumPy, llvmlite, LLVM) from the 0.60 docs into a CSV or formatted table.



🏁 Script executed:

#!/bin/bash
set -euo pipefail
echo "=== locate pyproject.toml ==="
fd -HI --hidden pyproject.toml || true
for f in $(fd -HI --hidden pyproject.toml || true); do
  echo "---- $f ----"
  sed -n '1,240p' "$f" || true
done

echo
echo "=== Search for numba/llvmlite/numpy occurrences across repo ==="
rg -n --hidden -S -C2 '\b(numba|llvmlite|numpy)\b' || true

echo
echo "=== Search for CI/workflows that reference numba/llvmlite/numpy or GPUs ==="
rg -n --hidden -S --glob '.github/**' 'numba|llvmlite|numpy|cuda|gpu|nvidia|rocm' || true

echo
echo "=== List .github/workflows files (first 240 lines) ==="
fd -HI --hidden .github/workflows || true
for f in $(fd -HI --hidden .github/workflows || true); do
  echo "---- $f ----"
  sed -n '1,240p' "$f" || true
done

Length of output: 3531


Align Numba/llvmlite with supported NumPy to avoid ABI/resolver conflicts.

pyproject.toml currently has "numba>=0.50" (line 21) and "llvmlite>=0.42" (line 38) while NumPy is ">=1.26.4,<2.3". Numba releases tightly pair with specific llvmlite and NumPy ranges (e.g., Numba 0.60 ↔ llvmlite 0.43); loose/mismatched ranges risk resolver or ABI failures. Pin numba/llvmlite to a tested pairing or adjust NumPy bounds and validate on CI + target GPUs.

-    "numba>=0.50",
-    "llvmlite>=0.42",
-    "numpy>=1.26.4,<2.3",
+    # Align versions to avoid ABI conflicts (NumPy 2+ support)
+    "numba>=0.60,<0.61",
+    "llvmlite>=0.43,<0.44",
+    "numpy>=1.26.4,<2.3",

pyproject.toml: lines 21, 38.


"datasets>=2.15.0",
"numpy>=1.26.4,<2.0.0",
"datasets>=4.0.0",
"numpy>=1.26.4,<2.3",
"rich>=14.1.0",
"peft>=0.15",
"pydantic>=2.7.0",
@@ -43,27 +44,20 @@ dependencies = [
"requests>=2.32.5",
"attr>=0.3.2",
"filelock>=3.19.1",
"mpmath>=1.3.0"
"mpmath>=1.3.0",
"pytest>=8.0"
]

dynamic = ["version"]

[project.optional-dependencies]
cuda = [
"instructlab-training[cuda]>=0.11.1",
"rhai-innovation-mini-trainer[cuda]>=0.1.1",
"flash-attn>=2.8",
"einops>=0.8"
]

gpt-oss = [
"transformers>=4.55.0",
"instructlab-training[cuda]>=0.12.0",
"rhai-innovation-mini-trainer[cuda]>=0.2.0",
"flash-attn>=2.8",
"einops>=0.8",
"datasets>=4.0.0",
"bitsandbytes==0.47.0",
"pytest>=8.0",
"kernels>=0.9.0"
"kernels>=0.9.0",
"bitsandbytes>=0.47.0",
]

dev = [