Update dependencies, examples, and docs for GPT-OSS #6
@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
OSFT Training Example: GPT-OSS 20B

This script demonstrates OSFT (Orthogonal Subspace Fine-Tuning) training with the
GPT-OSS 20B model using a single-node, multi-GPU setup with training_hub.

OSFT allows continual training without catastrophic forgetting, making it ideal for:
- Adapting GPT-OSS 20B to specialized domains (medical, legal, technical)
- Adding new knowledge without degrading general capabilities
- Fine-tuning without complex replay mechanisms

Example usage:
    python osft_gpt_oss_example.py \
        --data-path /path/to/data.jsonl \
        --ckpt-output-dir /path/to/checkpoints
"""

import os
import sys
import time
import argparse
import glob

from training_hub import osft


def find_most_recent_checkpoint(output_dir):
    """
    Find the most recent checkpoint in the training output directory.

    Args:
        output_dir (str): Training output directory containing an hf_format/ subdirectory

    Returns:
        str: Path to the most recent checkpoint

    Raises:
        ValueError: If no checkpoints are found
    """
    # Collect all checkpoint directories under hf_format
    checkpoint_pattern = os.path.join(output_dir, "hf_format", "samples_*.0")
    checkpoint_dirs = glob.glob(checkpoint_pattern)

    if not checkpoint_dirs:
        raise ValueError(f"No checkpoints found in {os.path.join(output_dir, 'hf_format')}")

    # Return the most recently created checkpoint
    return max(checkpoint_dirs, key=os.path.getctime)


def main():
    parser = argparse.ArgumentParser(description='OSFT Training Example: GPT-OSS 20B')

    # Required parameters
    parser.add_argument('--data-path', required=True,
                        help='Path to training data (JSONL format)')
    parser.add_argument('--ckpt-output-dir', required=True,
                        help='Directory to save checkpoints')

    # Optional overrides
    parser.add_argument('--model-path', default='openai/gpt-oss-20b',
                        help='Model path or HuggingFace name (default: openai/gpt-oss-20b)')
    parser.add_argument('--num-epochs', type=int, default=3,
                        help='Number of epochs (default: 3)')
    parser.add_argument('--unfreeze-rank-ratio', type=float, default=0.25,
                        help='Unfreeze rank ratio for OSFT (0.0-1.0, default: 0.25)')
    parser.add_argument('--max-tokens-per-gpu', type=int, default=8192,
                        help='Max tokens per GPU (default: 8192 for GPT-OSS 20B)')
    parser.add_argument('--nproc-per-node', type=int, default=8,
                        help='Number of GPUs (default: 8)')
    parser.add_argument('--unmask-messages', action='store_true', default=False,
                        help='Unmask messages during training (default: False)')
    parser.add_argument('--learning-rate', type=float, default=3e-6,
                        help='Learning rate for training (default: 3e-6)')

    args = parser.parse_args()

    # GPT-OSS 20B OSFT configuration
    print("🚀 OSFT Training: GPT-OSS 20B")
    print("=" * 50)
    print(f"Model: {args.model_path}")
    print(f"Data: {args.data_path}")
    print(f"Output: {args.ckpt_output_dir}")
    print(f"GPUs: {args.nproc_per_node}")
    print(f"Unfreeze Rank Ratio: {args.unfreeze_rank_ratio}")
    print(f"Max tokens per GPU: {args.max_tokens_per_gpu:,}")
    print()
    print("📝 OSFT Benefits for GPT-OSS 20B:")
    print("   • Preserves GPT-OSS's strong general capabilities")
    print("   • Adds domain-specific knowledge efficiently")
    print("   • No need for complex data mixing or replay buffers")
    print("   • Leverages the high-quality 20B-parameter base")
    print()

    # Training configuration optimized for GPT-OSS 20B with OSFT
    start_time = time.time()

    try:
        osft_params = {
            # Model and data
            'model_path': args.model_path,
            'data_path': args.data_path,
            'ckpt_output_dir': args.ckpt_output_dir,

            # OSFT-specific parameters
            'unfreeze_rank_ratio': args.unfreeze_rank_ratio,  # Conservative for a 20B model

            # Training parameters optimized for GPT-OSS 20B
            'num_epochs': args.num_epochs,
            'effective_batch_size': 32,            # Smaller batch size for a 20B model
            'learning_rate': args.learning_rate,   # Lower LR for a larger model
            'max_seq_len': 4096,                   # Conservative context length for memory
            'max_tokens_per_gpu': args.max_tokens_per_gpu,

            # Data processing
            'data_output_dir': "/dev/shm",         # Use RAM disk for speed
            'warmup_steps': 0,
            'unmask_messages': args.unmask_messages,

            # Optimization
            'use_liger': True,                     # Enable Liger kernels for efficiency
            'osft_memory_efficient_init': True,    # Recommended if you hit OOMs at model load time
            'seed': 42,
            'lr_scheduler': 'cosine',              # Cosine scheduler works well with OSFT

            # Checkpointing
            'checkpoint_at_epoch': True,
            'save_final_checkpoint': True,

            # Single-node multi-GPU setup
            'nproc_per_node': args.nproc_per_node,
            'nnodes': 1,
            'node_rank': 0,
            'rdzv_id': 105,
            'rdzv_endpoint': "127.0.0.1:29500",
        }

        osft(**osft_params)

        end_time = time.time()
        duration = end_time - start_time

        most_recent_checkpoint = find_most_recent_checkpoint(args.ckpt_output_dir)

        print("=" * 50)
        print("✅ OSFT training completed successfully!")
        print(f"⏱️  Duration: {duration/3600:.2f} hours")
        print(f"📁 Checkpoints: {args.ckpt_output_dir}/hf_format")
        print(f"   Most recent checkpoint: {most_recent_checkpoint}")
        print()
        print("🎯 Your GPT-OSS 20B model has been successfully adapted!")
        print("   The model now incorporates your domain-specific knowledge")
        print("   while maintaining its original high-quality capabilities.")

    except Exception as e:
        end_time = time.time()
        duration = end_time - start_time

        print("=" * 50)
        print(f"❌ Training failed after {duration/60:.1f} minutes")
        print(f"Error: {e}")
        print()
        print("💡 Troubleshooting tips:")
        print("   - Reduce --max-tokens-per-gpu if you see OOM errors")
        print("   - For domain adaptation, try --unfreeze-rank-ratio between 0.2 and 0.3")
        print("   - Consider reducing the batch size further under memory constraints")
        sys.exit(1)


if __name__ == "__main__":
    main()
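Both example scripts expect --data-path to point at a JSONL file. For a quick smoke test, a tiny dataset can be generated along the lines of the sketch below; the messages-format schema is an assumption based on common chat-style fine-tuning datasets, so confirm the exact fields against training_hub's data documentation:

#!/usr/bin/env python3
# make_toy_dataset.py -- write a minimal JSONL dataset for a smoke test.
# ASSUMPTION: the "messages" schema mirrors common chat fine-tuning datasets;
# verify the expected fields in training_hub's docs before a real run.
import json

records = [
    {"messages": [
        {"role": "user", "content": "Summarize the intake note for patient A."},
        {"role": "assistant", "content": "Patient A presents with ..."},
    ]},
]

with open("toy_data.jsonl", "w") as f:
    for record in records:
        # One JSON object per line, as JSONL requires
        f.write(json.dumps(record) + "\n")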
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
SFT Training Example: GPT-OSS 20B

This script demonstrates SFT training with the GPT-OSS 20B model from OpenAI
using a single-node, multi-GPU setup with training_hub.

GPT-OSS 20B is a high-quality open-source model that provides excellent
performance for supervised fine-tuning tasks.

Example usage:
    python sft_gpt_oss_example.py \
        --data-path /path/to/data.jsonl \
        --ckpt-output-dir /path/to/checkpoints
"""

import sys
import time
import argparse

from training_hub import sft


def main():
    parser = argparse.ArgumentParser(description='SFT Training Example: GPT-OSS 20B')

    # Required parameters
    parser.add_argument('--data-path', required=True,
                        help='Path to training data (JSONL format)')
    parser.add_argument('--ckpt-output-dir', required=True,
                        help='Directory to save checkpoints')

    # Optional overrides
    parser.add_argument('--model-path', default='openai/gpt-oss-20b',
                        help='Model path or HuggingFace name (default: openai/gpt-oss-20b)')
    parser.add_argument('--num-epochs', type=int, default=3,
                        help='Number of epochs (default: 3)')
    parser.add_argument('--max-tokens-per-gpu', type=int, default=12000,
                        help='Max tokens per GPU (default: 12000 for a 20B model)')
    parser.add_argument('--nproc-per-node', type=int, default=8,
                        help='Number of GPUs (default: 8)')

    args = parser.parse_args()

    # GPT-OSS 20B configuration
    print("🚀 SFT Training: GPT-OSS 20B")
    print("=" * 50)
    print(f"Model: {args.model_path}")
    print(f"Data: {args.data_path}")
    print(f"Output: {args.ckpt_output_dir}")
    print(f"GPUs: {args.nproc_per_node}")
    print(f"Max tokens per GPU: {args.max_tokens_per_gpu:,}")
    print()

    # Training configuration optimized for GPT-OSS 20B
    start_time = time.time()

    try:
        result = sft(
            # Model and data
            model_path=args.model_path,
            data_path=args.data_path,
            ckpt_output_dir=args.ckpt_output_dir,

            # Training parameters optimized for GPT-OSS 20B
            num_epochs=args.num_epochs,
            effective_batch_size=32,   # Smaller batch size for a 20B model
            learning_rate=2e-5,        # Conservative LR for a larger model
            max_seq_len=8192,          # Standard context length
            max_tokens_per_gpu=args.max_tokens_per_gpu,

            # Data processing
            data_output_dir="/dev/shm",  # Use RAM disk for speed
            warmup_steps=100,
            save_samples=0,  # 0 disables sample-based checkpointing; epoch-based only

            # Checkpointing
            checkpoint_at_epoch=True,
            accelerate_full_state_at_epoch=False,  # Smaller checkpoints (no auto-resumption)

            # Single-node multi-GPU setup
            nproc_per_node=args.nproc_per_node,
            nnodes=1,
            node_rank=0,
            rdzv_id=104,
            rdzv_endpoint="127.0.0.1:29500",
        )

        end_time = time.time()
        duration = end_time - start_time

        print("=" * 50)
        print("✅ Training completed successfully!")
        print(f"⏱️  Duration: {duration/3600:.2f} hours")
        print(f"📁 Checkpoints: {args.ckpt_output_dir}/hf_format/")
        print()
        print("🎯 Your GPT-OSS 20B model has been fine-tuned!")
        print("   The model is now specialized for your specific task")
        print("   while maintaining the high quality of the base model.")

    except Exception as e:
        end_time = time.time()
        duration = end_time - start_time

        print("=" * 50)
        print(f"❌ Training failed after {duration/60:.1f} minutes")
        print(f"Error: {e}")
        print()
        print("💡 Troubleshooting tips:")
        print("   - Reduce --max-tokens-per-gpu if you see OOM errors")
        sys.exit(1)


if __name__ == "__main__":
    main()
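Both scripts write checkpoints under <ckpt-output-dir>/hf_format/ in Hugging Face format, so a finished checkpoint should load with the standard transformers APIs. A minimal sketch, assuming a hypothetical checkpoint path (substitute the path the training script prints, or reuse find_most_recent_checkpoint from the OSFT example):

# load_checkpoint.py -- minimal sketch for loading a finished checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical path: use the checkpoint directory your training run printed.
ckpt = "/path/to/checkpoints/hf_format/samples_1234.0"

tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype="auto", device_map="auto")

# Quick generation check on the fine-tuned model
inputs = tokenizer("Briefly explain OSFT.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))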
@@ -14,12 +14,13 @@ dependencies = [
| "setuptools>=80.0", | ||||||||||||
| "packaging>=24.2", | ||||||||||||
| "wheel>=0.43", | ||||||||||||
| "instructlab-training>=0.11.1", | ||||||||||||
| "rhai-innovation-mini-trainer>=0.1.1", | ||||||||||||
| "instructlab-training>=0.12.0", | ||||||||||||
| "rhai-innovation-mini-trainer>=0.2.0", | ||||||||||||
| "torch>=2.6.0", | ||||||||||||
| "transformers>=4.55.0", | ||||||||||||
| "numba>=0.50", | ||||||||||||
|
Review comment on "numba>=0.50":

Align Numba/llvmlite with the supported NumPy range to avoid ABI/resolver conflicts. pyproject.toml currently has "numba>=0.50" and "llvmlite>=0.42" while NumPy is ">=1.26.4,<2.3". Numba releases pair tightly with specific llvmlite and NumPy ranges (e.g., Numba 0.60 ↔ llvmlite 0.43, and 0.60 is the first release with NumPy 2.0 support), so loose or mismatched ranges risk resolver or ABI failures. Pin numba/llvmlite to a tested pairing, or adjust the NumPy bounds, and validate against the CI matrix and the GPUs you target.

Suggested change:

-    "numba>=0.50",
-    "llvmlite>=0.42",
-    "numpy>=1.26.4,<2.3",
+    # Align versions to avoid ABI conflicts (NumPy 2+ support)
+    "numba>=0.60,<0.61",
+    "llvmlite>=0.43,<0.44",
+    "numpy>=1.26.4,<2.3",
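A quick runtime check along these lines can catch a mismatched install before a long training run starts. The pairing table is an assumption drawn from Numba's published compatibility notes (0.60 ↔ llvmlite 0.43, 0.61 ↔ llvmlite 0.44); adjust it to whatever versions CI actually tests:

# check_numba_stack.py -- sanity-check that numba/llvmlite versions pair up.
import numba
import llvmlite
import numpy

print(f"numba {numba.__version__}, llvmlite {llvmlite.__version__}, numpy {numpy.__version__}")

# ASSUMPTION: pairings taken from Numba's compatibility docs; keep in sync with CI.
known_pairings = {"0.60": "0.43", "0.61": "0.44"}

numba_minor = ".".join(numba.__version__.split(".")[:2])
expected = known_pairings.get(numba_minor)
if expected and not llvmlite.__version__.startswith(expected):
    raise SystemExit(
        f"llvmlite {llvmlite.__version__} does not match the tested pairing "
        f"for numba {numba.__version__} (expected {expected}.x)"
    )
print("numba/llvmlite pairing looks consistent.")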
||||||||||||
| "datasets>=2.15.0", | ||||||||||||
| "numpy>=1.26.4,<2.0.0", | ||||||||||||
| "datasets>=4.0.0", | ||||||||||||
| "numpy>=1.26.4,<2.3", | ||||||||||||
| "rich>=14.1.0", | ||||||||||||
| "peft>=0.15", | ||||||||||||
| "pydantic>=2.7.0", | ||||||||||||
|
@@ -43,27 +44,20 @@ dependencies = [
| "requests>=2.32.5", | ||||||||||||
| "attr>=0.3.2", | ||||||||||||
| "filelock>=3.19.1", | ||||||||||||
| "mpmath>=1.3.0" | ||||||||||||
| "mpmath>=1.3.0", | ||||||||||||
| "pytest>=8.0" | ||||||||||||
 ]

 dynamic = ["version"]

 [project.optional-dependencies]
-cuda = [
-    "instructlab-training[cuda]>=0.11.1",
-    "rhai-innovation-mini-trainer[cuda]>=0.1.1",
-    "flash-attn>=2.8",
-    "einops>=0.8"
-]
-
 gpt-oss = [
-    "transformers>=4.55.0",
+    "instructlab-training[cuda]>=0.12.0",
+    "rhai-innovation-mini-trainer[cuda]>=0.2.0",
+    "flash-attn>=2.8",
+    "einops>=0.8",
+    "datasets>=4.0.0",
-    "bitsandbytes==0.47.0",
     "pytest>=8.0",
-    "kernels>=0.9.0"
+    "kernels>=0.9.0",
+    "bitsandbytes>=0.47.0",
 ]

 dev = [
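With the standalone cuda extra removed and its contents folded into gpt-oss, a GPU setup would presumably be installed through that single extra, e.g. pip install "training-hub[gpt-oss]" — this assumes the distribution name matches the training_hub import name; check the project README for the exact command.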