Commit 58c87a3

First round coderabbit feedback
Signed-off-by: Mustafa Eyceoz <[email protected]>
1 parent 586b781 commit 58c87a3

3 files changed (+28 / -24 lines)

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -12,7 +12,7 @@ license = "Apache-2.0"
 requires-python = ">=3.11"
 dependencies = [
     "setuptools>=80.0",
-    "packaging>=23.2",
+    "packaging>=24.2",
     "wheel>=0.43",
     "instructlab-training>=0.12.1",
     "rhai-innovation-mini-trainer>=0.3.0",
@@ -63,21 +63,21 @@ cuda = [
 ]

 lora = [
-    "unsloth>=2025.10.12",
+    "unsloth>=2025.10.11",
     "trl>=0.18.0",
     # PyTorch's optimized xformers for CUDA 12.8 (compatible with CUDA 12.4+)
     "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.33%2B5d4b92a5.d20251029-cp39-abi3-linux_x86_64.whl",
 ]

 lora-cu129 = [
-    "unsloth>=2025.10.12",
+    "unsloth>=2025.10.11",
     "trl>=0.18.0",
     # PyTorch's optimized xformers for CUDA 12.9
     "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.33%2B5d4b92a5.d20251029-cp39-abi3-linux_x86_64.whl",
 ]

 lora-cu130 = [
-    "unsloth>=2025.10.12",
+    "unsloth>=2025.10.11",
     "trl>=0.18.0",
     # PyTorch's optimized xformers for CUDA 13.0+
     "xformers @ https://download.pytorch.org/whl/cu130/xformers-0.0.33%2B5d4b92a5.d20251029-cp39-abi3-linux_x86_64.whl",

src/training_hub/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,7 @@
 from .algorithms import Algorithm, Backend, AlgorithmRegistry, create_algorithm
 from .algorithms.sft import sft, SFTAlgorithm, InstructLabTrainingSFTBackend
 from .algorithms.osft import OSFTAlgorithm, MiniTrainerOSFTBackend, osft
-from .algorithms.lora import lora_sft, LoRASFTAlgorithm, UnslothLoRABackend, AxolotlLoRABackend
+from .algorithms.lora import lora_sft, LoRASFTAlgorithm, UnslothLoRABackend
 from .hub_core import welcome
 from .profiling.memory_estimator import BasicEstimator, OSFTEstimatorExperimental, estimate, OSFTEstimator

@@ -19,7 +19,6 @@
     'MiniTrainerOSFTBackend',
     'LoRASFTAlgorithm',
     'UnslothLoRABackend',
-    'AxolotlLoRABackend',
     'welcome',
     'BasicEstimator',
     'OSFTEstimatorExperimental',

src/training_hub/algorithms/lora.py

Lines changed: 23 additions & 18 deletions
@@ -36,14 +36,11 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
                 "Install LoRA dependencies with: pip install 'training-hub[lora]'"
             ) from e

-        # Separate torchrun parameters from training parameters
-        torchrun_keys = {'nproc_per_node', 'nnodes', 'node_rank', 'rdzv_id', 'rdzv_endpoint', 'master_addr', 'master_port'}
-
-        # Extract torchrun parameters
-        torchrun_params = {k: v for k, v in algorithm_params.items() if k in torchrun_keys}
-
-        # Extract training parameters (everything except torchrun params)
-        training_params = {k: v for k, v in algorithm_params.items() if k not in torchrun_keys}
+        # Use all parameters as training parameters
+        # Note: Torchrun parameters (nproc_per_node, etc.) are handled by the torchrun launcher,
+        # not by the Python training code. The training code auto-detects distributed environment
+        # via environment variables (WORLD_SIZE, LOCAL_RANK, etc.) set by torchrun.
+        training_params = algorithm_params

         # Unsloth multi-GPU setup: Let Accelerate/torchrun handle distributed training
         # No custom distributed initialization needed - Unsloth works with standard PyTorch DDP
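
The replacement comment relies on torchrun injecting the distributed context through environment variables rather than through explicit parameters. A minimal, illustrative way for a launched process to read that context (not code from this backend) looks like this:

import os

world_size = int(os.environ.get("WORLD_SIZE", "1"))  # total number of processes
global_rank = int(os.environ.get("RANK", "0"))       # this process's rank across all nodes
local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # GPU index on the local node

if world_size > 1:
    print(f"distributed run: rank {global_rank}/{world_size}, local GPU {local_rank}")
else:
    print("single-process run")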
@@ -60,9 +57,6 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
         # Configure training arguments
         training_args = self._build_training_args(training_params)

-        # Determine dataset format and configure trainer accordingly
-        dataset_type = training_params.get('dataset_type', 'chat_template')
-
         # Use the same trainer configuration for all dataset types since we pre-process in _prepare_dataset
         trainer = SFTTrainer(
             model=model,
@@ -97,7 +91,6 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:

         # Handle device placement for multi-GPU training
         device_map_config = {}
-        import os
         if params.get('enable_model_splitting', False):
             # Use balanced device mapping for large models
             device_map_config['device_map'] = "balanced"
@@ -108,11 +101,22 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:
             torch.cuda.set_device(local_rank)
             device_map_config['device_map'] = {"": local_rank}

+        # Configure quantization options with BitsAndBytes defaults
+        quantization_kwargs = {}
+        if load_in_4bit:
+            quantization_kwargs.update({
+                'bnb_4bit_quant_type': params.get('bnb_4bit_quant_type', 'nf4'),  # BitsAndBytes default
+                'bnb_4bit_compute_dtype': params.get('bnb_4bit_compute_dtype', 'bfloat16'),  # BitsAndBytes default
+                'bnb_4bit_use_double_quant': params.get('bnb_4bit_use_double_quant', True),  # BitsAndBytes default
+            })
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=params['model_path'],
             max_seq_length=params.get('max_seq_len', 2048),
             dtype=None,  # Auto-detect
             load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
+            **quantization_kwargs,
             **device_map_config,
             # Additional Unsloth optimizations
             # trust_remote_code=params.get('trust_remote_code', False),
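
For reference, the three bnb_4bit_* defaults introduced here correspond to a standard BitsAndBytes 4-bit setup. Expressed as a standalone transformers BitsAndBytesConfig (a sketch of the equivalent configuration, not how this backend wires it internally; note that plain transformers takes a torch dtype where the backend passes the string 'bfloat16' through to Unsloth):

import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 4-bit quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # matmuls computed in bfloat16
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
)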
@@ -136,7 +140,7 @@ def _apply_lora_config(self, model, params: Dict[str, Any]):
             r=params.get('lora_r', 16),
             target_modules=target_modules,
             lora_alpha=params.get('lora_alpha', 32),
-            lora_dropout=params.get('lora_dropout', 0.1),
+            lora_dropout=params.get('lora_dropout', 0.0),  # 0.0 is optimized for Unsloth
             bias="none",
             use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
             random_state=params.get('seed', 3407),
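
The new lora_dropout default follows Unsloth's guidance that a dropout of 0.0 keeps its fast LoRA path enabled. For comparison, the same adapter settings written as a plain PEFT LoraConfig (a sketch, not the backend's actual call; the target_modules list is an assumed example for Llama-style models, since the backend auto-detects it):

from peft import LoraConfig

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,  # 0.0 is the value the diff notes as optimized for Unsloth
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)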
@@ -205,7 +209,6 @@ def _build_training_args(self, params: Dict[str, Any]):

         # Determine actual number of GPUs being used
         import torch
-        import os

         # If we're in a distributed environment, use world size
         if 'WORLD_SIZE' in os.environ:
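
With the duplicate import gone, the module-level os import serves the world-size check that follows. A hypothetical helper with the same shape as that logic (an assumption about intent, not the project's code): prefer the torchrun-provided WORLD_SIZE, otherwise fall back to the locally visible device count.

import os
import torch

def detect_num_gpus() -> int:
    if "WORLD_SIZE" in os.environ:
        return int(os.environ["WORLD_SIZE"])
    return max(torch.cuda.device_count(), 1)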
@@ -380,7 +383,7 @@ def train(self,
         LoRA Parameters (from PEFT extender):
             lora_r: LoRA rank (default: 16)
             lora_alpha: LoRA alpha parameter (default: 32)
-            lora_dropout: LoRA dropout rate (default: 0.1)
+            lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
             target_modules: List of module names to apply LoRA to (default: auto-detect)
             use_rslora: Use Rank-Stabilized LoRA (default: False)
             use_dora: Use DoRA (Weight-Decomposed Low-Rank Adaptation) (default: False)
@@ -604,6 +607,8 @@ def get_optional_params(self) -> Dict[str, Type]:
             'field_output': str,
             # Multi-GPU model splitting
             'enable_model_splitting': bool,
+            # Model saving
+            'save_model': bool,
         }

         # Combine all parameter types
@@ -689,17 +694,17 @@ def lora_sft(model_path: str,
     LoRA Parameters:
         lora_r: LoRA rank (default: 16)
         lora_alpha: LoRA alpha parameter (default: 32)
-        lora_dropout: LoRA dropout rate (default: 0.1)
+        lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
         target_modules: List of module names to apply LoRA to (default: auto-detect)

     Training Parameters:
         num_epochs: Number of training epochs (default: 3)
         effective_batch_size: Effective batch size across all GPUs
-        micro_batch_size: Batch size per GPU (default: 1)
+        micro_batch_size: Batch size per GPU (default: 2)
         gradient_accumulation_steps: Steps to accumulate gradients (default: 1)
         learning_rate: Learning rate (default: 2e-4)
         max_seq_len: Maximum sequence length (default: 2048)
-        lr_scheduler: Learning rate scheduler (default: 'cosine')
+        lr_scheduler: Learning rate scheduler (default: 'linear')
         warmup_steps: Number of warmup steps (default: 10)

     Quantization Parameters (QLoRA):
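
Putting the corrected defaults together, a call to lora_sft might look like the sketch below. Only parameters documented in this hunk are used; the dataset and output-path arguments are omitted because their names do not appear here, and the model id is just an example, so treat this as illustrative rather than copy-paste ready.

from training_hub import lora_sft

result = lora_sft(
    "meta-llama/Llama-3.1-8B-Instruct",  # model_path: HF model id or local path (example)
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.0,              # new default, optimized for Unsloth
    num_epochs=3,
    micro_batch_size=2,            # documented default, corrected from 1
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    max_seq_len=2048,
    lr_scheduler="linear",         # documented default, corrected from 'cosine'
    warmup_steps=10,
)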
