@@ -36,14 +36,11 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
3636 "Install LoRA dependencies with: pip install 'training-hub[lora]'"
3737 ) from e
3838
39- # Separate torchrun parameters from training parameters
40- torchrun_keys = {'nproc_per_node' , 'nnodes' , 'node_rank' , 'rdzv_id' , 'rdzv_endpoint' , 'master_addr' , 'master_port' }
41-
42- # Extract torchrun parameters
43- torchrun_params = {k : v for k , v in algorithm_params .items () if k in torchrun_keys }
44-
45- # Extract training parameters (everything except torchrun params)
46- training_params = {k : v for k , v in algorithm_params .items () if k not in torchrun_keys }
39+ # Use all parameters as training parameters
40+ # Note: Torchrun parameters (nproc_per_node, etc.) are handled by the torchrun launcher,
41+ # not by the Python training code. The training code auto-detects distributed environment
42+ # via environment variables (WORLD_SIZE, LOCAL_RANK, etc.) set by torchrun.
43+ training_params = algorithm_params
4744
4845 # Unsloth multi-GPU setup: Let Accelerate/torchrun handle distributed training
4946 # No custom distributed initialization needed - Unsloth works with standard PyTorch DDP
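The replacement block above leans on torchrun's contract of exporting rank information to every worker process. As a minimal illustrative sketch (not part of this module), the distributed context can be read back from those variables like so:

import os

def detect_torchrun_env():
    # Illustrative helper: torchrun exports these variables to each worker.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    return {"distributed": world_size > 1,
            "world_size": world_size, "rank": rank, "local_rank": local_rank}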
@@ -60,9 +57,6 @@ def execute_training(self, algorithm_params: Dict[str, Any]) -> Any:
         # Configure training arguments
         training_args = self._build_training_args(training_params)

-        # Determine dataset format and configure trainer accordingly
-        dataset_type = training_params.get('dataset_type', 'chat_template')
-
         # Use the same trainer configuration for all dataset types since we pre-process in _prepare_dataset
         trainer = SFTTrainer(
             model=model,
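Removing the dataset_type branch works because, as the surviving comment says, every dataset is normalized before it reaches the trainer. A rough sketch of that idea, assuming samples are flattened into a single "text" field (the field names and the helper are illustrative, not the module's actual _prepare_dataset):

def to_text_record(example, tokenizer):
    # Illustrative: collapse either chat-style or prompt/response samples
    # into one "text" field so a single SFTTrainer config covers both.
    if "messages" in example:
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    else:
        text = f"{example.get('input', '')}\n{example.get('output', '')}"
    return {"text": text}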
@@ -97,7 +91,6 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:

         # Handle device placement for multi-GPU training
         device_map_config = {}
-        import os
         if params.get('enable_model_splitting', False):
             # Use balanced device mapping for large models
             device_map_config['device_map'] = "balanced"
@@ -108,11 +101,22 @@ def _load_unsloth_model(self, params: Dict[str, Any]) -> tuple:
             torch.cuda.set_device(local_rank)
             device_map_config['device_map'] = {"": local_rank}

+        # Configure 4-bit quantization options (QLoRA-recommended settings)
+        quantization_kwargs = {}
+        if load_in_4bit:
+            quantization_kwargs.update({
+                'bnb_4bit_quant_type': params.get('bnb_4bit_quant_type', 'nf4'),  # QLoRA-recommended quant type
+                'bnb_4bit_compute_dtype': params.get('bnb_4bit_compute_dtype', 'bfloat16'),  # QLoRA-recommended compute dtype
+                'bnb_4bit_use_double_quant': params.get('bnb_4bit_use_double_quant', True),  # nested quantization, as in QLoRA
+            })
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=params['model_path'],
             max_seq_length=params.get('max_seq_len', 2048),
             dtype=None,  # Auto-detect
             load_in_4bit=load_in_4bit,
+            load_in_8bit=load_in_8bit,
+            **quantization_kwargs,
             **device_map_config,
             # Additional Unsloth optimizations
             # trust_remote_code=params.get('trust_remote_code', False),
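The three bnb_4bit_* keys mirror the knobs exposed by transformers' BitsAndBytesConfig. A standalone sketch of the same QLoRA-style settings, assuming the kwargs built above are ultimately forwarded to bitsandbytes:

import torch
from transformers import BitsAndBytesConfig

# QLoRA-style 4-bit settings equivalent to the kwargs built above
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)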
@@ -136,7 +140,7 @@ def _apply_lora_config(self, model, params: Dict[str, Any]):
             r=params.get('lora_r', 16),
             target_modules=target_modules,
             lora_alpha=params.get('lora_alpha', 32),
-            lora_dropout=params.get('lora_dropout', 0.1),
+            lora_dropout=params.get('lora_dropout', 0.0),  # 0.0 is optimized for Unsloth
             bias="none",
             use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
             random_state=params.get('seed', 3407),
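For readers more used to plain PEFT than Unsloth's get_peft_model wrapper, roughly the same adapter would be described as below; the target-module list is the common choice for Llama-style models and is an assumption here, not taken from this file:

from peft import LoraConfig

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,  # matches the new Unsloth-friendly default above
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)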
@@ -205,7 +209,6 @@ def _build_training_args(self, params: Dict[str, Any]):

         # Determine actual number of GPUs being used
         import torch
-        import os

         # If we're in a distributed environment, use world size
         if 'WORLD_SIZE' in os.environ:
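With the duplicate import gone, the module-level os import serves the WORLD_SIZE check. That check typically feeds a batch-size calculation along these lines; the helper and its signature are illustrative, not this module's actual _build_training_args logic:

import os
import torch

def resolve_grad_accumulation(effective_batch_size: int, micro_batch_size: int) -> int:
    # Prefer the torchrun-provided world size; otherwise count visible GPUs.
    if "WORLD_SIZE" in os.environ:
        num_gpus = int(os.environ["WORLD_SIZE"])
    else:
        num_gpus = max(torch.cuda.device_count(), 1)
    return max(effective_batch_size // (micro_batch_size * num_gpus), 1)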
@@ -380,7 +383,7 @@ def train(self,
         LoRA Parameters (from PEFT extender):
             lora_r: LoRA rank (default: 16)
             lora_alpha: LoRA alpha parameter (default: 32)
-            lora_dropout: LoRA dropout rate (default: 0.1)
+            lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
             target_modules: List of module names to apply LoRA to (default: auto-detect)
             use_rslora: Use Rank-Stabilized LoRA (default: False)
             use_dora: Use DoRA (Weight-Decomposed Low-Rank Adaptation) (default: False)
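On the use_rslora flag documented above: rank-stabilized LoRA swaps the usual alpha/r adapter scaling for alpha/sqrt(r), which is what lets higher ranks keep a stable effective scale. A toy comparison with illustrative numbers:

import math

lora_alpha, lora_r = 32, 64
standard_scale = lora_alpha / lora_r            # 0.5, classic LoRA scaling
rslora_scale = lora_alpha / math.sqrt(lora_r)   # 4.0, rank-stabilized scaling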
@@ -604,6 +607,8 @@ def get_optional_params(self) -> Dict[str, Type]:
             'field_output': str,
             # Multi-GPU model splitting
             'enable_model_splitting': bool,
+            # Model saving
+            'save_model': bool,
         }

         # Combine all parameter types
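The new save_model entry simply extends the optional-parameter type map. One way such a map can be used, sketched with a hypothetical validator that is not part of this module:

from typing import Any, Dict, Type

def validate_optional_params(params: Dict[str, Any], schema: Dict[str, Type]) -> None:
    # Illustrative: reject recognized keys whose values have the wrong type.
    for key, expected in schema.items():
        if key in params and not isinstance(params[key], expected):
            raise TypeError(
                f"{key} expects {expected.__name__}, got {type(params[key]).__name__}"
            )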
@@ -689,17 +694,17 @@ def lora_sft(model_path: str,
     LoRA Parameters:
         lora_r: LoRA rank (default: 16)
         lora_alpha: LoRA alpha parameter (default: 32)
-        lora_dropout: LoRA dropout rate (default: 0.1)
+        lora_dropout: LoRA dropout rate (default: 0.0, optimized for Unsloth)
         target_modules: List of module names to apply LoRA to (default: auto-detect)

     Training Parameters:
         num_epochs: Number of training epochs (default: 3)
         effective_batch_size: Effective batch size across all GPUs
-        micro_batch_size: Batch size per GPU (default: 1)
+        micro_batch_size: Batch size per GPU (default: 2)
         gradient_accumulation_steps: Steps to accumulate gradients (default: 1)
         learning_rate: Learning rate (default: 2e-4)
         max_seq_len: Maximum sequence length (default: 2048)
-        lr_scheduler: Learning rate scheduler (default: 'cosine')
+        lr_scheduler: Learning rate scheduler (default: 'linear')
         warmup_steps: Number of warmup steps (default: 10)

     Quantization Parameters (QLoRA):
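Pulling the documented defaults together, a call to the convenience function could look like the sketch below. Only model_path and the tuning knobs come from the docstring above; the data and output arguments (and their names) are assumptions about the full signature:

# Hypothetical invocation; argument names other than model_path and the
# documented tuning parameters are assumed, not confirmed by this diff.
result = lora_sft(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    data_path="./data/train.jsonl",                 # assumed data argument
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    num_epochs=3,
    micro_batch_size=2,
    learning_rate=2e-4,
    max_seq_len=2048,
    lr_scheduler="linear",
    warmup_steps=10,
)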